{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Préambule : nos biais inconscients"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nous vous proposons, si vous le souhaitez, de prendre une dizaine de minutes pour tester vos biais inconscients:\n",
"\n",
"https://implicit.harvard.edu/implicit/canadafr/takeatest.html\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TD 2: Manipulation des données"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use same python env as in TD1\n",
"python 3.10 with the following packages\n",
"```\n",
"numpy==1.25\n",
"fairlearn==0.9.0\n",
"plotly==5.24.1\n",
"nbformat==5.10.4\n",
"ipykernel==6.29.5\n",
"```\n",
"\n",
"plus a new one\n",
"```\n",
"aif360[\"inFairness\"]==0.6.1\n",
"causal-learn==0.1.4.0\n",
"```\n",
"\n",
"\n",
"You with also need to have R installed \n",
"`sudo apt install r-base-core`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting aif360[inFairness]==0.6.1\n",
" Using cached aif360-0.6.1-py3-none-any.whl (259 kB)\n",
"Requirement already satisfied: pandas>=0.24.0 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from aif360[inFairness]==0.6.1) (2.2.3)\n",
"Collecting matplotlib\n",
" Downloading matplotlib-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.6/8.6 MB\u001b[0m \u001b[31m28.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting scikit-learn>=1.0\n",
" Downloading scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.5/13.5 MB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting scipy>=1.2.0\n",
" Downloading scipy-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (40.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.6/40.6 MB\u001b[0m \u001b[31m35.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.16 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from aif360[inFairness]==0.6.1) (2.1.3)\n",
"Collecting skorch\n",
" Downloading skorch-1.1.0-py3-none-any.whl (228 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m228.9/228.9 kB\u001b[0m \u001b[31m55.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting inFairness>=0.2.2\n",
" Using cached inFairness-0.2.3-py3-none-any.whl (45 kB)\n",
"Collecting torch>=1.13.0\n",
" Downloading torch-2.5.1-cp310-cp310-manylinux1_x86_64.whl (906.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m906.4/906.4 MB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting POT>=0.8.0\n",
" Downloading POT-0.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (865 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m865.6/865.6 kB\u001b[0m \u001b[31m23.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pytz>=2020.1 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from pandas>=0.24.0->aif360[inFairness]==0.6.1) (2024.2)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from pandas>=0.24.0->aif360[inFairness]==0.6.1) (2.9.0.post0)\n",
"Requirement already satisfied: tzdata>=2022.7 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from pandas>=0.24.0->aif360[inFairness]==0.6.1) (2024.2)\n",
"Collecting threadpoolctl>=3.1.0\n",
" Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)\n",
"Collecting joblib>=1.2.0\n",
" Using cached joblib-1.4.2-py3-none-any.whl (301 kB)\n",
"Collecting cycler>=0.10\n",
" Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
"Requirement already satisfied: packaging>=20.0 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from matplotlib->aif360[inFairness]==0.6.1) (24.1)\n",
"Collecting pyparsing>=2.3.1\n",
" Downloading pyparsing-3.2.1-py3-none-any.whl (107 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m107.7/107.7 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting contourpy>=1.0.1\n",
" Downloading contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m325.0/325.0 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pillow>=8\n",
" Downloading pillow-11.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.5/4.5 MB\u001b[0m \u001b[31m51.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting kiwisolver>=1.3.1\n",
" Downloading kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m86.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting fonttools>=4.22.0\n",
" Downloading fonttools-4.55.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting tqdm>=4.14.0\n",
" Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.5/78.5 kB\u001b[0m \u001b[31m26.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting tabulate>=0.7.7\n",
" Using cached tabulate-0.9.0-py3-none-any.whl (35 kB)\n",
"Requirement already satisfied: six>=1.5 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas>=0.24.0->aif360[inFairness]==0.6.1) (1.16.0)\n",
"Collecting nvidia-curand-cu12==10.3.5.147\n",
" Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl (56.3 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.3/56.3 MB\u001b[0m \u001b[31m32.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cufft-cu12==11.2.1.3\n",
" Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl (211.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.5/211.5 MB\u001b[0m \u001b[31m5.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cuda-nvrtc-cu12==12.4.127\n",
" Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m24.6/24.6 MB\u001b[0m \u001b[31m41.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting networkx\n",
" Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting nvidia-nccl-cu12==2.21.5\n",
" Downloading nvidia_nccl_cu12-2.21.5-py3-none-manylinux2014_x86_64.whl (188.7 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m188.7/188.7 MB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting fsspec\n",
" Downloading fsspec-2024.12.0-py3-none-any.whl (183 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m183.9/183.9 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting sympy==1.13.1\n",
" Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.2/6.2 MB\u001b[0m \u001b[31m75.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cudnn-cu12==9.1.0.70\n",
" Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m664.8/664.8 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cusolver-cu12==11.6.1.9\n",
" Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl (127.9 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m127.9/127.9 MB\u001b[0m \u001b[31m27.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-nvjitlink-cu12==12.4.127\n",
" Downloading nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m76.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-nvtx-cu12==12.4.127\n",
" Downloading nvidia_nvtx_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (99 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m42.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting filelock\n",
" Downloading filelock-3.16.1-py3-none-any.whl (16 kB)\n",
"Collecting nvidia-cuda-runtime-cu12==12.4.127\n",
" Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m883.7/883.7 kB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127\n",
" Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.8/13.8 MB\u001b[0m \u001b[31m91.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: typing-extensions>=4.8.0 in /home/vincent/.cache/pypoetry/virtualenvs/test-sm-QFV3hVXP-py3.10/lib/python3.10/site-packages (from torch>=1.13.0->inFairness>=0.2.2->aif360[inFairness]==0.6.1) (4.12.2)\n",
"Collecting jinja2\n",
" Downloading jinja2-3.1.5-py3-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.6/134.6 kB\u001b[0m \u001b[31m45.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cublas-cu12==12.4.5.8\n",
" Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m363.4/363.4 MB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting nvidia-cusparse-cu12==12.3.1.170\n",
" Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl (207.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m207.5/207.5 MB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting triton==3.1.0\n",
" Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (209.5 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.5/209.5 MB\u001b[0m \u001b[31m8.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting mpmath<1.4,>=1.1.0\n",
" Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)\n",
"Collecting MarkupSafe>=2.0\n",
" Downloading MarkupSafe-3.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20 kB)\n",
"Installing collected packages: mpmath, tqdm, threadpoolctl, tabulate, sympy, scipy, pyparsing, pillow, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, networkx, MarkupSafe, kiwisolver, joblib, fsspec, fonttools, filelock, cycler, contourpy, triton, scikit-learn, POT, nvidia-cusparse-cu12, nvidia-cudnn-cu12, matplotlib, jinja2, skorch, nvidia-cusolver-cu12, aif360, torch, inFairness\n",
"Successfully installed MarkupSafe-3.0.2 POT-0.9.5 aif360-0.6.1 contourpy-1.3.1 cycler-0.12.1 filelock-3.16.1 fonttools-4.55.3 fsspec-2024.12.0 inFairness-0.2.3 jinja2-3.1.5 joblib-1.4.2 kiwisolver-1.4.8 matplotlib-3.10.0 mpmath-1.3.0 networkx-3.4.2 nvidia-cublas-cu12-12.4.5.8 nvidia-cuda-cupti-cu12-12.4.127 nvidia-cuda-nvrtc-cu12-12.4.127 nvidia-cuda-runtime-cu12-12.4.127 nvidia-cudnn-cu12-9.1.0.70 nvidia-cufft-cu12-11.2.1.3 nvidia-curand-cu12-10.3.5.147 nvidia-cusolver-cu12-11.6.1.9 nvidia-cusparse-cu12-12.3.1.170 nvidia-nccl-cu12-2.21.5 nvidia-nvjitlink-cu12-12.4.127 nvidia-nvtx-cu12-12.4.127 pillow-11.1.0 pyparsing-3.2.1 scikit-learn-1.6.1 scipy-1.15.1 skorch-1.1.0 sympy-1.13.1 tabulate-0.9.0 threadpoolctl-3.5.0 torch-2.5.1 tqdm-4.67.1 triton-3.1.0\n",
"\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n"
]
}
],
"source": [
"!pip install aif360[\"inFairness\"]==0.6.1\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In this TD we will use data from the [Medical Expenditure Panel Survey](https://meps.ahrq.gov/mepsweb/). The TD is inspired from [AIF360 tutorial](https://github.com/Trusted-AI/AIF360/blob/main/examples/tutorial_medical_expenditure.ipynb)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download the dataset\n",
"\n",
"Use the command lines below, if you encounter a problem do not hesitate to call for help\n",
"\n",
"``` \n",
"python_bin_path=\"$(which python)\"; \\\n",
"meps_path=\"$(dirname $python_bin_path})\"; \\\n",
"cd $meps_path; \\\n",
"cd ../lib/python3.10/site-packages/aif360/data/raw/meps; \\\n",
"Rscript generate_data.R\n",
"```\n",
"\n",
"It will ask to read the rules and restrictions to download and use this dataset.\n",
"This is because the dataset is a medical dataset witl real person information.\n",
"\n",
"The download can take a bit of time"
]
},
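{
"cell_type": "markdown",
"metadata": {},
"source": [
"If the shell one-liner gives you trouble, here is a small Python alternative for locating the same folder. It is only a sketch that assumes `aif360` is importable; run `Rscript generate_data.R` inside the printed directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: locate aif360's raw MEPS data folder from Python instead of the shell\n",
"import os\n",
"import aif360\n",
"\n",
"meps_dir = os.path.join(os.path.dirname(aif360.__file__), 'data', 'raw', 'meps')\n",
"print(meps_dir)  # cd into this folder, then run `Rscript generate_data.R`"
]
},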
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import numpy as np\n",
"import pandas as pd\n",
"import plotly.express as px\n",
"import warnings\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"warnings.simplefilter(action='ignore', append=True, category=UserWarning)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Datasets\n",
"from aif360.datasets import MEPSDataset19\n",
"from aif360.datasets import MEPSDataset20\n",
"from aif360.datasets import MEPSDataset21\n",
"\n",
"MEPSDataset19_data = MEPSDataset19()\n",
"# (dataset_orig_panel19_train,\n",
"# dataset_orig_panel19_val,\n",
"# dataset_orig_panel19_test) = MEPSDataset19_data.split([0.5, 0.8], shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"(dataset_orig_panel19_train,\n",
" dataset_orig_panel19_val,\n",
" dataset_orig_panel19_test) = MEPSDataset19().split([0.5, 0.8], shuffle=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7915, 4749, 3166)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(dataset_orig_panel19_train.instance_weights), len(dataset_orig_panel19_val.instance_weights), len(dataset_orig_panel19_test.instance_weights)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Nous vous conseillons d'aller voir les pages [MEPSDataset19](https://aif360.readthedocs.io/en/latest/modules/generated/aif360.datasets.MEPSDataset19.html) et [AIF360 tutorial](https://github.com/Trusted-AI/AIF360/blob/main/examples/tutorial_medical_expenditure.ipynb) pour mieux comprendre le dataset.\n",
"\n",
"Ce qu'il faut avoir lu:\n",
"- **The sensitive attribute is 'RACE' :1 is privileged, 0 is unprivileged** ; It is constructed as follows: 'Whites' (privileged class) defined by the features RACEV2X = 1 (White) and HISPANX = 2 (non Hispanic); 'Non-Whites' that included everyone else.\n",
"(The features 'RACEV2X', 'HISPANX' etc are removed, and replaced by the 'RACE')\n",
"- **'UTILIZATION' is the outcome (the label to predict for a ML model) 0 is positive 1 is negative**. It is a binary composite feature, created to measure the total number of trips requiring some sort of medical care, it sum up the following features (that are removed from the data):\n",
" * OBTOTV15(16), the number of office based visits\n",
" * OPTOTV15(16), the number of outpatient visits\n",
" * ERTOT15(16), the number of ER visits\n",
" * IPNGTD15(16), the number of inpatient nights\n",
" * HHTOTD16, the number of home health visits\n",
"UTILISATION is set to 1 when te sum is above or equal to 10, else it is set to 0\n",
"- **The dataset is weighted** The dataset come with an 'instance_weights' attribute that corresponds to the feature perwt15f these weights are supposed to generate estimates that are representative of the United State (US) population in 2015.\n",
"\n",
"\n",
"Ce qu'il faut avoir retenu:\n",
"- **The sensitive attribute is 'RACE' :1 is privileged, 0 is unprivileged**\n",
"- **'UTILIZATION' is the outcome (the label to predict for a ML model) 0 is positive 1 is negative**\n",
"- **The dataset is weighted**\n",
"\n"
]
},
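{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the composite label concrete, here is a minimal sketch of how 'UTILIZATION' could be rebuilt from the raw panel-19 visit counts. It is for illustration only (the real construction happens inside AIF360's preprocessing), and the raw column names are assumed from the list above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustration only: rebuild the composite UTILIZATION label from the raw\n",
"# panel-19 visit counts (column names assumed from the list above; 'HHTOTD15'\n",
"# is assumed by analogy with the other '15'-suffixed panel-19 columns).\n",
"import pandas as pd\n",
"\n",
"def make_utilization(raw: pd.DataFrame) -> pd.Series:\n",
"    # Total number of trips requiring some sort of medical care\n",
"    total = (raw['OBTOTV15'] + raw['OPTOTV15'] + raw['ERTOT15']\n",
"             + raw['IPNGTD15'] + raw['HHTOTD15'])\n",
"    # 1 when the sum is >= 10, else 0\n",
"    return (total >= 10).astype(int)"
]
},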
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([21854.981705, 18169.604822, 17191.832515, ..., 3896.116219,\n",
" 4883.851005, 6630.588948])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"instance_weights = MEPSDataset19_data.instance_weights\n",
"instance_weights\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Taille du dataset 15830, poids total du dataset 141367240.546316.'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"f\"Taille du dataset {len(instance_weights)}, poids total du dataset {instance_weights.sum()}.\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Premier appercu du dataset\n",
"\n",
"La librairie AIF360 fournie une surcouche au dataset, cela le rend un peu moins intuitif d'utilisation (par exemple pour étudier/visualiser les attributs un à un), mais elle permet de calculer les métrique des fairness en une ligne de commande."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:No module named 'tensorflow': AdversarialDebiasing will be unavailable. To install, run:\n",
"pip install 'aif360[AdversarialDebiasing]'\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:No module named 'tensorflow': AdversarialDebiasing will be unavailable. To install, run:\n",
"pip install 'aif360[AdversarialDebiasing]'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.49826823461176517\n"
]
}
],
"source": [
"from aif360.metrics import BinaryLabelDatasetMetric\n",
"from aif360.metrics import ClassificationMetric\n",
"\n",
"metric_orig_panel19_train = BinaryLabelDatasetMetric(\n",
" MEPSDataset19_data,\n",
" unprivileged_groups=[{'RACE': 0}],\n",
" privileged_groups=[{'RACE': 1}])\n",
"\n",
"print(metric_orig_panel19_train.disparate_impact())"
]
},
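{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same wrapper object exposes several other one-line group metrics. A quick sketch with the same privileged/unprivileged groups as above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# A few more one-liners from the same BinaryLabelDatasetMetric object:\n",
"# difference in favorable-outcome rates, and the per-group base rates\n",
"print(metric_orig_panel19_train.statistical_parity_difference())\n",
"print(metric_orig_panel19_train.base_rate(privileged=True))\n",
"print(metric_orig_panel19_train.base_rate(privileged=False))"
]
},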
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cependant le but de ce TD étant encore de manipuler les données et de les analyser nous allons revenir aux données sous forme d'un dataframe.\n",
"\n",
"Note pour calculer les métriques de fairness sans avoir à les réimplémenter dans le cas pondéré (instances weights) vous pouvez utiliser les méthodes implémenter dans AIF360 là [Implémentation Métriques de Fairness](https://aif360.readthedocs.io/en/latest/modules/sklearn.html#module-aif360.sklearn.metrics)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Conversion en un dataframe\n",
"\n",
"Nous avons vu que la somme des poids est conséquente, pres de 115millions nous ne pouvons donc pas raisonneblement dupliqué chaque ligne autant de fois que son poids.\n",
"\n",
"Nous allons stocker la pondération et la prendre en compte ensuite dans notre analyse"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"def get_df(MepsDataset):\n",
" data = MepsDataset.convert_to_dataframe()\n",
" # data_train est un tuple, avec le data_frame et un dictionnaire avec toutes les infos (poids, attributs sensibles etc)\n",
" df = data[0]\n",
" df['WEIGHT'] = data[1]['instance_weights']\n",
" return df\n",
"\n",
"df = get_df(MEPSDataset19_data)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1.1848351529675123, 0.7849286063696154)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from aif360.sklearn.metrics import disparate_impact_ratio, base_rate\n",
"dir = disparate_impact_ratio(\n",
" y_true=df.UTILIZATION, \n",
" prot_attr=df.RACE, \n",
" pos_label=0,\n",
" sample_weight=df.WEIGHT)\n",
"br =base_rate(\n",
" y_true=df.UTILIZATION, \n",
" pos_label=0,\n",
" sample_weight=df.WEIGHT)\n",
"dir,br"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1.1746792888264614, 0.8283006948831333)"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dir = disparate_impact_ratio(\n",
" y_true=df.UTILIZATION, \n",
" prot_attr=df.RACE, \n",
" pos_label=0)\n",
"br =base_rate(\n",
" y_true=df.UTILIZATION, \n",
" pos_label=0)\n",
"dir,br"
]
},
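{
"cell_type": "markdown",
"metadata": {},
"source": [
"To demystify what `disparate_impact_ratio` computes, here is a minimal hand-rolled version under the same conventions (favorable label 0, privileged group RACE = 1). It should reproduce the weighted value above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hand-rolled weighted disparate impact: ratio of weighted favorable base\n",
"# rates, unprivileged over privileged (the favorable label is 0 here)\n",
"import numpy as np\n",
"\n",
"def weighted_base_rate(y, w, pos_label=0):\n",
"    return np.sum(w[y == pos_label]) / np.sum(w)\n",
"\n",
"priv = df.RACE == 1\n",
"di_manual = (weighted_base_rate(df.UTILIZATION[~priv], df.WEIGHT[~priv])\n",
"             / weighted_base_rate(df.UTILIZATION[priv], df.WEIGHT[priv]))\n",
"di_manual  # expected to match the weighted disparate impact above"
]
},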
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Question 1 - Apprendre un modèle pour prédire le fait d'être réadmis\n",
"### 1.1 - Faire le pre-processing des données\n",
"\n",
"Ici ce pre-processing a déjà été fait par AIF, nous avons simplement converti le dataset en dataframe pour pouvoir le manipuler librement"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Question 1.2 - Creer les échantillons d'apprentissage, de validation et de test\n",
"\n",
"Pour créer le df_X il faut enlever l'outcome (\"UTILIZATION\") et la pondération (\"WEIGHT\")\n",
"\n",
"La colonne \"UTILIZATION\" sera le label (noté y)\n",
"\n",
"La colonne \"WEIGHT\" sera la pondération (notée w)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"df_X = df.drop(columns=[\"UTILIZATION\", \"WEIGHT\"])\n",
"splits_trainval_test = train_test_split(\n",
" df_X, df[\"UTILIZATION\"], df[\"WEIGHT\"],\n",
" train_size=0.8, \n",
" random_state=42)\n",
"\n",
"X_trainval, X_test, y_trainval, y_test, w_trainval, w_test = splits_trainval_test\n",
"splits_train_val = train_test_split(\n",
" X_trainval, y_trainval, w_trainval,\n",
" train_size=0.625, \n",
" random_state=42)\n",
"X_train, X_val, y_train, y_val, w_train, w_val = splits_train_val"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((7915, 138),\n",
" (7915,),\n",
" (7915,),\n",
" (4749, 138),\n",
" (4749,),\n",
" (4749,),\n",
" (3166, 138),\n",
" (3166,),\n",
" (3166,))"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train.shape, y_train.shape, w_train.shape, X_val.shape, y_val.shape, w_val.shape, X_test.shape, y_test.shape, w_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Question 1.3 - Apprendre une regression logistique dont le but est de prédire UTILIZATION"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8429447781148914"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import make_pipeline\n",
"\n",
"\n",
"model = make_pipeline(StandardScaler(),LogisticRegression(random_state=42))\n",
"\n",
"model = model.fit(X_train, y_train, **{'logisticregression__sample_weight':w_train})\n",
"\n",
"preds = model.predict(X_val)\n",
"\n",
"model.score(X_val, y_val, sample_weight=w_val)"
]
},
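{
"cell_type": "markdown",
"metadata": {},
"source": [
"Accuracy alone says little about fairness. As a quick sketch, the weighted metric used earlier also applies to the model's validation predictions (`preds`, `X_val`, `y_val` and `w_val` come from the cell above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Disparate impact of the validation predictions, with the same conventions\n",
"# as before (favorable label 0, privileged group RACE = 1)\n",
"di_val = disparate_impact_ratio(\n",
"    y_true=y_val,\n",
"    y_pred=preds,\n",
"    prot_attr=X_val.RACE,\n",
"    pos_label=0,\n",
"    sample_weight=w_val)\n",
"di_val"
]
},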
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Quesiton 1.4 Performance du modèle (afficher la matrice de confusion)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"3773 176 456 344\n"
]
},
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"coloraxis": "coloraxis",
"hovertemplate": "Pred: %{x}
Truth: %{y}
color: %{z}
\n", " | AGE | \n", "PCS42 | \n", "MCS42 | \n", "K6SUM42 | \n", "REGION=1 | \n", "REGION=2 | \n", "REGION=3 | \n", "REGION=4 | \n", "SEX=1 | \n", "SEX=2 | \n", "... | \n", "EMPST=4 | \n", "POVCAT=1 | \n", "POVCAT=2 | \n", "POVCAT=3 | \n", "POVCAT=4 | \n", "POVCAT=5 | \n", "INSCOV=1 | \n", "INSCOV=2 | \n", "INSCOV=3 | \n", "interest | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10862 | \n", "45.0 | \n", "48.45 | \n", "51.25 | \n", "2.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
11755 | \n", "79.0 | \n", "25.38 | \n", "56.48 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
11708 | \n", "6.0 | \n", "-1.00 | \n", "-1.00 | \n", "-1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "
12782 | \n", "3.0 | \n", "-1.00 | \n", "-1.00 | \n", "-1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "
1544 | \n", "9.0 | \n", "-1.00 | \n", "-1.00 | \n", "-1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
4585 | \n", "2.0 | \n", "-1.00 | \n", "-1.00 | \n", "-1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "
11815 | \n", "60.0 | \n", "43.99 | \n", "57.09 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
496 | \n", "62.0 | \n", "27.16 | \n", "29.66 | \n", "13.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
7753 | \n", "65.0 | \n", "36.10 | \n", "53.48 | \n", "5.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
10930 | \n", "70.0 | \n", "45.19 | \n", "36.55 | \n", "11.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "1.0 | \n", "0.0 | \n", "0.0 | \n", "1.0 | \n", "
7915 rows × 138 columns
\n", "