import os
import sys

sys.path.append(os.path.join(os.path.abspath(".."), "code"))

import IPython
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
from IPython.display import HTML, display
from plotting_functions import *
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    average_precision_score,
    classification_report,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC  # needed for pipe_svc below
from utils import *

%matplotlib inline
pd.set_option("display.max_colwidth", 200)

from IPython.display import Image

Exploring classification metrics#

Dataset for demonstration#

Let’s classify fraudulent and non-fraudulent transactions using Kaggle’s Credit Card Fraud Detection data set.

Loading the data#

cc_df = pd.read_csv("../data/creditcard.csv", encoding="latin-1")
train_df, test_df = train_test_split(cc_df, test_size=0.3, random_state=111)
train_df.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
64454 51150.0 -3.538816 3.481893 -1.827130 -0.573050 2.644106 -0.340988 2.102135 -2.939006 2.578654 ... 0.530978 -0.860677 -0.201810 -1.719747 0.729143 -0.547993 -0.023636 -0.454966 1.00 0
37906 39163.0 -0.363913 0.853399 1.648195 1.118934 0.100882 0.423852 0.472790 -0.972440 0.033833 ... 0.687055 -0.094586 0.121531 0.146830 -0.944092 -0.558564 -0.186814 -0.257103 18.49 0
79378 57994.0 1.193021 -0.136714 0.622612 0.780864 -0.823511 -0.706444 -0.206073 -0.016918 0.781531 ... -0.310405 -0.842028 0.085477 0.366005 0.254443 0.290002 -0.036764 0.015039 23.74 0
245686 152859.0 1.604032 -0.808208 -1.594982 0.200475 0.502985 0.832370 -0.034071 0.234040 0.550616 ... 0.519029 1.429217 -0.139322 -1.293663 0.037785 0.061206 0.005387 -0.057296 156.52 0
60943 49575.0 -2.669614 -2.734385 0.662450 -0.059077 3.346850 -2.549682 -1.430571 -0.118450 0.469383 ... -0.228329 -0.370643 -0.211544 -0.300837 -1.174590 0.573818 0.388023 0.161782 57.50 0

5 rows × 31 columns

X_train_big, y_train_big = train_df.drop(columns=["Class", "Time"]), train_df["Class"]
X_test, y_test = test_df.drop(columns=["Class", "Time"]), test_df["Class"]
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_big, y_train_big, test_size=0.7, random_state=123
)
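
The credit card fraud data is extremely imbalanced, which is exactly why plain accuracy is not very informative here. A quick check of the class balance on our training split (a minimal sketch using the variables defined above):

# Proportion of non-fraud (Class = 0) vs. fraud (Class = 1) transactions in the training split
y_train.value_counts(normalize=True)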

Comparing PR curves#

Let’s create PR curves for SVC and Logistic Regression.

pipe_lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
pipe_lr.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=500))])
pipe_svc = make_pipeline(StandardScaler(), SVC())
pipe_svc.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])
pipe_lr.predict_proba(X_valid)
array([[9.99807444e-01, 1.92556242e-04],
       [9.99537296e-01, 4.62703836e-04],
       [9.99678149e-01, 3.21851032e-04],
       ...,
       [9.99907898e-01, 9.21018764e-05],
       [9.99882185e-01, 1.17814723e-04],
       [9.99845434e-01, 1.54565766e-04]])
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(
    y_valid, pipe_lr.predict_proba(X_valid)[:, 1]
)
thresholds_lr
array([5.43344135e-10, 8.87215534e-10, 1.13373820e-09, ...,
       9.99999990e-01, 9.99999996e-01, 1.00000000e+00])
pipe_svc.decision_function(X_valid)
array([-1.12247976, -1.15220481, -1.05407669, ..., -1.18444729,
       -1.06337006, -1.05482241])
precision_svc, recall_svc, thresholds_svc = precision_recall_curve(
    y_valid, pipe_svc.decision_function(X_valid)
)
plt.plot(precision_svc, recall_svc, label="SVC")
plt.plot(precision_lr, recall_lr, label="Logistic regression")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best");
[Figure: precision-recall curves for SVC and logistic regression]
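
Note that the plot above puts precision on the x-axis and recall on the y-axis. Alternatively, PrecisionRecallDisplay (already imported above) draws the more conventional recall-vs-precision orientation and picks predict_proba or decision_function automatically; a sketch of the same comparison:

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
PrecisionRecallDisplay.from_estimator(pipe_lr, X_valid, y_valid, ax=ax, name="Logistic regression")
PrecisionRecallDisplay.from_estimator(pipe_svc, X_valid, y_valid, ax=ax, name="SVC")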

Let’s look at the F1 scores#

lr_f1 = f1_score(y_valid, pipe_lr.predict(X_valid))
svc_f1 = f1_score(y_valid, pipe_svc.predict(X_valid))

print(lr_f1, svc_f1)
0.6463104325699746 0.553314121037464
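
Recall that F1 is the harmonic mean of precision and recall at the default threshold. As a quick sanity check for logistic regression (a sketch; precision_score and recall_score are extra imports not in the preamble above):

from sklearn.metrics import precision_score, recall_score

preds = pipe_lr.predict(X_valid)
prec, rec = precision_score(y_valid, preds), recall_score(y_valid, preds)
print(2 * prec * rec / (prec + rec), lr_f1)  # the two numbers should agree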

What about the average precision score?#

lr_ap = average_precision_score(y_valid, pipe_lr.predict_proba(X_valid)[:, 1])
svc_ap = average_precision_score(y_valid, pipe_svc.decision_function(X_valid))

print(lr_ap, svc_ap)
0.698147830641323 0.7629107575877127
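
Average precision summarizes the PR curve as the weighted sum AP = Σ (R_n − R_{n−1}) · P_n. We can reproduce it from the curve arrays computed earlier (a sketch; the result should match lr_ap above up to floating point):

# recall_lr decreases along the array, hence the minus sign
-np.sum(np.diff(recall_lr) * precision_lr[:-1])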

Comparing ROC curves#

Let’s look at the ROC curve for Logistic Regression first

RocCurveDisplay.from_estimator(pipe_lr, X_valid, y_valid, name="Logistic Regression")
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x124c45bd0>
[Figure: ROC curve for logistic regression on the validation set]

But what if we want to plot more than one classifier? Let’s look at the documentation.

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
RocCurveDisplay.from_estimator(pipe_lr, X_valid, y_valid, ax=ax, name="Logistic Regression")
RocCurveDisplay.from_estimator(pipe_svc, X_valid, y_valid, ax=ax, name="SVC")
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x125924ed0>
[Figure: ROC curves for logistic regression and SVC]
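
The ROC curves can also be summarized numerically with roc_auc_score (already imported above); a quick sketch comparing the two models on the validation set:

lr_auc = roc_auc_score(y_valid, pipe_lr.predict_proba(X_valid)[:, 1])
svc_auc = roc_auc_score(y_valid, pipe_svc.decision_function(X_valid))
print(lr_auc, svc_auc)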

Comparing class_weight#

Let’s explore how the class_weight argument impacts performance.

# Standard LogisticRegression
pipe_lr_std = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
pipe_lr_std.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=500))])
# Giving a weight of 1 to the non-fraud and 10 to fraud examples
pipe_lr_upw = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, class_weight={0:1, 1:10}))
pipe_lr_upw.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight={0: 1, 1: 10}, max_iter=500))])
# Balanced weights
pipe_lr_balanced = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, class_weight="balanced"))
pipe_lr_balanced.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=500))])
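
For reference, class_weight="balanced" assigns each class the weight n_samples / (n_classes * n_samples_in_class), so the rare fraud class gets a much larger weight. A quick sketch of what this works out to on our training labels:

from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
dict(zip(np.unique(y_train), weights))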

First let’s look at the precision-recall curves

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
PrecisionRecallDisplay.from_estimator(pipe_lr_std, X_valid, y_valid, ax=ax, name="Std")
PrecisionRecallDisplay.from_estimator(pipe_lr_upw, X_valid, y_valid, ax=ax, name="Upweight")
PrecisionRecallDisplay.from_estimator(pipe_lr_balanced, X_valid, y_valid, ax=ax, name="Balanced")
<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x1260725d0>
[Figure: precision-recall curves for the three class_weight settings]

Now let’s consider the ROC curves

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
RocCurveDisplay.from_estimator(pipe_lr_std, X_valid, y_valid, ax=ax, name="Std")
RocCurveDisplay.from_estimator(pipe_lr_upw, X_valid, y_valid, ax=ax, name="Upweight")
RocCurveDisplay.from_estimator(pipe_lr_balanced, X_valid, y_valid, ax=ax, name="Balanced")
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1260716d0>
[Figure: ROC curves for the three class_weight settings]
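
To complement the curves, we can also compare a few summary scores for the three settings on the validation set (a sketch using metrics already imported above):

for name, model in [("std", pipe_lr_std), ("upweight", pipe_lr_upw), ("balanced", pipe_lr_balanced)]:
    preds = model.predict(X_valid)
    ap = average_precision_score(y_valid, model.predict_proba(X_valid)[:, 1])
    print(name, "f1:", f1_score(y_valid, preds), "AP:", ap)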

ML fairness activity#

AI/ML systems can give the illusion of objectivity because they are derived from seemingly unbiased data and algorithms. However, humans are inherently biased, and AI/ML systems, if not carefully evaluated, can further amplify the existing inequities and systemic biases in our society.

How do we make sure our AI/ML systems are fair? Which metrics can we use to quantify ‘fairness’ in AI/ML systems?

Dataset for demonstration#

Let’s examine this on the adult census data set.

census_df = pd.read_csv("../data/adult.csv")
census_df.shape
train_df, test_df = train_test_split(census_df, test_size=0.4, random_state=42)

Data cleaning#

train_df_nan = train_df.replace("?", np.nan)
test_df_nan = test_df.replace("?", np.nan)
train_df_nan.shape
train_df_nan.head()
numeric_features = [
    "age",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
]

categorical_features = [
    "workclass",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "native.country",
]

ordinal_features = ["education"]
binary_features = [
    "sex"
]  # Not binary in general but in this particular dataset it seems to have only two possible values
drop_features = ["education.num", "fnlwgt"]
target = "income"
train_df["education"].unique()
education_levels = [
    "Preschool",
    "1st-4th",
    "5th-6th",
    "7th-8th",
    "9th",
    "10th",
    "11th",
    "12th",
    "HS-grad",
    "Prof-school",
    "Assoc-voc",
    "Assoc-acdm",
    "Some-college",
    "Bachelors",
    "Masters",
    "Doctorate",
]
assert set(education_levels) == set(train_df["education"].unique())
X_train = train_df_nan.drop(columns=[target])
y_train = train_df_nan[target]

X_test = test_df_nan.drop(columns=[target])
y_test = test_df_nan[target]
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

numeric_transformer = make_pipeline(StandardScaler())

ordinal_transformer = OrdinalEncoder(categories=[education_levels], dtype=int)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),  # note: this parameter is called sparse_output in scikit-learn >= 1.2
)

binary_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(drop="if_binary", dtype=int),
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (ordinal_transformer, ordinal_features),
    (binary_transformer, binary_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)
y_train.value_counts()

Let’s build our classification pipeline

pipe_lr = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))  # one reasonable choice: preprocessing + logistic regression
pipe_lr.fit(X_train, y_train)

And look at the confusion matrix.
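
A minimal sketch of that step, assuming pipe_lr is the fitted pipeline from the cell above (evaluated on the test set):

print(classification_report(y_test, pipe_lr.predict(X_test)))
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_test, y_test, normalize="true");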

Let’s examine the confusion matrix separately for the two genders present in the data.

X_train_enc = preprocessor.fit_transform(X_train)
preprocessor.named_transformers_["pipeline-2"]["onehotencoder"].get_feature_names_out()
X_test.head()
X_female = X_test.query("sex=='Female'")  # X where sex is female
X_male = X_test.query("sex=='Male'")  # X where sex is male

y_female = y_test[X_female.index]  # y where sex is female
y_male = y_test[X_male.index]  # y where sex is male

Get predictions for X_female and X_male with pipe_lr.

female_preds = pipe_lr.predict(X_female)
male_preds = pipe_lr.predict(X_male)

Let’s examine the accuracy and confusion matrix for the female group.

print(classification_report(y_female, female_preds))
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_female, y_female, normalize="true");

Let’s examine the accuracy and confusion matrix for the male group.

print(classification_report(y_male, male_preds))
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_male, y_male, normalize="true");
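
Since one of the discussion questions below compares false positive rates, here is a sketch of how they could be computed per group from the raw confusion matrices (assuming the income labels are "<=50K" and ">50K", with ">50K" as the positive class):

from sklearn.metrics import confusion_matrix

for name, X_g, y_g in [("female", X_female, y_female), ("male", X_male, y_male)]:
    tn, fp, fn, tp = confusion_matrix(y_g, pipe_lr.predict(X_g), labels=["<=50K", ">50K"]).ravel()
    print(name, "FPR:", fp / (fp + tn))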

❓❓ Questions for group discussion#

Let’s assume that a company is using this classifier for loan approval with a simple rule: if the predicted income is >=50K, approve the loan; otherwise, reject it.

In your group, discuss the questions below and write the main points from your discussion in this Google document.

  1. Which group has a higher accuracy?

  2. Which group has a higher precision for class >50K? What about recall for class >50K?

  3. Will both groups have more or less the same proportion of people with approved loans?

  4. If a male applicant and a female applicant both have the same level of income, will they have the same chance of getting the loan?

  5. Banks want to avoid approving unqualified applications (false positives) because loan defaults could have detrimental effects for them. Compare the false positive rates for the two groups.

  6. Overall, do you think this income classifier will fairly treat both groups? What will be the consequences of using this classifier in loan approval application?

Time permitting

  1. Do you think the effect would still exist if the sex feature were removed from the model (but you still had it available separately to build the two confusion matrices)?

  2. Are there any other groups in this dataset worth examining for biases?