import os
import sys

sys.path.append(os.path.join(os.path.abspath(".."), "code"))

import IPython
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
from IPython.display import HTML, display
from plotting_functions import *
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    average_precision_score,
    classification_report,
    f1_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC  # needed for pipe_svc below
from utils import *

%matplotlib inline
pd.set_option("display.max_colwidth", 200)

from IPython.display import Image

Exploring classification metrics#

Dataset for demonstration#

Let’s classify fraudulent and non-fraudulent transactions using Kaggle’s Credit Card Fraud Detection data set.

Loading the data#

cc_df = pd.read_csv("../data/creditcard.csv", encoding="latin-1")
train_df, test_df = train_test_split(cc_df, test_size=0.3, random_state=111)
train_df.head()
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
64454 51150.0 -3.538816 3.481893 -1.827130 -0.573050 2.644106 -0.340988 2.102135 -2.939006 2.578654 ... 0.530978 -0.860677 -0.201810 -1.719747 0.729143 -0.547993 -0.023636 -0.454966 1.00 0
37906 39163.0 -0.363913 0.853399 1.648195 1.118934 0.100882 0.423852 0.472790 -0.972440 0.033833 ... 0.687055 -0.094586 0.121531 0.146830 -0.944092 -0.558564 -0.186814 -0.257103 18.49 0
79378 57994.0 1.193021 -0.136714 0.622612 0.780864 -0.823511 -0.706444 -0.206073 -0.016918 0.781531 ... -0.310405 -0.842028 0.085477 0.366005 0.254443 0.290002 -0.036764 0.015039 23.74 0
245686 152859.0 1.604032 -0.808208 -1.594982 0.200475 0.502985 0.832370 -0.034071 0.234040 0.550616 ... 0.519029 1.429217 -0.139322 -1.293663 0.037785 0.061206 0.005387 -0.057296 156.52 0
60943 49575.0 -2.669614 -2.734385 0.662450 -0.059077 3.346850 -2.549682 -1.430571 -0.118450 0.469383 ... -0.228329 -0.370643 -0.211544 -0.300837 -1.174590 0.573818 0.388023 0.161782 57.50 0

5 rows × 31 columns

X_train_big, y_train_big = train_df.drop(columns=["Class", "Time"]), train_df["Class"]
X_test, y_test = test_df.drop(columns=["Class", "Time"]), test_df["Class"]
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_big, y_train_big, test_size=0.7, random_state=123
)
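
The credit card fraud data is extremely imbalanced, which is exactly why plain accuracy is not very informative here. A quick check of the class balance on our training split (a minimal sketch using the variables defined above):

# Proportion of non-fraud (Class = 0) vs. fraud (Class = 1) transactions in the training split
y_train.value_counts(normalize=True)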

Comparing PR curves#

Let’s create PR curves for SVC and Logistic Regression.

pipe_lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
pipe_lr.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=500))])
pipe_svc = make_pipeline(StandardScaler(), SVC())
pipe_svc.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])
pipe_lr.predict_proba(X_valid)
array([[9.99807444e-01, 1.92556242e-04],
       [9.99537296e-01, 4.62703836e-04],
       [9.99678149e-01, 3.21851032e-04],
       ...,
       [9.99907898e-01, 9.21018764e-05],
       [9.99882185e-01, 1.17814723e-04],
       [9.99845434e-01, 1.54565766e-04]])
precision_lr, recall_lr, thresholds_lr = precision_recall_curve(
    y_valid, pipe_lr.predict_proba(X_valid)[:, 1]
)
thresholds_lr
array([5.43344135e-10, 8.87215534e-10, 1.13373820e-09, ...,
       9.99999990e-01, 9.99999996e-01, 1.00000000e+00])
pipe_svc.decision_function(X_valid)
array([-1.12247976, -1.15220481, -1.05407669, ..., -1.18444729,
       -1.06337006, -1.05482241])
precision_svc, recall_svc, thresholds_svc = precision_recall_curve(
    y_valid, pipe_svc.decision_function(X_valid)
)
plt.plot(precision_svc, recall_svc, label="SVC")
plt.plot(precision_lr, recall_lr, label="Logistic regression")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best");
[Figure: precision-recall curves for SVC and logistic regression]
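
Note that the plot above puts precision on the x-axis and recall on the y-axis. Alternatively, PrecisionRecallDisplay (already imported above) draws the more conventional recall-vs-precision orientation and picks predict_proba or decision_function automatically; a sketch of the same comparison:

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
PrecisionRecallDisplay.from_estimator(pipe_lr, X_valid, y_valid, ax=ax, name="Logistic regression")
PrecisionRecallDisplay.from_estimator(pipe_svc, X_valid, y_valid, ax=ax, name="SVC")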

Let’s look at the F1 scores#

lr_f1 = f1_score(y_valid, pipe_lr.predict(X_valid))
svc_f1 = f1_score(y_valid, pipe_svc.predict(X_valid))

print(lr_f1, svc_f1)
0.6463104325699746 0.553314121037464
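
Recall that F1 is the harmonic mean of precision and recall at the default threshold. As a quick sanity check for logistic regression (a sketch; precision_score and recall_score are extra imports not in the preamble above):

from sklearn.metrics import precision_score, recall_score

preds = pipe_lr.predict(X_valid)
prec, rec = precision_score(y_valid, preds), recall_score(y_valid, preds)
print(2 * prec * rec / (prec + rec), lr_f1)  # the two numbers should agree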

What about the average precision score?#

lr_ap = average_precision_score(y_valid, pipe_lr.predict_proba(X_valid)[:, 1])
svc_ap = average_precision_score(y_valid, pipe_svc.decision_function(X_valid))

print(lr_ap, svc_ap)
0.698147830641323 0.7629107575877127
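
Average precision summarizes the PR curve as the weighted sum AP = Σ (R_n − R_{n−1}) · P_n. We can reproduce it from the curve arrays computed earlier (a sketch; the result should match lr_ap above up to floating point):

# recall_lr decreases along the array, hence the minus sign
-np.sum(np.diff(recall_lr) * precision_lr[:-1])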

Comparing ROC curves#

Let’s look at the ROC curve for Logistic Regression first

RocCurveDisplay.from_estimator(pipe_lr, X_valid, y_valid, name="Logistic Regression")
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x124c45bd0>
[Figure: ROC curve for logistic regression on the validation set]

But what if we want to plot more than one classifier? Let’s look at the documentation.

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
RocCurveDisplay.from_estimator(pipe_lr, X_valid, y_valid, ax=ax, name="Logistic Regression")
RocCurveDisplay.from_estimator(pipe_svc, X_valid, y_valid, ax=ax, name="SVC")
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x125924ed0>
[Figure: ROC curves for logistic regression and SVC]
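
The ROC curves can also be summarized numerically with roc_auc_score (already imported above); a quick sketch comparing the two models on the validation set:

lr_auc = roc_auc_score(y_valid, pipe_lr.predict_proba(X_valid)[:, 1])
svc_auc = roc_auc_score(y_valid, pipe_svc.decision_function(X_valid))
print(lr_auc, svc_auc)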

Comparing class_weight#

Let’s explore how the class_weight argument impacts performance.

# Standard LogisticRegression
pipe_lr_std = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500))
pipe_lr_std.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=500))])
# Giving a weight of 1 to the non-fraud and 10 to fraud examples
pipe_lr_upw = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, class_weight={0:1, 1:10}))
pipe_lr_upw.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight={0: 1, 1: 10}, max_iter=500))])
# Balanced weights
pipe_lr_balanced = make_pipeline(StandardScaler(), LogisticRegression(max_iter=500, class_weight="balanced"))
pipe_lr_balanced.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=500))])
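
For reference, class_weight="balanced" assigns each class the weight n_samples / (n_classes * n_samples_in_class), so the rare fraud class gets a much larger weight. A quick sketch of what this works out to on our training labels:

from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight(class_weight="balanced", classes=np.unique(y_train), y=y_train)
dict(zip(np.unique(y_train), weights))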

First let’s look at the precision-recall curves

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
PrecisionRecallDisplay.from_estimator(pipe_lr_std, X_valid, y_valid, ax=ax, name="Std")
PrecisionRecallDisplay.from_estimator(pipe_lr_upw, X_valid, y_valid, ax=ax, name="Upweight")
PrecisionRecallDisplay.from_estimator(pipe_lr_balanced, X_valid, y_valid, ax=ax, name="Balanced")
<sklearn.metrics._plot.precision_recall_curve.PrecisionRecallDisplay at 0x1260725d0>
[Figure: precision-recall curves for the three class_weight settings]

Now let’s consider the ROC curves

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
RocCurveDisplay.from_estimator(pipe_lr_std, X_valid, y_valid, ax=ax, name="Std")
RocCurveDisplay.from_estimator(pipe_lr_upw, X_valid, y_valid, ax=ax, name="Upweight")
RocCurveDisplay.from_estimator(pipe_lr_balanced, X_valid, y_valid, ax=ax, name="Balanced")
<sklearn.metrics._plot.roc_curve.RocCurveDisplay at 0x1260716d0>
[Figure: ROC curves for the three class_weight settings]
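
To complement the curves, we can also compare a few summary scores for the three settings on the validation set (a sketch using metrics already imported above):

for name, model in [("std", pipe_lr_std), ("upweight", pipe_lr_upw), ("balanced", pipe_lr_balanced)]:
    preds = model.predict(X_valid)
    ap = average_precision_score(y_valid, model.predict_proba(X_valid)[:, 1])
    print(name, "f1:", f1_score(y_valid, preds), "AP:", ap)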

ML fairness activity#

AI/ML systems can give the illusion of objectivity because they are derived from seemingly unbiased data and algorithms. However, humans are inherently biased, and AI/ML systems, if not carefully evaluated, can further amplify the existing inequities and systemic biases in our society.

How do we make sure our AI/ML systems are fair? Which metrics can we use to quantify ‘fairness’ in AI/ML systems?

Dataset for demonstration#

Let’s examine this on the adult census data set.

census_df = pd.read_csv("../data/adult.csv")
census_df.shape
train_df, test_df = train_test_split(census_df, test_size=0.4, random_state=42)

Data cleaning#

train_df_nan = train_df.replace("?", np.nan)
test_df_nan = test_df.replace("?", np.nan)
train_df_nan.shape
train_df_nan.head()
numeric_features = [
    "age",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
]

categorical_features = [
    "workclass",
    "marital.status",
    "occupation",
    "relationship",
    "race",
    "native.country",
]

ordinal_features = ["education"]
binary_features = [
    "sex"
]  # Not binary in general but in this particular dataset it seems to have only two possible values
drop_features = ["education.num", "fnlwgt"]
target = "income"
train_df["education"].unique()
education_levels = [
    "Preschool",
    "1st-4th",
    "5th-6th",
    "7th-8th",
    "9th",
    "10th",
    "11th",
    "12th",
    "HS-grad",
    "Prof-school",
    "Assoc-voc",
    "Assoc-acdm",
    "Some-college",
    "Bachelors",
    "Masters",
    "Doctorate",
]
assert set(education_levels) == set(train_df["education"].unique())
X_train = train_df_nan.drop(columns=[target])
y_train = train_df_nan[target]

X_test = test_df_nan.drop(columns=[target])
y_test = test_df_nan[target]
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

numeric_transformer = make_pipeline(StandardScaler())

ordinal_transformer = OrdinalEncoder(categories=[education_levels], dtype=int)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore", sparse=False),  # note: this parameter is called sparse_output in scikit-learn >= 1.2
)

binary_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(drop="if_binary", dtype=int),
)

preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (ordinal_transformer, ordinal_features),
    (binary_transformer, binary_features),
    (categorical_transformer, categorical_features),
    ("drop", drop_features),
)
y_train.value_counts()

Let’s build our classification pipeline

pipe_lr = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))  # one reasonable choice: preprocessing + logistic regression
pipe_lr.fit(X_train, y_train)

And look at the confusion matrix.
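
A minimal sketch of that step, assuming pipe_lr is the fitted pipeline from the cell above (evaluated on the test set):

print(classification_report(y_test, pipe_lr.predict(X_test)))
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_test, y_test, normalize="true");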

Let’s examine the confusion matrix separately for the two genders present in the data.

X_train_enc = preprocessor.fit_transform(X_train)
preprocessor.named_transformers_["pipeline-2"]["onehotencoder"].get_feature_names_out()
X_test.head()
X_female = X_test.query("sex=='Female'")  # X where sex is female
X_male = X_test.query("sex=='Male'")  # X where sex is male

y_female = y_test[X_female.index]  # y where sex is female
y_male = y_test[X_male.index]  # y where sex is male

Get predictions for X_female and X_male with pipe_lr.

female_preds = pipe_lr.predict(X_female)
male_preds = pipe_lr.predict(X_male)

Let’s examine the accuracy and confusion matrix for the female group.

print(classification_report(y_female, female_preds))
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_female, y_female, normalize="true");

Let’s examine the accuracy and confusion matrix for the male group.

print(classification_report(y_male, male_preds))
ConfusionMatrixDisplay.from_estimator(pipe_lr, X_male, y_male, normalize="true");
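
Since one of the discussion questions below compares false positive rates, here is a sketch of how they could be computed per group from the raw confusion matrices (assuming the income labels are "<=50K" and ">50K", with ">50K" as the positive class):

from sklearn.metrics import confusion_matrix

for name, X_g, y_g in [("female", X_female, y_female), ("male", X_male, y_male)]:
    tn, fp, fn, tp = confusion_matrix(y_g, pipe_lr.predict(X_g), labels=["<=50K", ">50K"]).ravel()
    print(name, "FPR:", fp / (fp + tn))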

❓❓ Questions for group discussion#

Let’s assume that a company is using this classifier for loan approval with a simple rule: if the predicted income is >=50K, approve the loan; otherwise, reject it.

In your group, discuss the questions below and write the main points from your discussion in this Google document.

  1. Which group has a higher accuracy?

  2. Which group has a higher precision for class >50K? What about recall for class >50K?

  3. Will both groups have more or less the same proportion of people with approved loans?

  4. If a male applicant and a female applicant both have the same level of income, will they have the same chance of getting the loan?

  5. Banks want to avoid approving unqualified applications (false positives) because loan defaults could have detrimental effects for them. Compare the false positive rates for the two groups.

  6. Overall, do you think this income classifier will fairly treat both groups? What will be the consequences of using this classifier in loan approval application?

Time permitting

  1. Do you think the effect would still exist if the sex feature were removed from the model (but you still had it available separately to build the two confusion matrices)?

  2. Are there any other groups in this dataset worth examining for biases?