{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "e316b5fe", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "\n", "sys.path.append(os.path.join(os.path.abspath(\"..\"), \"code\"))\n", "\n", "import IPython\n", "import matplotlib.pyplot as plt\n", "import mglearn\n", "import numpy as np\n", "import pandas as pd\n", "from IPython.display import HTML, display\n", "from plotting_functions import *\n", "from sklearn.dummy import DummyClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import average_precision_score, classification_report, f1_score, precision_recall_curve, roc_auc_score, roc_curve, ConfusionMatrixDisplay, PrecisionRecallDisplay, RocCurveDisplay\n", "from sklearn.model_selection import cross_val_score, cross_validate, train_test_split\n", "from sklearn.pipeline import Pipeline, make_pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from utils import *\n", "\n", "%matplotlib inline\n", "pd.set_option(\"display.max_colwidth\", 200)\n", "\n", "from IPython.display import Image" ] }, { "cell_type": "markdown", "id": "6f009286", "metadata": {}, "source": [ "# Exploring classification metrics" ] }, { "cell_type": "markdown", "id": "3d6e53c3", "metadata": {}, "source": [ "### Dataset for demonstration \n", "\n", "Let's classify fraudulent and non-fraudulent transactions using Kaggle's [Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud) data set." ] }, { "cell_type": "markdown", "id": "812c9f03", "metadata": {}, "source": [ "### Loading the data" ] }, { "cell_type": "code", "execution_count": 2, "id": "175ada15", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Time | \n", "V1 | \n", "V2 | \n", "V3 | \n", "V4 | \n", "V5 | \n", "V6 | \n", "V7 | \n", "V8 | \n", "V9 | \n", "... | \n", "V21 | \n", "V22 | \n", "V23 | \n", "V24 | \n", "V25 | \n", "V26 | \n", "V27 | \n", "V28 | \n", "Amount | \n", "Class | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
64454 | \n", "51150.0 | \n", "-3.538816 | \n", "3.481893 | \n", "-1.827130 | \n", "-0.573050 | \n", "2.644106 | \n", "-0.340988 | \n", "2.102135 | \n", "-2.939006 | \n", "2.578654 | \n", "... | \n", "0.530978 | \n", "-0.860677 | \n", "-0.201810 | \n", "-1.719747 | \n", "0.729143 | \n", "-0.547993 | \n", "-0.023636 | \n", "-0.454966 | \n", "1.00 | \n", "0 | \n", "
37906 | \n", "39163.0 | \n", "-0.363913 | \n", "0.853399 | \n", "1.648195 | \n", "1.118934 | \n", "0.100882 | \n", "0.423852 | \n", "0.472790 | \n", "-0.972440 | \n", "0.033833 | \n", "... | \n", "0.687055 | \n", "-0.094586 | \n", "0.121531 | \n", "0.146830 | \n", "-0.944092 | \n", "-0.558564 | \n", "-0.186814 | \n", "-0.257103 | \n", "18.49 | \n", "0 | \n", "
79378 | \n", "57994.0 | \n", "1.193021 | \n", "-0.136714 | \n", "0.622612 | \n", "0.780864 | \n", "-0.823511 | \n", "-0.706444 | \n", "-0.206073 | \n", "-0.016918 | \n", "0.781531 | \n", "... | \n", "-0.310405 | \n", "-0.842028 | \n", "0.085477 | \n", "0.366005 | \n", "0.254443 | \n", "0.290002 | \n", "-0.036764 | \n", "0.015039 | \n", "23.74 | \n", "0 | \n", "
245686 | \n", "152859.0 | \n", "1.604032 | \n", "-0.808208 | \n", "-1.594982 | \n", "0.200475 | \n", "0.502985 | \n", "0.832370 | \n", "-0.034071 | \n", "0.234040 | \n", "0.550616 | \n", "... | \n", "0.519029 | \n", "1.429217 | \n", "-0.139322 | \n", "-1.293663 | \n", "0.037785 | \n", "0.061206 | \n", "0.005387 | \n", "-0.057296 | \n", "156.52 | \n", "0 | \n", "
60943 | \n", "49575.0 | \n", "-2.669614 | \n", "-2.734385 | \n", "0.662450 | \n", "-0.059077 | \n", "3.346850 | \n", "-2.549682 | \n", "-1.430571 | \n", "-0.118450 | \n", "0.469383 | \n", "... | \n", "-0.228329 | \n", "-0.370643 | \n", "-0.211544 | \n", "-0.300837 | \n", "-1.174590 | \n", "0.573818 | \n", "0.388023 | \n", "0.161782 | \n", "57.50 | \n", "0 | \n", "
5 rows × 31 columns
\n", "Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression', LogisticRegression(max_iter=500))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression', LogisticRegression(max_iter=500))])
StandardScaler()
LogisticRegression(max_iter=500)
Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('standardscaler', StandardScaler()), ('svc', SVC())])
StandardScaler()
SVC()
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression', LogisticRegression(max_iter=500))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression', LogisticRegression(max_iter=500))])
StandardScaler()
LogisticRegression(max_iter=500)
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression',\n", " LogisticRegression(class_weight={0: 1, 1: 10}, max_iter=500))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression',\n", " LogisticRegression(class_weight={0: 1, 1: 10}, max_iter=500))])
StandardScaler()
LogisticRegression(class_weight={0: 1, 1: 10}, max_iter=500)
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression',\n", " LogisticRegression(class_weight='balanced', max_iter=500))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('standardscaler', StandardScaler()),\n", " ('logisticregression',\n", " LogisticRegression(class_weight='balanced', max_iter=500))])
StandardScaler()
LogisticRegression(class_weight='balanced', max_iter=500)