"
],
"text/plain": [
" UserName ScreenName Location TweetAt \\\n",
"1927 1928 46880 Seattle, WA 13-03-2020 \n",
"1068 1069 46021 NaN 13-03-2020 \n",
"803 804 45756 The Outer Limits 12-03-2020 \n",
"2846 2847 47799 Flagstaff, AZ 15-03-2020 \n",
"3768 3769 48721 Montreal, Canada 16-03-2020 \n",
"... ... ... ... ... \n",
"1122 1123 46075 NaN 13-03-2020 \n",
"1346 1347 46299 Toronto 13-03-2020 \n",
"3454 3455 48407 Houston, TX 16-03-2020 \n",
"3437 3438 48390 Washington, DC 16-03-2020 \n",
"3582 3583 48535 St James' Park, Newcastle 16-03-2020 \n",
"\n",
" OriginalTweet Sentiment \n",
"1927 While I don't like all of Amazon's choices, to... Positive \n",
"1068 Me: shit buckets, its time to do the weekly s... Negative \n",
"803 @SecPompeo @realDonaldTrump You mean the plan ... Neutral \n",
"2846 @lauvagrande People who are sick arent panic ... Extremely Negative \n",
"3768 Coronavirus Panic: Toilet Paper Is the People... Negative \n",
"... ... ... \n",
"1122 Photos of our local grocery store shelveswher... Extremely Positive \n",
"1346 Just went to the the grocery store (Highland F... Positive \n",
"3454 Real talk though. Am I the only one spending h... Neutral \n",
"3437 The supermarket business is booming! #COVID2019 Neutral \n",
"3582 Evening All Here s the story on the and the im... Positive \n",
"\n",
"[3038 rows x 6 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',\n",
" 'Sentiment'],\n",
" dtype='object')"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Location\n",
"United States 63\n",
"London, England 37\n",
"Los Angeles, CA 30\n",
"New York, NY 29\n",
"Washington, DC 29\n",
" ..\n",
"Suburb of Chicago 1\n",
"philippines 1\n",
"Dont ask for freedom, take it. 1\n",
"Windsor Heights, IA 1\n",
"St James' Park, Newcastle 1\n",
"Name: count, Length: 1441, dtype: int64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df['Location'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"X_train, y_train = train_df[['OriginalTweet', 'Location']], train_df['Sentiment']\n",
"X_test, y_test = test_df[['OriginalTweet', 'Location']], test_df['Sentiment']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Sentiment\n",
"Negative 852\n",
"Positive 743\n",
"Neutral 501\n",
"Extremely Negative 472\n",
"Extremely Positive 470\n",
"Name: count, dtype: int64"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"scoring_metrics = 'accuracy'"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"results = {}"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):\n",
" \"\"\"\n",
" Returns mean and std of cross validation\n",
"\n",
" Parameters\n",
" ----------\n",
" model :\n",
" scikit-learn model\n",
" X_train : numpy array or pandas DataFrame\n",
" X in the training data\n",
" y_train :\n",
" y in the training data\n",
"\n",
" Returns\n",
" ----------\n",
" pandas Series with mean scores from cross_validation\n",
" \"\"\"\n",
"\n",
" scores = cross_validate(model, X_train, y_train, **kwargs)\n",
"\n",
" mean_scores = pd.DataFrame(scores).mean()\n",
" std_scores = pd.DataFrame(scores).std()\n",
" out_col = []\n",
"\n",
" for i in range(len(mean_scores)):\n",
" out_col.append((f\"%0.3f (+/- %0.3f)\" % (mean_scores[i], std_scores[i])))\n",
"\n",
" return pd.Series(data=out_col, index=mean_scores.index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dummy classifier"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fit_time
\n",
"
score_time
\n",
"
test_score
\n",
"
train_score
\n",
"
\n",
" \n",
" \n",
"
\n",
"
dummy
\n",
"
0.001 (+/- 0.001)
\n",
"
0.001 (+/- 0.001)
\n",
"
0.280 (+/- 0.001)
\n",
"
0.280 (+/- 0.000)
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fit_time score_time test_score \\\n",
"dummy 0.001 (+/- 0.001) 0.001 (+/- 0.001) 0.280 (+/- 0.001) \n",
"\n",
" train_score \n",
"dummy 0.280 (+/- 0.000) "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dummy = DummyClassifier()\n",
"results[\"dummy\"] = mean_std_cross_val_scores(\n",
" dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics\n",
")\n",
"pd.DataFrame(results).T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bag-of-words model "
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fit_time
\n",
"
score_time
\n",
"
test_score
\n",
"
train_score
\n",
"
\n",
" \n",
" \n",
"
\n",
"
dummy
\n",
"
0.001 (+/- 0.001)
\n",
"
0.001 (+/- 0.001)
\n",
"
0.280 (+/- 0.001)
\n",
"
0.280 (+/- 0.000)
\n",
"
\n",
"
\n",
"
logistic regression
\n",
"
0.278 (+/- 0.019)
\n",
"
0.008 (+/- 0.000)
\n",
"
0.414 (+/- 0.012)
\n",
"
0.999 (+/- 0.000)
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fit_time score_time test_score \\\n",
"dummy 0.001 (+/- 0.001) 0.001 (+/- 0.001) 0.280 (+/- 0.001) \n",
"logistic regression 0.278 (+/- 0.019) 0.008 (+/- 0.000) 0.414 (+/- 0.012) \n",
"\n",
" train_score \n",
"dummy 0.280 (+/- 0.000) \n",
"logistic regression 0.999 (+/- 0.000) "
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"pipe = make_pipeline(CountVectorizer(stop_words='english'), \n",
" LogisticRegression(max_iter=1000))\n",
"results[\"logistic regression\"] = mean_std_cross_val_scores(\n",
" pipe, X_train['OriginalTweet'], y_train, return_train_score=True, scoring=scoring_metrics\n",
")\n",
"pd.DataFrame(results).T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Is it possible to further improve the scores?\n",
"\n",
"- How about adding new features based on our intuitions? Let's extract our own features that might be useful for this prediction task. In other words, let's carry out **feature engineering**. \n",
"\n",
"- The code below adds some very basic length-related and sentiment features. We will be using a popular library called `nltk` for this exercise. If you have successfully created the course `conda` environment on your machine, you should already have this package in the environment. "
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"- How do we extract interesting information from text?\n",
"- We use **pre-trained models**! "
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"- A couple of popular libraries which include such pre-trained models. \n",
"- `nltk`\n",
"```\n",
"conda install -c anaconda nltk \n",
"``` \n",
"- spaCy\n",
"```\n",
"conda install -c conda-forge spacy\n",
"```\n",
"\n",
"For emoji support: \n",
"```\n",
"pip install spacymoji\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- You also need to download the language model which contains all the pre-trained models. For that run the following in your course `conda` environment or here. "
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# !python -m spacy download en_core_web_md"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /Users/kvarada/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"\n",
"nltk.download(\"punkt\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package vader_lexicon to\n",
"[nltk_data] /Users/kvarada/nltk_data...\n",
"[nltk_data] Package vader_lexicon is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /Users/kvarada/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"nltk.download(\"vader_lexicon\")\n",
"nltk.download(\"punkt\")\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"\n",
"sid = SentimentIntensityAnalyzer()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'neg': 0.0, 'neu': 0.368, 'pos': 0.632, 'compound': 0.8225}\n"
]
}
],
"source": [
"s = \"CPSC 330 students are smart, sweet, and funny.\"\n",
"print(sid.polarity_scores(s))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'neg': 0.249, 'neu': 0.751, 'pos': 0.0, 'compound': -0.5106}\n"
]
}
],
"source": [
"s = \"CPSC 330 students are tired because of all the hard work they have been doing.\"\n",
"print(sid.polarity_scores(s))"
]
},
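{
"cell_type": "markdown",
"metadata": {},
"source": [
"The compound score is a single number between -1 and +1. Here is a minimal sketch of turning it into discrete labels, assuming the conventional thresholds suggested in the VADER documentation (`compound_to_label` is our own helper name):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def compound_to_label(compound, pos_threshold=0.05, neg_threshold=-0.05):\n",
"    \"\"\"Map a VADER compound score to a coarse sentiment label.\n",
"\n",
"    The +/- 0.05 thresholds follow the convention suggested in the VADER documentation.\n",
"    \"\"\"\n",
"    if compound >= pos_threshold:\n",
"        return \"positive\"\n",
"    if compound <= neg_threshold:\n",
"        return \"negative\"\n",
"    return \"neutral\"\n",
"\n",
"compound_to_label(sid.polarity_scores(s)[\"compound\"])"
]
},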
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### [spaCy](https://spacy.io/) \n",
"\n",
"A useful package for text processing and feature extraction\n",
"- Active development: https://github.com/explosion/spaCy\n",
"- Interactive lessons by Ines Montani: https://course.spacy.io/en/\n",
"- Good documentation, easy to use, and customizable.\n",
"\n",
"To run the code below, you have to download the pretrained model in the course environment. \n",
"\n",
"> python -m spacy download en_core_web_md"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_md\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"sample_text = \"\"\"Dolly Parton is a gift to us all. \n",
"From writing all-time great songs like “Jolene” and “I Will Always Love You”, \n",
"to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine, \n",
"she’s given us so much. Now, Netflix bring us Dolly Parton’s Christmas on the Square, \n",
"an original musical that stars Christine Baranski as a Scrooge-like landowner \n",
"who threatens to evict an entire town on Christmas Eve to make room for a new mall. \n",
"Directed and choreographed by the legendary Debbie Allen and counting Jennifer Lewis \n",
"and Parton herself amongst its cast, Christmas on the Square seems like the perfect movie\n",
"to save Christmas 2020. 😻 👍🏿\"\"\"\n",
"\n",
"# [Adapted from here.](https://thepopbreak.com/2020/11/22/dolly-partons-christmas-on-the-square-review-not-quite-a-christmas-miracle/)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"Spacy extracts all interesting information from text with this call."
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"doc = nlp(sample_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Let's look at part-of-speech tags. "
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(Dolly, 'PROPN'), (Parton, 'PROPN'), (is, 'AUX'), (a, 'DET'), (gift, 'NOUN'), (to, 'ADP'), (us, 'PRON'), (all, 'PRON'), (., 'PUNCT'), (\n",
", 'SPACE'), (From, 'ADP'), (writing, 'VERB'), (all, 'DET'), (-, 'PUNCT'), (time, 'NOUN'), (great, 'ADJ'), (songs, 'NOUN'), (like, 'ADP'), (“, 'PUNCT'), (Jolene, 'PROPN')]\n"
]
}
],
"source": [
"print([(token, token.pos_) for token in doc][:20])"
]
},
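{
"cell_type": "markdown",
"metadata": {},
"source": [
"Part-of-speech tags are only one of many token-level attributes. A small sketch of a few others that often come in handy for feature engineering (reusing the `doc` from above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lemma_ gives the base form, is_stop flags stopwords, like_num flags number-like tokens.\n",
"[(token.text, token.lemma_, token.is_stop, token.like_num) for token in doc[:10]]"
]
},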
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"- Often we want to know who did what to whom. \n",
"- **Named entities** give you this information. \n",
"- What are named entities in the text? "
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
" Dolly Parton\n",
" PERSON\n",
"\n",
" is a gift to us all. From writing all-time great songs like “\n",
"\n",
" Jolene\n",
" PERSON\n",
"\n",
"” and “I Will Always Love You”, to great performances in films like \n",
"\n",
" 9 to 5\n",
" DATE\n",
"\n",
", to helping fund a COVID-19 vaccine, she’s given us so much. Now, \n",
"\n",
" Netflix\n",
" ORG\n",
"\n",
" bring us \n",
"\n",
" Dolly Parton\n",
" PERSON\n",
"\n",
"’s \n",
"\n",
" Christmas\n",
" DATE\n",
"\n",
" on the \n",
"\n",
" Square\n",
" FAC\n",
"\n",
", an original musical that stars \n",
"\n",
" Christine Baranski\n",
" PERSON\n",
"\n",
" as a Scrooge-like landowner who threatens to evict an entire town on \n",
"\n",
" Christmas Eve\n",
" DATE\n",
"\n",
" to make room for a new mall. Directed and choreographed by the legendary \n",
"\n",
" Debbie Allen\n",
" PERSON\n",
"\n",
" and counting \n",
"\n",
" Jennifer Lewis\n",
" PERSON\n",
"\n",
" and \n",
"\n",
" Parton\n",
" PERSON\n",
"\n",
" herself amongst its cast, \n",
"\n",
" Christmas\n",
" DATE\n",
"\n",
" on the \n",
"\n",
" Square\n",
" FAC\n",
"\n",
" seems like the perfect movie to save \n",
"\n",
" Christmas 2020\n",
" DATE\n",
"\n",
". 😻 👍🏿
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from spacy import displacy\n",
"\n",
"displacy.render(doc, style=\"ent\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Named entities:\n",
" [('Dolly Parton', 'PERSON'), ('Jolene', 'PERSON'), ('9 to 5', 'DATE'), ('Netflix', 'ORG'), ('Dolly Parton', 'PERSON'), ('Christmas', 'DATE'), ('Square', 'FAC'), ('Christine Baranski', 'PERSON'), ('Christmas Eve', 'DATE'), ('Debbie Allen', 'PERSON'), ('Jennifer Lewis', 'PERSON'), ('Parton', 'PERSON'), ('Christmas', 'DATE'), ('Square', 'FAC'), ('Christmas 2020', 'DATE')]\n",
"\n",
"ORG means: Companies, agencies, institutions, etc.\n",
"\n",
"PERSON means: People, including fictional\n",
"\n",
"DATE means: Absolute or relative dates or periods\n"
]
}
],
"source": [
"print(\"Named entities:\\n\", [(ent.text, ent.label_) for ent in doc.ents])\n",
"print(\"\\nORG means: \", spacy.explain(\"ORG\"))\n",
"print(\"\\nPERSON means: \", spacy.explain(\"PERSON\"))\n",
"print(\"\\nDATE means: \", spacy.explain(\"DATE\"))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### An example from a project \n",
"\n",
"Goal: Extract and visualize inter-corporate relationships from disclosed annual 10-K reports of public companies. \n",
"\n",
"[Source for the text below.](https://www.bbc.com/news/business-39875417)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"text = (\n",
" \"Heavy hitters, including Microsoft and Google, \"\n",
" \"are competing for customers in cloud services with the likes of IBM and Salesforce.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
Heavy hitters, including \n",
"\n",
" Microsoft\n",
" ORG\n",
"\n",
" and \n",
"\n",
" Google\n",
" ORG\n",
"\n",
", are competing for customers in cloud services with the likes of \n",
"\n",
" IBM\n",
" ORG\n",
"\n",
" and \n",
"\n",
" Salesforce\n",
" PRODUCT\n",
"\n",
".
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Named entities:\n",
" [('Microsoft', 'ORG'), ('Google', 'ORG'), ('IBM', 'ORG'), ('Salesforce', 'PRODUCT')]\n"
]
}
],
"source": [
"doc = nlp(text)\n",
"displacy.render(doc, style=\"ent\")\n",
"print(\"Named entities:\\n\", [(ent.text, ent.label_) for ent in doc.ents])"
]
},
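{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above only finds the entities. One simple way to sketch inter-corporate relationships is to treat organizations mentioned in the same sentence as related. This is a rough, hypothetical sketch, not the project's actual method; note that the model tagged Salesforce as `PRODUCT` above, so it would be missed here, which is one reason real projects need more careful relation extraction."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from itertools import combinations\n",
"\n",
"# Hypothetical sketch: treat ORG entities co-occurring in a sentence as related.\n",
"edges = []\n",
"for sent in doc.sents:\n",
"    orgs = [ent.text for ent in sent.ents if ent.label_ == \"ORG\"]\n",
"    edges.extend(combinations(orgs, 2))\n",
"edges"
]
},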
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"If you want emoji identification support install [`spacymoji`](https://pypi.org/project/spacymoji/) in the course environment. \n",
"\n",
"```\n",
"pip install spacymoji\n",
"```\n",
"\n",
"After installing `spacymoji`, if it's still complaining about module not found, my guess is that you do not have `pip` installed in your `conda` environment. Go to your course `conda` environment install `pip` and install the `spacymoji` package in the environment using the `pip` you just installed in the current environment. \n",
"\n",
"```\n",
"conda install pip\n",
"YOUR_MINICONDA_PATH/miniconda3/envs/cpsc330/bin/pip install spacymoji\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"from spacymoji import Emoji\n",
"\n",
"nlp.add_pipe(\"emoji\", first=True);"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Does the text have any emojis? If yes, extract the description. "
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('😻', 138, 'smiling cat with heart-eyes'),\n",
" ('👍🏿', 139, 'thumbs up dark skin tone')]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc = nlp(sample_text)\n",
"doc._.emoji"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple feature engineering for our problem. "
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"import en_core_web_md\n",
"import spacy\n",
"\n",
"nlp = en_core_web_md.load()\n",
"from spacymoji import Emoji\n",
"\n",
"nlp.add_pipe(\"emoji\", first=True)\n",
"\n",
"def get_relative_length(text, TWITTER_ALLOWED_CHARS=280.0):\n",
" \"\"\"\n",
" Returns the relative length of text.\n",
"\n",
" Parameters:\n",
" ------\n",
" text: (str)\n",
" the input text\n",
"\n",
" Keyword arguments:\n",
" ------\n",
" TWITTER_ALLOWED_CHARS: (float)\n",
" the denominator for finding relative length\n",
"\n",
" Returns:\n",
" -------\n",
" relative length of text: (float)\n",
"\n",
" \"\"\"\n",
" return len(text) / TWITTER_ALLOWED_CHARS\n",
"\n",
"\n",
"def get_length_in_words(text):\n",
" \"\"\"\n",
" Returns the length of the text in words.\n",
"\n",
" Parameters:\n",
" ------\n",
" text: (str)\n",
" the input text\n",
"\n",
" Returns:\n",
" -------\n",
" length of tokenized text: (int)\n",
"\n",
" \"\"\"\n",
" return len(nltk.word_tokenize(text))\n",
"\n",
"\n",
"def get_sentiment(text):\n",
" \"\"\"\n",
" Returns the compound score representing the sentiment: -1 (most extreme negative) and +1 (most extreme positive)\n",
" The compound score is a normalized score calculated by summing the valence scores of each word in the lexicon.\n",
"\n",
" Parameters:\n",
" ------\n",
" text: (str)\n",
" the input text\n",
"\n",
" Returns:\n",
" -------\n",
" sentiment of the text: (str)\n",
" \"\"\"\n",
" scores = sid.polarity_scores(text)\n",
" return scores[\"compound\"]\n",
"\n",
"def get_avg_word_length(text):\n",
" \"\"\"\n",
" Returns the average word length of the given text.\n",
"\n",
" Parameters:\n",
" text -- (str)\n",
" \"\"\"\n",
" words = text.split()\n",
" return sum(len(word) for word in words) / len(words)\n",
"\n",
"\n",
"def has_emoji(text):\n",
" \"\"\"\n",
" Returns the average word length of the given text.\n",
"\n",
" Parameters:\n",
" text -- (str)\n",
" \"\"\"\n",
" doc = nlp(text)\n",
" return 1 if doc._.has_emoji else 0"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /Users/kvarada/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"nltk.download('punkt_tab')"
]
},
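{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the helper functions defined and the tokenizer data downloaded, the engineered features can be added as new columns. A minimal sketch, assuming the functions above (the column names are our own choice):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: add the engineered features as new columns on a copy of X_train.\n",
"X_train_enhanced = X_train.assign(\n",
"    n_words=X_train[\"OriginalTweet\"].apply(get_length_in_words),\n",
"    vader_sentiment=X_train[\"OriginalTweet\"].apply(get_sentiment),\n",
"    rel_char_len=X_train[\"OriginalTweet\"].apply(get_relative_length),\n",
")"
]
},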
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.