Appendix A: Demo of feature engineering for text data#

Import#

We will be using the COVID-19 tweets dataset for this demo.

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
import pandas as pd
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

sys.path.append(os.path.join(os.path.abspath(".."), "code"))
DATA_DIR = os.path.join(os.path.abspath(".."), "data/")
df = pd.read_csv(DATA_DIR + 'Corona_NLP_test.csv')
df['Sentiment'].value_counts()
Sentiment
Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: count, dtype: int64
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
train_df
UserName ScreenName Location TweetAt OriginalTweet Sentiment
1927 1928 46880 Seattle, WA 13-03-2020 While I don't like all of Amazon's choices, to... Positive
1068 1069 46021 NaN 13-03-2020 Me: shit buckets, it’s time to do the weekly s... Negative
803 804 45756 The Outer Limits 12-03-2020 @SecPompeo @realDonaldTrump You mean the plan ... Neutral
2846 2847 47799 Flagstaff, AZ 15-03-2020 @lauvagrande People who are sick aren’t panic ... Extremely Negative
3768 3769 48721 Montreal, Canada 16-03-2020 Coronavirus Panic: Toilet Paper Is the “People... Negative
... ... ... ... ... ... ...
1122 1123 46075 NaN 13-03-2020 Photos of our local grocery store shelves—wher... Extremely Positive
1346 1347 46299 Toronto 13-03-2020 Just went to the the grocery store (Highland F... Positive
3454 3455 48407 Houston, TX 16-03-2020 Real talk though. Am I the only one spending h... Neutral
3437 3438 48390 Washington, DC 16-03-2020 The supermarket business is booming! #COVID2019 Neutral
3582 3583 48535 St James' Park, Newcastle 16-03-2020 Evening All Here s the story on the and the im... Positive

3038 rows × 6 columns

train_df.columns
Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')
train_df['Location'].value_counts()
Location
United States                     63
London, England                   37
Los Angeles, CA                   30
New York, NY                      29
Washington, DC                    29
                                  ..
Suburb of Chicago                  1
philippines                        1
Dont ask for freedom, take it.     1
Windsor Heights, IA                1
St James' Park, Newcastle          1
Name: count, Length: 1441, dtype: int64
X_train, y_train = train_df[['OriginalTweet', 'Location']], train_df['Sentiment']
X_test, y_test = test_df[['OriginalTweet', 'Location']], test_df['Sentiment']
y_train.value_counts()
Sentiment
Negative              852
Positive              743
Neutral               501
Extremely Negative    472
Extremely Positive    470
Name: count, dtype: int64
scoring_metrics = 'accuracy'
results = {}
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores.iloc[i], std_scores.iloc[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

Dummy classifier#

dummy = DummyClassifier()
results["dummy"] = mean_std_cross_val_scores(
    dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T
fit_time score_time test_score train_score
dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) 0.280 (+/- 0.000)
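The dummy score of about 0.280 is simply the proportion of the most frequent class ('Negative') in the training set, since DummyClassifier by default predicts the most frequent class. A quick sanity check:

# Proportion of the most frequent class in y_train (≈ 0.28, matching the dummy score above)
y_train.value_counts(normalize=True).max()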

Bag-of-words model#

from sklearn.feature_extraction.text import CountVectorizer
pipe = make_pipeline(CountVectorizer(stop_words='english'), 
                     LogisticRegression(max_iter=1000))
results["logistic regression"] = mean_std_cross_val_scores(
    pipe, X_train['OriginalTweet'], y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T
fit_time score_time test_score train_score
dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) 0.280 (+/- 0.000)
logistic regression 0.306 (+/- 0.015) 0.008 (+/- 0.000) 0.414 (+/- 0.012) 0.999 (+/- 0.000)
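Under the hood, CountVectorizer represents each tweet as a vector of word counts over the training vocabulary. Here is a minimal sketch on two made-up sentences (for illustration only) showing what the representation looks like:

from sklearn.feature_extraction.text import CountVectorizer

toy_corpus = [
    "The grocery store is out of toilet paper",
    "Panic buying toilet paper at the grocery store",
]
toy_vec = CountVectorizer(stop_words="english")
toy_X = toy_vec.fit_transform(toy_corpus)
print(toy_vec.get_feature_names_out())  # the learned vocabulary
print(toy_X.toarray())  # one row per sentence, one column per vocabulary word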

Is it possible to further improve the scores?#

  • How about adding new features based on our intuitions? Let’s extract our own features that might be useful for this prediction task. In other words, let’s carry out feature engineering.

  • The code below adds some very basic length-related and sentiment features. We will be using a popular library called nltk for this exercise. If you have successfully created the course conda environment on your machine, you should already have this package in the environment.

  • How do we extract interesting information from text?

  • We use pre-trained models!

  • A couple of popular libraries that include such pre-trained models:

  • nltk

conda install -c anaconda nltk 
  • spaCy

conda install -c conda-forge spacy

For emoji support:

pip install spacymoji
  • You also need to download the language model, which contains the pre-trained components. To do so, run the following in your course conda environment (or in a notebook cell, as shown below).

import spacy

# !python -m spacy download en_core_web_md
import nltk

nltk.download("punkt")
[nltk_data] Downloading package punkt to /Users/mathias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
nltk.download("vader_lexicon")
nltk.download("punkt")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mathias/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mathias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
s = "CPSC 330 students are smart, sweet, and funny."
print(sid.polarity_scores(s))
{'neg': 0.0, 'neu': 0.368, 'pos': 0.632, 'compound': 0.8225}
s = "CPSC 330 students are tired because of all the hard work they have been doing."
print(sid.polarity_scores(s))
{'neg': 0.249, 'neu': 0.751, 'pos': 0.0, 'compound': -0.5106}

spaCy#

A useful package for text processing and feature extraction

  • Active development: https://github.com/explosion/spaCy

  • Interactive lessons by Ines Montani: https://course.spacy.io/en/

  • Good documentation, easy to use, and customizable.

To run the code below, you have to download the pretrained model in the course environment.

python -m spacy download en_core_web_md

import spacy

nlp = spacy.load("en_core_web_md")
sample_text = """Dolly Parton is a gift to us all. 
From writing all-time great songs like “Jolene” and “I Will Always Love You”, 
to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine, 
she’s given us so much. Now, Netflix bring us Dolly Parton’s Christmas on the Square, 
an original musical that stars Christine Baranski as a Scrooge-like landowner 
who threatens to evict an entire town on Christmas Eve to make room for a new mall. 
Directed and choreographed by the legendary Debbie Allen and counting Jennifer Lewis 
and Parton herself amongst its cast, Christmas on the Square seems like the perfect movie
to save Christmas 2020. 😻 👍🏿"""

# [Adapted from here.](https://thepopbreak.com/2020/11/22/dolly-partons-christmas-on-the-square-review-not-quite-a-christmas-miracle/)

spaCy extracts all of this information from the text with a single call.

doc = nlp(sample_text)

Let’s look at part-of-speech tags.

print([(token, token.pos_) for token in doc][:20])
[(Dolly, 'PROPN'), (Parton, 'PROPN'), (is, 'AUX'), (a, 'DET'), (gift, 'NOUN'), (to, 'ADP'), (us, 'PRON'), (all, 'PRON'), (., 'PUNCT'), (
, 'SPACE'), (From, 'ADP'), (writing, 'VERB'), (all, 'DET'), (-, 'PUNCT'), (time, 'NOUN'), (great, 'ADJ'), (songs, 'NOUN'), (like, 'ADP'), (“, 'PUNCT'), (Jolene, 'PROPN')]
  • Often we want to know who did what to whom.

  • Named entities give you this information.

  • What are named entities in the text?

from spacy import displacy

displacy.render(doc, style="ent")
Dolly Parton PERSON is a gift to us all.
From writing all-time great songs like “ Jolene PERSON ” and “ I Will Always Love You WORK_OF_ART ”,
to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine,
she’s given us so much. Now, Netflix ORG bring us Dolly Parton’s PERSON Christmas DATE on the Square,
an original musical that stars Christine Baranski PERSON as a Scrooge-like landowner
who threatens to evict an entire town on Christmas Eve DATE to make room for a new mall.
Directed and choreographed by the legendary Debbie Allen PERSON and counting Jennifer Lewis PERSON
and Parton PERSON herself amongst its cast, Christmas DATE on the Square seems like the perfect movie
to save Christmas 2020 DATE . 😻 👍🏿
print("Named entities:\n", [(ent.text, ent.label_) for ent in doc.ents])
print("\nORG means: ", spacy.explain("ORG"))
print("\nPERSON means: ", spacy.explain("PERSON"))
print("\nDATE means: ", spacy.explain("DATE"))
Named entities:
 [('Dolly Parton', 'PERSON'), ('Jolene', 'PERSON'), ('I Will Always Love You', 'WORK_OF_ART'), ('Netflix', 'ORG'), ('Dolly Parton’s', 'PERSON'), ('Christmas', 'DATE'), ('Christine Baranski', 'PERSON'), ('Christmas Eve', 'DATE'), ('Debbie Allen', 'PERSON'), ('Jennifer Lewis', 'PERSON'), ('Parton', 'PERSON'), ('Christmas', 'DATE'), ('Christmas 2020', 'DATE')]

ORG means:  Companies, agencies, institutions, etc.

PERSON means:  People, including fictional

DATE means:  Absolute or relative dates or periods
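Named entities can themselves be turned into features, for example by counting how many entities of each type appear in a document. A minimal sketch (the entity_type_counts helper below is our own, not part of spaCy):

from collections import Counter

def entity_type_counts(text):
    """Count how many named entities of each type spaCy finds in the text."""
    doc = nlp(text)
    return Counter(ent.label_ for ent in doc.ents)

entity_type_counts(sample_text)  # e.g. counts of PERSON, DATE, ...; exact numbers depend on the model version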

An example from a project#

Goal: Extract and visualize inter-corporate relationships from disclosed annual 10-K reports of public companies.

Source for the text below.

text = (
    "Heavy hitters, including Microsoft and Google, "
    "are competing for customers in cloud services with the likes of IBM and Salesforce."
)
doc = nlp(text)
displacy.render(doc, style="ent")
print("Named entities:\n", [(ent.text, ent.label_) for ent in doc.ents])
Heavy hitters, including Microsoft ORG and Google ORG , are competing for customers in cloud services with the likes of IBM ORG and Salesforce PERSON .
Named entities:
 [('Microsoft', 'ORG'), ('Google', 'ORG'), ('IBM', 'ORG'), ('Salesforce', 'PERSON')]

If you want emoji identification support, install spacymoji in the course environment.

pip install spacymoji

If Python still complains about a missing module after installing spacymoji, you likely do not have pip installed in your conda environment. Activate your course conda environment, install pip there, and then install spacymoji using that environment's pip.

conda install pip
YOUR_MINICONDA_PATH/miniconda3/envs/cpsc330/bin/pip install spacymoji
from spacymoji import Emoji

nlp.add_pipe("emoji", first=True);

Does the text have any emojis? If yes, extract the description.

doc = nlp(sample_text)
doc._.emoji
[('😻', 138, 'smiling cat with heart-eyes'),
 ('👍🏿', 139, 'thumbs up dark skin tone')]

Simple feature engineering for our problem.#

import en_core_web_md
import spacy

nlp = en_core_web_md.load()
from spacymoji import Emoji

nlp.add_pipe("emoji", first=True)

def get_relative_length(text, TWITTER_ALLOWED_CHARS=280.0):
    """
    Returns the relative length of text.

    Parameters:
    ------
    text: (str)
    the input text

    Keyword arguments:
    ------
    TWITTER_ALLOWED_CHARS: (float)
    the denominator for finding relative length

    Returns:
    -------
    relative length of text: (float)

    """
    return len(text) / TWITTER_ALLOWED_CHARS


def get_length_in_words(text):
    """
    Returns the length of the text in words.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    length of tokenized text: (int)

    """
    return len(nltk.word_tokenize(text))


def get_sentiment(text):
    """
    Returns the compound score representing the sentiment, ranging from -1 (most extreme negative) to +1 (most extreme positive).
    The compound score is a normalized score calculated by summing the valence scores of each word in the lexicon.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    compound sentiment score of the text: (float)
    """
    scores = sid.polarity_scores(text)
    return scores["compound"]

def get_avg_word_length(text):
    """
    Returns the average word length of the given text.

    Parameters:
    text -- (str)
    """
    words = text.split()
    return sum(len(word) for word in words) / len(words)


def has_emoji(text):
    """
    Returns 1 if the given text contains an emoji, and 0 otherwise.

    Parameters:
    text -- (str)
    """
    doc = nlp(text)
    return 1 if doc._.has_emoji else 0
import nltk
nltk.download('punkt_tab')
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mathias/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
True
train_df = train_df.assign(n_words=train_df["OriginalTweet"].apply(get_length_in_words))
train_df = train_df.assign(vader_sentiment=train_df["OriginalTweet"].apply(get_sentiment))
train_df = train_df.assign(rel_char_len=train_df["OriginalTweet"].apply(get_relative_length))

test_df = test_df.assign(n_words=test_df["OriginalTweet"].apply(get_length_in_words))
test_df = test_df.assign(vader_sentiment=test_df["OriginalTweet"].apply(get_sentiment))
test_df = test_df.assign(rel_char_len=test_df["OriginalTweet"].apply(get_relative_length))


train_df = train_df.assign(
    average_word_length=train_df["OriginalTweet"].apply(get_avg_word_length)
)
test_df = test_df.assign(average_word_length=test_df["OriginalTweet"].apply(get_avg_word_length))

# whether all letters are uppercase or not (all_caps)
train_df = train_df.assign(
    all_caps=train_df["OriginalTweet"].apply(lambda x: 1 if x.isupper() else 0)
)
test_df = test_df.assign(
    all_caps=test_df["OriginalTweet"].apply(lambda x: 1 if x.isupper() else 0)
)

train_df = train_df.assign(has_emoji=train_df["OriginalTweet"].apply(has_emoji))
test_df = test_df.assign(has_emoji=test_df["OriginalTweet"].apply(has_emoji))

train_df.head()
UserName ScreenName Location TweetAt OriginalTweet Sentiment n_words vader_sentiment rel_char_len average_word_length all_caps has_emoji
1927 1928 46880 Seattle, WA 13-03-2020 While I don't like all of Amazon's choices, to... Positive 31 -0.1053 0.589286 5.640000 0 0
1068 1069 46021 NaN 13-03-2020 Me: shit buckets, it’s time to do the weekly s... Negative 52 -0.2500 0.932143 4.636364 0 0
803 804 45756 The Outer Limits 12-03-2020 @SecPompeo @realDonaldTrump You mean the plan ... Neutral 44 0.0000 0.910714 6.741935 0 0
2846 2847 47799 Flagstaff, AZ 15-03-2020 @lauvagrande People who are sick aren’t panic ... Extremely Negative 46 -0.8481 0.907143 5.023810 0 0
3768 3769 48721 Montreal, Canada 16-03-2020 Coronavirus Panic: Toilet Paper Is the “People... Negative 21 -0.5106 0.500000 9.846154 0 0
train_df.shape
(3038, 12)
(train_df['all_caps'] == 1).sum()
np.int64(0)
X_train = train_df.drop(columns=['Sentiment'])
numeric_features = ['vader_sentiment', 
                    'rel_char_len', 
                    'average_word_length']
passthrough_features = ['all_caps', 'has_emoji'] 
text_feature = 'OriginalTweet'
drop_features = ['UserName', 'ScreenName', 'Location', 'TweetAt']
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    ("passthrough", passthrough_features), 
    (CountVectorizer(stop_words='english'), text_feature),
    ("drop", drop_features)
)
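As an optional sanity check, we could confirm that the transformed feature matrix has the expected columns: three scaled numeric features, two passthrough features, and one column per word in the CountVectorizer vocabulary.

# Optional sanity check of the transformed feature matrix
Z = preprocessor.fit_transform(X_train)
Z.shape  # (3038, n_features); n_features depends on the vocabulary size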
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
results["LR (more feats)"] = mean_std_cross_val_scores(
    pipe, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T
fit_time score_time test_score train_score
dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) 0.280 (+/- 0.000)
logistic regression 0.306 (+/- 0.015) 0.008 (+/- 0.000) 0.414 (+/- 0.012) 0.999 (+/- 0.000)
LR (more feats) 0.296 (+/- 0.021) 0.010 (+/- 0.001) 0.690 (+/- 0.007) 0.998 (+/- 0.001)
pipe.fit(X_train, y_train)
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['vader_sentiment',
                                                   'rel_char_len',
                                                   'average_word_length']),
                                                 ('passthrough', 'passthrough',
                                                  ['all_caps', 'has_emoji']),
                                                 ('countvectorizer',
                                                  CountVectorizer(stop_words='english'),
                                                  'OriginalTweet'),
                                                 ('drop', 'drop',
                                                  ['UserName', 'ScreenName',
                                                   'Location', 'TweetAt'])])),
                ('logisticregression', LogisticRegression(max_iter=1000))])
cv_feats = pipe.named_steps['columntransformer'].named_transformers_['countvectorizer'].get_feature_names_out().tolist()
feat_names = numeric_features + passthrough_features + cv_feats
coefs = pipe.named_steps['logisticregression'].coef_[0]
df = pd.DataFrame(
    data={
        "features": feat_names,
        "coefficients": coefs,
    }
)
df.sort_values('coefficients')
features coefficients
0 vader_sentiment -6.167241
11331 won -1.384111
2551 coronapocalypse -0.817034
2214 closed -0.754165
8661 retail -0.729109
... ... ...
9862 stupid 1.157157
3299 don 1.162007
4879 hell 1.312696
3129 die 1.365420
7504 panic 1.539459

11664 rows × 2 columns
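Keep in mind that this is a multiclass logistic regression, so coef_[0] holds the coefficients for the first class in classes_ (alphabetically, 'Extremely Negative'), which is why a strongly positive vader_sentiment pushes predictions away from this class. A quick check:

lr = pipe.named_steps['logisticregression']
lr.classes_      # classes in alphabetical order; classes_[0] is the one tabulated above
lr.coef_.shape   # (n_classes, n_features)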

We get a substantial improvement with our engineered features: cross-validation accuracy goes from about 0.41 with bag-of-words alone to about 0.69 with the added features.
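Once we are satisfied with the cross-validation results, a natural last step is to score the fitted pipeline on the held-out test set, which already has the same engineered feature columns. A minimal sketch (the score will depend on the fitted model):

# Final check on the held-out test set
X_test = test_df.drop(columns=['Sentiment'])
y_test = test_df['Sentiment']
pipe.score(X_test, y_test)  # accuracy on unseen data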