Appendix A: Demo of feature engineering for text data#

Import#

We will be using the COVID-19 tweets dataset for this demo.

import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import numpy.random as npr
import pandas as pd
from sklearn.compose import (
    ColumnTransformer,
    TransformedTargetRegressor,
    make_column_transformer,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV
from sklearn.metrics import make_scorer, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVC

sys.path.append(os.path.join(os.path.abspath(".."), "code"))
DATA_DIR = os.path.join(os.path.abspath(".."), "data/")
df = pd.read_csv(DATA_DIR + 'Corona_NLP_test.csv')
df['Sentiment'].value_counts()
Sentiment
Negative              1041
Positive               947
Neutral                619
Extremely Positive     599
Extremely Negative     592
Name: count, dtype: int64
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
train_df
UserName ScreenName Location TweetAt OriginalTweet Sentiment
1927 1928 46880 Seattle, WA 13-03-2020 While I don't like all of Amazon's choices, to... Positive
1068 1069 46021 NaN 13-03-2020 Me: shit buckets, it’s time to do the weekly s... Negative
803 804 45756 The Outer Limits 12-03-2020 @SecPompeo @realDonaldTrump You mean the plan ... Neutral
2846 2847 47799 Flagstaff, AZ 15-03-2020 @lauvagrande People who are sick aren’t panic ... Extremely Negative
3768 3769 48721 Montreal, Canada 16-03-2020 Coronavirus Panic: Toilet Paper Is the “People... Negative
... ... ... ... ... ... ...
1122 1123 46075 NaN 13-03-2020 Photos of our local grocery store shelves—wher... Extremely Positive
1346 1347 46299 Toronto 13-03-2020 Just went to the the grocery store (Highland F... Positive
3454 3455 48407 Houston, TX 16-03-2020 Real talk though. Am I the only one spending h... Neutral
3437 3438 48390 Washington, DC 16-03-2020 The supermarket business is booming! #COVID2019 Neutral
3582 3583 48535 St James' Park, Newcastle 16-03-2020 Evening All Here s the story on the and the im... Positive

3038 rows × 6 columns

train_df.columns
Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',
       'Sentiment'],
      dtype='object')
train_df['Location'].value_counts()
Location
United States                     63
London, England                   37
Los Angeles, CA                   30
New York, NY                      29
Washington, DC                    29
                                  ..
Suburb of Chicago                  1
philippines                        1
Dont ask for freedom, take it.     1
Windsor Heights, IA                1
St James' Park, Newcastle          1
Name: count, Length: 1441, dtype: int64
X_train, y_train = train_df[['OriginalTweet', 'Location']], train_df['Sentiment']
X_test, y_test = test_df[['OriginalTweet', 'Location']], test_df['Sentiment']
y_train.value_counts()
Sentiment
Negative              852
Positive              743
Neutral               501
Extremely Negative    472
Extremely Positive    470
Name: count, dtype: int64
scoring_metrics = 'accuracy'
results = {}
def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Returns mean and std of cross validation

    Parameters
    ----------
    model :
        scikit-learn model
    X_train : numpy array or pandas DataFrame
        X in the training data
    y_train :
        y in the training data

    Returns
    ----------
        pandas Series with mean scores from cross_validation
    """

    scores = cross_validate(model, X_train, y_train, **kwargs)

    mean_scores = pd.DataFrame(scores).mean()
    std_scores = pd.DataFrame(scores).std()
    out_col = []

    for i in range(len(mean_scores)):
        out_col.append((f"%0.3f (+/- %0.3f)" % (mean_scores.iloc[i], std_scores.iloc[i])))

    return pd.Series(data=out_col, index=mean_scores.index)

Dummy classifier#

dummy = DummyClassifier()
results["dummy"] = mean_std_cross_val_scores(
    dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T
fit_time score_time test_score train_score
dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) 0.280 (+/- 0.000)
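The dummy score of about 0.280 is simply the proportion of the most frequent class ('Negative') in the training set, since DummyClassifier by default predicts the most frequent class. A quick sanity check:

# Proportion of the most frequent class in y_train (≈ 0.28, matching the dummy score above)
y_train.value_counts(normalize=True).max()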

Bag-of-words model#

from sklearn.feature_extraction.text import CountVectorizer
pipe = make_pipeline(CountVectorizer(stop_words='english'), 
                     LogisticRegression(max_iter=1000))
results["logistic regression"] = mean_std_cross_val_scores(
    pipe, X_train['OriginalTweet'], y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T
fit_time score_time test_score train_score
dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) 0.280 (+/- 0.000)
logistic regression 0.306 (+/- 0.015) 0.008 (+/- 0.000) 0.414 (+/- 0.012) 0.999 (+/- 0.000)
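Under the hood, CountVectorizer represents each tweet as a vector of word counts over the training vocabulary. Here is a minimal sketch on two made-up sentences (for illustration only) showing what the representation looks like:

from sklearn.feature_extraction.text import CountVectorizer

toy_corpus = [
    "The grocery store is out of toilet paper",
    "Panic buying toilet paper at the grocery store",
]
toy_vec = CountVectorizer(stop_words="english")
toy_X = toy_vec.fit_transform(toy_corpus)
print(toy_vec.get_feature_names_out())  # the learned vocabulary
print(toy_X.toarray())  # one row per sentence, one column per vocabulary word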

Is it possible to further improve the scores?#

  • How about adding new features based on our intuitions? Let’s extract our own features that might be useful for this prediction task. In other words, let’s carry out feature engineering.

  • The code below adds some very basic length-related and sentiment features. We will be using a popular library called nltk for this exercise. If you have successfully created the course conda environment on your machine, you should already have this package in the environment.

  • How do we extract interesting information from text?

  • We use pre-trained models!

  • A couple of popular libraries that include such pre-trained models:

  • nltk

conda install -c anaconda nltk 
  • spaCy

conda install -c conda-forge spacy

For emoji support:

pip install spacymoji
  • You also need to download the language model, which contains the pre-trained components. To do so, run the following in your course conda environment (or in a notebook cell, as shown below).

import spacy

# !python -m spacy download en_core_web_md
import nltk

nltk.download("punkt")
[nltk_data] Downloading package punkt to /Users/mathias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
nltk.download("vader_lexicon")
nltk.download("punkt")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mathias/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mathias/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
s = "CPSC 330 students are smart, sweet, and funny."
print(sid.polarity_scores(s))
{'neg': 0.0, 'neu': 0.368, 'pos': 0.632, 'compound': 0.8225}
s = "CPSC 330 students are tired because of all the hard work they have been doing."
print(sid.polarity_scores(s))
{'neg': 0.249, 'neu': 0.751, 'pos': 0.0, 'compound': -0.5106}

spaCy#

A useful package for text processing and feature extraction

  • Active development: https://github.com/explosion/spaCy

  • Interactive lessons by Ines Montani: https://course.spacy.io/en/

  • Good documentation, easy to use, and customizable.

To run the code below, you have to download the pretrained model in the course environment.

python -m spacy download en_core_web_md

import spacy

nlp = spacy.load("en_core_web_md")
sample_text = """Dolly Parton is a gift to us all. 
From writing all-time great songs like “Jolene” and “I Will Always Love You”, 
to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine, 
she’s given us so much. Now, Netflix bring us Dolly Parton’s Christmas on the Square, 
an original musical that stars Christine Baranski as a Scrooge-like landowner 
who threatens to evict an entire town on Christmas Eve to make room for a new mall. 
Directed and choreographed by the legendary Debbie Allen and counting Jennifer Lewis 
and Parton herself amongst its cast, Christmas on the Square seems like the perfect movie
to save Christmas 2020. 😻 👍🏿"""

# [Adapted from here.](https://thepopbreak.com/2020/11/22/dolly-partons-christmas-on-the-square-review-not-quite-a-christmas-miracle/)

spaCy extracts all of this information from the text with a single call.

doc = nlp(sample_text)

Let’s look at part-of-speech tags.

print([(token, token.pos_) for token in doc][:20])
[(Dolly, 'PROPN'), (Parton, 'PROPN'), (is, 'AUX'), (a, 'DET'), (gift, 'NOUN'), (to, 'ADP'), (us, 'PRON'), (all, 'PRON'), (., 'PUNCT'), (
, 'SPACE'), (From, 'ADP'), (writing, 'VERB'), (all, 'DET'), (-, 'PUNCT'), (time, 'NOUN'), (great, 'ADJ'), (songs, 'NOUN'), (like, 'ADP'), (“, 'PUNCT'), (Jolene, 'PROPN')]
  • Often we want to know who did what to whom.

  • Named entities give you this information.

  • What are named entities in the text?

from spacy import displacy

displacy.render(doc, style="ent")
Dolly Parton PERSON is a gift to us all.
From writing all-time great songs like “ Jolene PERSON ” and “ I Will Always Love You WORK_OF_ART ”,
to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine,
she’s given us so much. Now, Netflix ORG bring us Dolly Parton’s PERSON Christmas DATE on the Square,
an original musical that stars Christine Baranski PERSON as a Scrooge-like landowner
who threatens to evict an entire town on Christmas Eve DATE to make room for a new mall.
Directed and choreographed by the legendary Debbie Allen PERSON and counting Jennifer Lewis PERSON
and Parton PERSON herself amongst its cast, Christmas DATE on the Square seems like the perfect movie
to save Christmas 2020 DATE . 😻 👍🏿
print("Named entities:\n", [(ent.text, ent.label_) for ent in doc.ents])
print("\nORG means: ", spacy.explain("ORG"))
print("\nPERSON means: ", spacy.explain("PERSON"))
print("\nDATE means: ", spacy.explain("DATE"))
Named entities:
 [('Dolly Parton', 'PERSON'), ('Jolene', 'PERSON'), ('I Will Always Love You', 'WORK_OF_ART'), ('Netflix', 'ORG'), ('Dolly Parton’s', 'PERSON'), ('Christmas', 'DATE'), ('Christine Baranski', 'PERSON'), ('Christmas Eve', 'DATE'), ('Debbie Allen', 'PERSON'), ('Jennifer Lewis', 'PERSON'), ('Parton', 'PERSON'), ('Christmas', 'DATE'), ('Christmas 2020', 'DATE')]

ORG means:  Companies, agencies, institutions, etc.

PERSON means:  People, including fictional

DATE means:  Absolute or relative dates or periods
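Named entities can themselves be turned into features, for example by counting how many entities of each type appear in a document. A minimal sketch (the entity_type_counts helper below is our own, not part of spaCy):

from collections import Counter

def entity_type_counts(text):
    """Count how many named entities of each type spaCy finds in the text."""
    doc = nlp(text)
    return Counter(ent.label_ for ent in doc.ents)

entity_type_counts(sample_text)  # e.g. counts of PERSON, DATE, ...; exact numbers depend on the model version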

An example from a project#

Goal: Extract and visualize inter-corporate relationships from disclosed annual 10-K reports of public companies.

Source for the text below.

text = (
    "Heavy hitters, including Microsoft and Google, "
    "are competing for customers in cloud services with the likes of IBM and Salesforce."
)
doc = nlp(text)
displacy.render(doc, style="ent")
print("Named entities:\n", [(ent.text, ent.label_) for ent in doc.ents])
Heavy hitters, including Microsoft ORG and Google ORG , are competing for customers in cloud services with the likes of IBM ORG and Salesforce PERSON .
Named entities:
 [('Microsoft', 'ORG'), ('Google', 'ORG'), ('IBM', 'ORG'), ('Salesforce', 'PERSON')]

If you want emoji identification support, install spacymoji in the course environment.

pip install spacymoji

If Python still complains about a missing module after installing spacymoji, you likely do not have pip installed in your conda environment. Activate your course conda environment, install pip there, and then install spacymoji using that environment's pip.

conda install pip
YOUR_MINICONDA_PATH/miniconda3/envs/cpsc330/bin/pip install spacymoji
from spacymoji import Emoji

nlp.add_pipe("emoji", first=True);

Does the text have any emojis? If yes, extract the description.

doc = nlp(sample_text)
doc._.emoji
[('😻', 138, 'smiling cat with heart-eyes'),
 ('👍🏿', 139, 'thumbs up dark skin tone')]

Simple feature engineering for our problem.#

import en_core_web_md
import spacy

nlp = en_core_web_md.load()
from spacymoji import Emoji

nlp.add_pipe("emoji", first=True)

def get_relative_length(text, TWITTER_ALLOWED_CHARS=280.0):
    """
    Returns the relative length of text.

    Parameters:
    ------
    text: (str)
    the input text

    Keyword arguments:
    ------
    TWITTER_ALLOWED_CHARS: (float)
    the denominator for finding relative length

    Returns:
    -------
    relative length of text: (float)

    """
    return len(text) / TWITTER_ALLOWED_CHARS


def get_length_in_words(text):
    """
    Returns the length of the text in words.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    length of tokenized text: (int)

    """
    return len(nltk.word_tokenize(text))


def get_sentiment(text):
    """
    Returns the compound score representing the sentiment, ranging from -1 (most extreme negative) to +1 (most extreme positive).
    The compound score is a normalized score calculated by summing the valence scores of each word in the lexicon.

    Parameters:
    ------
    text: (str)
    the input text

    Returns:
    -------
    compound sentiment score of the text: (float)
    """
    scores = sid.polarity_scores(text)
    return scores["compound"]

def get_avg_word_length(text):
    """
    Returns the average word length of the given text.

    Parameters:
    text -- (str)
    """
    words = text.split()
    return sum(len(word) for word in words) / len(words)


def has_emoji(text):
    """
    Returns 1 if the given text contains an emoji, and 0 otherwise.

    Parameters:
    text -- (str)
    """
    doc = nlp(text)
    return 1 if doc._.has_emoji else 0
import nltk
nltk.download('punkt_tab')
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/mathias/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
True
train_df = train_df.assign(n_words=train_df["OriginalTweet"].apply(get_length_in_words))
train_df = train_df.assign(vader_sentiment=train_df["OriginalTweet"].apply(get_sentiment))
train_df = train_df.assign(rel_char_len=train_df["OriginalTweet"].apply(get_relative_length))

test_df = test_df.assign(n_words=test_df["OriginalTweet"].apply(get_length_in_words))
test_df = test_df.assign(vader_sentiment=test_df["OriginalTweet"].apply(get_sentiment))
test_df = test_df.assign(rel_char_len=test_df["OriginalTweet"].apply(get_relative_length))


train_df = train_df.assign(
    average_word_length=train_df["OriginalTweet"].apply(get_avg_word_length)
)
test_df = test_df.assign(average_word_length=test_df["OriginalTweet"].apply(get_avg_word_length))

# whether all letters are uppercase or not (all_caps)
train_df = train_df.assign(
    all_caps=train_df["OriginalTweet"].apply(lambda x: 1 if x.isupper() else 0)
)
test_df = test_df.assign(
    all_caps=test_df["OriginalTweet"].apply(lambda x: 1 if x.isupper() else 0)
)

train_df = train_df.assign(has_emoji=train_df["OriginalTweet"].apply(has_emoji))
test_df = test_df.assign(has_emoji=test_df["OriginalTweet"].apply(has_emoji))

train_df.head()
UserName ScreenName Location TweetAt OriginalTweet Sentiment n_words vader_sentiment rel_char_len average_word_length all_caps has_emoji
1927 1928 46880 Seattle, WA 13-03-2020 While I don't like all of Amazon's choices, to... Positive 31 -0.1053 0.589286 5.640000 0 0
1068 1069 46021 NaN 13-03-2020 Me: shit buckets, it’s time to do the weekly s... Negative 52 -0.2500 0.932143 4.636364 0 0
803 804 45756 The Outer Limits 12-03-2020 @SecPompeo @realDonaldTrump You mean the plan ... Neutral 44 0.0000 0.910714 6.741935 0 0
2846 2847 47799 Flagstaff, AZ 15-03-2020 @lauvagrande People who are sick aren’t panic ... Extremely Negative 46 -0.8481 0.907143 5.023810 0 0
3768 3769 48721 Montreal, Canada 16-03-2020 Coronavirus Panic: Toilet Paper Is the “People... Negative 21 -0.5106 0.500000 9.846154 0 0
train_df.shape
(3038, 12)
(train_df['all_caps'] == 1).sum()
np.int64(0)
X_train = train_df.drop(columns=['Sentiment'])
numeric_features = ['vader_sentiment', 
                    'rel_char_len', 
                    'average_word_length']
passthrough_features = ['all_caps', 'has_emoji'] 
text_feature = 'OriginalTweet'
drop_features = ['UserName', 'ScreenName', 'Location', 'TweetAt']
preprocessor = make_column_transformer(
    (StandardScaler(), numeric_features),
    ("passthrough", passthrough_features), 
    (CountVectorizer(stop_words='english'), text_feature),
    ("drop", drop_features)
)
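As an optional sanity check, we could confirm that the transformed feature matrix has the expected columns: three scaled numeric features, two passthrough features, and one column per word in the CountVectorizer vocabulary.

# Optional sanity check of the transformed feature matrix
Z = preprocessor.fit_transform(X_train)
Z.shape  # (3038, n_features); n_features depends on the vocabulary size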
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
results["LR (more feats)"] = mean_std_cross_val_scores(
    pipe, X_train, y_train, return_train_score=True, scoring=scoring_metrics
)
pd.DataFrame(results).T
fit_time score_time test_score train_score
dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) 0.280 (+/- 0.000)
logistic regression 0.306 (+/- 0.015) 0.008 (+/- 0.000) 0.414 (+/- 0.012) 0.999 (+/- 0.000)
LR (more feats) 0.296 (+/- 0.021) 0.010 (+/- 0.001) 0.690 (+/- 0.007) 0.998 (+/- 0.001)
pipe.fit(X_train, y_train)
Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('standardscaler',
                                                  StandardScaler(),
                                                  ['vader_sentiment',
                                                   'rel_char_len',
                                                   'average_word_length']),
                                                 ('passthrough', 'passthrough',
                                                  ['all_caps', 'has_emoji']),
                                                 ('countvectorizer',
                                                  CountVectorizer(stop_words='english'),
                                                  'OriginalTweet'),
                                                 ('drop', 'drop',
                                                  ['UserName', 'ScreenName',
                                                   'Location', 'TweetAt'])])),
                ('logisticregression', LogisticRegression(max_iter=1000))])
cv_feats = pipe.named_steps['columntransformer'].named_transformers_['countvectorizer'].get_feature_names_out().tolist()
feat_names = numeric_features + passthrough_features + cv_feats
coefs = pipe.named_steps['logisticregression'].coef_[0]
df = pd.DataFrame(
    data={
        "features": feat_names,
        "coefficients": coefs,
    }
)
df.sort_values('coefficients')
features coefficients
0 vader_sentiment -6.167241
11331 won -1.384111
2551 coronapocalypse -0.817034
2214 closed -0.754165
8661 retail -0.729109
... ... ...
9862 stupid 1.157157
3299 don 1.162007
4879 hell 1.312696
3129 die 1.365420
7504 panic 1.539459

11664 rows × 2 columns
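Keep in mind that this is a multiclass logistic regression, so coef_[0] holds the coefficients for the first class in classes_ (alphabetically, 'Extremely Negative'), which is why a strongly positive vader_sentiment pushes predictions away from this class. A quick check:

lr = pipe.named_steps['logisticregression']
lr.classes_      # classes in alphabetical order; classes_[0] is the one tabulated above
lr.coef_.shape   # (n_classes, n_features)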

We get a substantial improvement with our engineered features: cross-validation accuracy goes from about 0.41 with bag-of-words alone to about 0.69 with the added features.
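Once we are satisfied with the cross-validation results, a natural last step is to score the fitted pipeline on the held-out test set, which already has the same engineered feature columns. A minimal sketch (the score will depend on the fitted model):

# Final check on the held-out test set
X_test = test_df.drop(columns=['Sentiment'])
y_test = test_df['Sentiment']
pipe.score(X_test, y_test)  # accuracy on unseen data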