Appendix C: Representing documents using embeddings#
Imports#
## Imports
import os
import re
import string
import sys
import time
sys.path.append(os.path.join(os.path.abspath("."), "code"))
import IPython
import numpy as np
import numpy.random as npr
import pandas as pd
from comat import CooccurrenceMatrix
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from preprocessing import MyPreprocessor
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
(Optional) Representing documents using word embeddings#
Assuming that we have reasonable representations of words, how do we represent the meaning of paragraphs or documents?
Two simple approaches:
Averaging embeddings
Concatenating embeddings
Averaging embeddings#
Example: "All empty promises"
\((embedding(\text{all}) + embedding(\text{empty}) + embedding(\text{promises}))/3\)
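Here is a minimal sketch of the idea with toy NumPy vectors (the numbers below are made up for illustration; real word embeddings would come from a pre-trained model and typically have ~300 dimensions):
import numpy as np

# Toy 4-dimensional "word embeddings" (made-up numbers for illustration).
emb = {
    "all":      np.array([0.1, -0.3, 0.8, 0.0]),
    "empty":    np.array([0.5,  0.2, -0.1, 0.7]),
    "promises": np.array([-0.2, 0.4, 0.3, 0.1]),
}

# The document vector is the element-wise average of the word vectors.
doc_vector = np.mean([emb[w] for w in ["all", "empty", "promises"]], axis=0)
print(doc_vector)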
Average embeddings with spaCy#
We can do this conveniently with spaCy.
We need the en_core_web_md model to access word vectors. You can download the model from the command line, in your course conda environment, as follows.
conda activate cpsc330
python -m spacy download en_core_web_md
We can access word vectors for individual words in spaCy as follows.
import spacy
nlp = spacy.load("en_core_web_md")
doc = nlp("pineapple") # extract all interesting information about the document
doc.vector
array([ 6.5486e-01, -2.2584e+00, 6.2793e-02, 1.8801e+00, 2.0700e-01,
-3.3299e+00, -9.6833e-01, 1.5131e+00, -3.7041e+00, -7.7749e-02,
1.5029e+00, -1.7764e+00, 1.7324e+00, 1.6241e+00, 2.6455e-01,
-3.0840e+00, 7.5715e-01, -1.2903e+00, 2.3571e+00, -3.8793e+00,
7.7635e-01, 3.9372e+00, 3.9900e-01, -6.8284e-01, -1.4018e+00,
-2.1673e+00, -1.9244e+00, 1.0629e+00, 3.3378e-01, -8.3864e-01,
-2.5646e-01, -1.7198e+00, -5.4607e-02, -1.4614e+00, 1.3352e+00,
-1.8177e+00, 1.7254e+00, 4.9624e-01, 1.1314e+00, -1.5295e+00,
-8.8629e-01, -2.7562e-01, 7.1799e-01, 1.5554e-01, 3.4230e+00,
2.7167e+00, 1.1793e+00, 2.0961e-01, 3.3121e-01, 1.2322e+00,
1.4375e+00, -4.2099e-01, 6.2814e-01, -1.9051e+00, 3.0593e-02,
6.1895e-01, -3.1495e-01, -2.0444e-04, 2.2073e+00, 3.8856e-01,
1.6554e+00, 1.1932e+00, 2.6678e+00, -5.5454e-01, -1.2078e+00,
1.5709e-01, -1.1324e+00, -2.0163e+00, 1.4567e+00, -2.4244e-01,
-1.9425e+00, 8.3090e-01, 1.7428e-01, 9.1676e-01, 8.8830e-03,
2.4857e-01, -1.2018e+00, -2.3073e+00, 2.2553e+00, -1.5853e+00,
-5.8452e-01, 9.2523e-01, -2.7129e-01, -7.6348e-01, 1.3506e+00,
1.7429e+00, 3.0469e+00, 1.9319e+00, -2.6099e+00, 1.8484e+00,
1.3795e+00, 2.0948e+00, 1.1545e+00, -2.9681e+00, -5.0455e-02,
-5.3864e-01, 2.4820e+00, -1.1131e+00, -2.1827e-01, -2.7559e+00,
-4.4502e-01, -2.8897e+00, 1.7430e+00, -1.5742e+00, 5.7160e-02,
2.4764e+00, -2.5828e+00, 9.3866e-01, -1.3150e+00, 2.3863e+00,
6.1536e-01, 1.7656e-01, 2.0245e+00, 1.6807e-01, -1.2850e+00,
1.6425e-01, 1.7782e+00, -3.2221e+00, 6.1392e-01, 1.3269e+00,
-3.1582e-02, 6.6331e-01, -6.8109e-01, 5.0985e-01, -4.2942e-01,
-1.6438e-01, 7.9306e-01, -3.0776e+00, 1.8022e+00, -4.5356e-01,
-1.6405e+00, 8.1761e-01, 1.4960e+00, -6.2266e-01, 8.5264e-01,
-5.0226e-01, -1.5735e+00, -4.5090e+00, -5.0587e-01, -1.5471e+00,
-5.3910e-01, -6.6574e-01, 7.6376e-01, -1.4926e+00, -7.8819e-01,
-9.9256e-01, 1.1512e+00, 5.2091e-01, 1.6460e-01, -2.6747e+00,
-1.7082e+00, 1.5789e+00, -2.8982e-01, -1.2842e+00, -1.1286e+00,
7.6392e-01, 3.2199e+00, 7.5850e-01, 1.3628e+00, -1.3231e+00,
2.2350e-02, -2.5602e+00, 6.7751e-01, 4.0511e-01, 1.8997e+00,
-1.1051e+00, -1.3014e-01, 7.2024e-01, 6.2354e-02, 1.1913e-01,
-1.1978e+00, -1.5625e+00, -2.5975e-01, 2.5911e+00, -3.2413e+00,
-3.8988e-01, -4.0542e-01, -1.8894e+00, 3.4278e+00, -3.3625e-01,
-2.0979e+00, 1.3275e+00, -2.0514e+00, 2.4583e-01, -7.3326e-01,
-2.3684e+00, 2.8493e+00, -5.2075e-01, 2.2708e-01, -6.8701e-01,
-7.0855e-01, -7.5334e-01, 7.3050e-02, 2.2246e+00, -2.6824e-01,
-2.8289e-01, -1.8230e+00, 2.2047e+00, -2.4848e-01, -2.3042e-02,
1.0358e+00, -2.7074e-01, -5.6816e-02, -9.1017e-01, 1.2943e-01,
-1.4274e+00, -3.6128e-01, 7.3127e-01, 2.0264e+00, 7.2928e-01,
1.7298e+00, 1.1075e+00, -7.0250e-01, 1.6928e+00, 2.0074e+00,
-7.5464e-01, 1.6378e+00, 3.5970e-01, -2.2128e-01, -1.7607e-01,
1.8260e+00, -2.5962e-01, -1.4320e+00, 7.8332e-01, 2.1438e+00,
-2.4723e+00, -1.4913e-01, 6.2585e-01, 6.6819e-01, 2.3947e+00,
-2.7173e+00, 2.4134e-03, -8.6530e-01, -9.7728e-01, -2.9815e+00,
1.6895e+00, -7.1146e-01, 3.2025e+00, -9.4129e-01, -1.9695e+00,
7.7711e-01, -3.2278e-01, -1.3727e+00, 2.9276e+00, -1.5440e-01,
1.7169e+00, 5.5736e-01, 1.4620e-01, -1.1244e+00, -2.4633e+00,
-2.2685e+00, 1.2459e+00, -2.0362e+00, -4.8331e-01, -6.3194e-01,
-2.4082e+00, -9.0132e-01, 3.0541e+00, -2.2632e+00, -3.7800e-01,
-3.1647e-01, 1.0785e+00, -3.0444e-01, 1.2112e+00, -1.3496e+00,
1.0599e+00, 4.2607e-01, 4.0194e-01, -2.8586e+00, 1.0107e+00,
1.5924e+00, -5.1770e-01, 1.3246e+00, 3.2268e-01, -1.3978e-01,
-2.1841e+00, 1.6548e+00, 1.3903e+00, 6.3376e-01, -4.7083e-01,
6.8377e-01, -1.3031e+00, -1.3292e-01, -1.1567e+00, 5.3419e-01,
-1.3412e+00, -1.5887e+00, -9.4468e-01, -2.4031e+00, 3.1785e+00,
1.1524e+00, -1.1699e+00, 9.8752e-01, -1.0660e+00, -2.1852e+00,
-3.1228e-01, 3.0012e+00, -1.2234e+00, 5.7454e-01, -2.1885e-01],
dtype=float32)
nlp("empty").vector[0:10]
array([ 0.010289, 4.9203 , -0.48081 , 3.5738 , -2.2516 , 2.1697 ,
-1.0116 , 2.4216 , -3.7343 , 3.3025 ], dtype=float32)
We can get average embeddings for a sentence or a document in spaCy
as follows:
s = "All empty promises"
doc = nlp(s)
avg_sent_emb = doc.vector
print(avg_sent_emb.shape)
print("Vector for: {}\n{}".format((s), (avg_sent_emb[0:10])))
(300,)
Vector for: All empty promises
[-0.459937 1.9785299 1.0319 1.5123 1.4806334 2.73183
1.204 1.1724668 -3.5227966 -0.05656664]
Similarity between documents#
We can also get the similarity between documents as follows.
Note that this is based on the average embeddings of each document.
doc1 = nlp("Deep learning is very popular these days.")
doc2 = nlp("Machine learning is dominated by neural networks.")
doc3 = nlp("A home-made fresh bread with butter and cheese.")
# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))
print(doc2, "<->", doc3, doc2.similarity(doc3))
Deep learning is very popular these days. <-> Machine learning is dominated by neural networks. 0.699868820717508
Machine learning is dominated by neural networks. <-> A home-made fresh bread with butter and cheese. 0.5098293421139041
Do these scores make sense?
The documents share very few words, but we are still able to identify that doc1 and doc2 are more similar than doc2 and doc3.
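Under the hood, spaCy's similarity() is the cosine similarity between the (averaged) document vectors. Here is a quick sanity check with NumPy (a sketch, not part of the pipeline above):
import numpy as np

def cosine_similarity(u, v):
    # Cosine similarity between two vectors.
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

# Should match doc1.similarity(doc2) above, up to floating-point precision.
print(cosine_similarity(doc1.vector, doc2.vector))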
You can use such average embedding representation in text classification tasks.
Airline sentiment analysis using average embedding representation#
Let’s try average embedding representation for airline sentiment analysis.
You can download the data here.
df = pd.read_csv("data/Airline-Sentiment-2-w-AA.csv", encoding="ISO-8859-1")
from sklearn.model_selection import cross_validate, train_test_split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)
X_train, y_train = train_df["text"], train_df["airline_sentiment"]
X_test, y_test = test_df["text"], test_df["airline_sentiment"]
train_df.head()
_unit_id | _golden | _unit_state | _trusted_judgments | _last_judgment_at | airline_sentiment | airline_sentiment:confidence | negativereason | negativereason:confidence | airline | airline_sentiment_gold | name | negativereason_gold | retweet_count | text | tweet_coord | tweet_created | tweet_id | tweet_location | user_timezone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5789 | 681455792 | False | finalized | 3 | 2/25/15 4:21 | negative | 1.0 | Can't Tell | 0.6667 | Southwest | NaN | mrssuperdimmock | NaN | 0 | @SouthwestAir link doesn't work | NaN | 2/19/15 18:53 | 5.686040e+17 | Lake Arrowhead, CA | Pacific Time (US & Canada) |
8918 | 681459957 | False | finalized | 3 | 2/25/15 9:45 | neutral | 1.0 | NaN | NaN | Delta | NaN | labeles | NaN | 0 | @JetBlue okayyyy. But I had huge irons on way ... | NaN | 2/17/15 10:18 | 5.677500e+17 | NaN | NaN |
11688 | 681462990 | False | finalized | 3 | 2/25/15 9:53 | negative | 1.0 | Customer Service Issue | 0.6727 | US Airways | NaN | DropMeAnywhere | NaN | 0 | @USAirways They're all reservations numbers an... | [0.0, 0.0] | 2/17/15 14:50 | 5.678190e+17 | Here, There and Everywhere | Arizona |
413 | 681448905 | False | finalized | 3 | 2/25/15 10:10 | neutral | 1.0 | NaN | NaN | Virgin America | NaN | jsamaudio | NaN | 0 | @VirginAmerica no A's channel this year? | NaN | 2/18/15 12:25 | 5.681440e+17 | St. Francis (Calif.) | Pacific Time (US & Canada) |
4135 | 681454122 | False | finalized | 3 | 2/25/15 10:08 | negative | 1.0 | Bad Flight | 0.3544 | United | NaN | CajunSQL | NaN | 0 | @united missed it. Incoming on time, then Sat... | NaN | 2/17/15 14:20 | 5.678110e+17 | Baton Rouge, LA | NaN |
Bag-of-words representation for sentiment analysis#
pipe = make_pipeline(
CountVectorizer(stop_words="english"), LogisticRegression(max_iter=1000)
)
pipe.named_steps["countvectorizer"].fit(X_train)
X_train_transformed = pipe.named_steps["countvectorizer"].transform(X_train)
print("Data matrix shape:", X_train_transformed.shape)
pipe.fit(X_train, y_train);
Data matrix shape: (11712, 13064)
print("Train accuracy {:.2f}".format(pipe.score(X_train, y_train)))
print("Test accuracy {:.2f}".format(pipe.score(X_test, y_test)))
Train accuracy 0.94
Test accuracy 0.80
Sentiment analysis with average embedding representation#
Let’s see how we can get word vectors using spaCy and create an average embedding representation for each example.
X_train_embeddings = pd.DataFrame([text.vector for text in nlp.pipe(X_train)])
X_test_embeddings = pd.DataFrame([text.vector for text in nlp.pipe(X_test)])
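Computing embeddings for all ~14,000 tweets can be slow because the full spaCy pipeline (tagger, parser, NER) runs on every document. Since we only need the vectors, a sketch like the following, which disables components we don't need (the same trick we use later in this appendix), should be considerably faster:
# Reload the model without the parser and NER components; doc.vector only
# uses the static word vectors, so the embeddings should be unchanged.
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])
X_train_embeddings = pd.DataFrame([doc.vector for doc in nlp.pipe(X_train)])
X_test_embeddings = pd.DataFrame([doc.vector for doc in nlp.pipe(X_test)])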
We have reduced dimensionality from 13,064 to 300!
X_train_embeddings.shape
(11712, 300)
X_train_embeddings.head()
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.259942 | 4.856640 | -2.677500 | -1.875390 | 0.459240 | -0.304300 | 3.273020 | 2.008458 | -4.818360 | 1.002320 | ... | 1.555840 | 1.274170 | 0.716420 | -1.296220 | -1.403746 | 0.430054 | 1.107044 | 0.224080 | -0.903256 | -0.295254 |
1 | -0.563826 | 0.869816 | -2.462877 | 0.057751 | 0.892089 | 0.566698 | -0.283121 | 3.594112 | -1.578107 | 1.037976 | ... | 0.270313 | -0.948990 | 2.653996 | -1.631425 | -1.850329 | 0.432467 | 0.979464 | 1.015146 | -2.604236 | -0.188137 |
2 | -0.707503 | 0.908782 | -3.248327 | -0.667797 | 2.590958 | 2.246804 | -1.095754 | 3.953429 | -1.524224 | 0.593270 | ... | 0.948402 | -1.760466 | 1.369070 | -2.204359 | -1.357214 | 0.389199 | 0.205336 | 0.100318 | -2.950879 | 1.399726 |
3 | -0.977736 | 4.676425 | -0.224362 | -1.011286 | 3.981441 | -1.132660 | 0.988456 | 2.997113 | 0.553975 | -0.539687 | ... | -0.229152 | 1.099266 | 1.652113 | 0.036738 | 0.315839 | 1.429018 | 0.557437 | -0.333650 | -0.281916 | -0.587222 |
4 | -0.725984 | 0.178514 | -1.163662 | 0.597000 | 4.621603 | -1.166608 | 1.256955 | 3.940082 | 0.530063 | -0.816050 | ... | -0.398607 | -0.219403 | 0.925384 | -0.751906 | -1.969212 | 1.381326 | 2.146303 | 0.195811 | -2.914482 | 0.297692 |
5 rows × 300 columns
lgr = LogisticRegression(max_iter=2000)
lgr.fit(X_train_embeddings, y_train)
print("Train accuracy {:.2f}".format(lgr.score(X_train_embeddings, y_train)))
print("Test accuracy {:.2f}".format(lgr.score(X_test_embeddings, y_test)))
Train accuracy 0.80
Test accuracy 0.79
Sentiment classification using average embeddings#
What are the train and test accuracies with average word embedding representation?
The test accuracy is similar, but with less overfitting.
Note that we are using transfer learning here.
The embeddings are trained on a completely different corpus.
(Optional) Sentiment classification using advanced sentence representations#
Since representing documents is so essential for text classification tasks, there are more advanced methods for document representation.
In homework 6, you also explore sentence embedding representation.
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("paraphrase-distilroberta-base-v1")
emb_sents = embedder.encode("all empty promises")
emb_sents.shape
(768,)
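As with the spaCy document vectors, we can compare sentence embeddings with cosine similarity. A small sketch (using the same embedder as above and reusing the example sentences from earlier):
import numpy as np

emb1 = embedder.encode("Deep learning is very popular these days.")
emb2 = embedder.encode("Machine learning is dominated by neural networks.")

# Cosine similarity between the two sentence embeddings.
print(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))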
emb_train = embedder.encode(train_df["text"].tolist())
emb_train_df = pd.DataFrame(emb_train, index=train_df.index)
emb_train_df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5789 | -0.120494 | 0.250263 | -0.022795 | -0.116368 | 0.078650 | 0.037357 | -0.251341 | 0.321429 | -0.143984 | -0.123486 | ... | 0.199150 | -0.150143 | 0.167078 | -0.407671 | -0.066161 | 0.049514 | 0.019384 | -0.357602 | 0.125996 | 0.381074 |
8918 | -0.182954 | 0.118282 | 0.066341 | -0.136098 | 0.094947 | -0.121303 | 0.069233 | -0.097500 | 0.025740 | -0.367981 | ... | 0.113612 | 0.114662 | 0.049926 | 0.256736 | -0.118687 | -0.190720 | 0.011985 | -0.141883 | -0.230142 | 0.024899 |
11688 | -0.032988 | 0.630251 | -0.079516 | 0.148981 | 0.194708 | -0.226263 | -0.043630 | 0.217398 | -0.010715 | 0.069644 | ... | 0.676791 | 0.244484 | 0.051042 | 0.064099 | -0.146945 | 0.090878 | -0.090060 | 0.077211 | -0.209226 | 0.308773 |
413 | -0.119258 | 0.172168 | 0.098697 | 0.319858 | 0.415475 | 0.248359 | -0.025923 | 0.385350 | 0.066414 | -0.334289 | ... | -0.128482 | -0.232446 | -0.077805 | 0.181329 | 0.123244 | -0.143693 | 0.660457 | -0.048714 | 0.204774 | 0.163497 |
4135 | 0.094240 | 0.360193 | 0.213747 | 0.363690 | 0.275521 | 0.134936 | -0.276319 | 0.009336 | -0.021523 | -0.258992 | ... | 0.474885 | 0.242125 | 0.294533 | 0.279013 | 0.037831 | 0.089761 | -0.548748 | -0.049258 | 0.154525 | 0.141268 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5218 | -0.204408 | -0.145289 | -0.064201 | 0.213571 | -0.140225 | 0.338556 | -0.148578 | 0.224515 | -0.042963 | 0.075930 | ... | -0.161948 | 0.040582 | 0.003971 | -0.152549 | -0.582907 | -0.126526 | 0.060502 | -0.111495 | -0.097493 | 0.199321 |
12252 | 0.108408 | 0.438293 | 0.216812 | -0.349289 | 0.422689 | 0.377760 | 0.045198 | -0.034095 | 0.427570 | -0.328272 | ... | 0.257849 | -0.032363 | -0.275004 | 0.080452 | -0.078975 | -0.049972 | -0.009761 | -0.314754 | -0.020773 | 0.268777 |
1346 | 0.068411 | 0.017591 | 0.236154 | 0.221446 | -0.103568 | 0.055510 | 0.062910 | 0.067424 | -0.003504 | -0.157757 | ... | 0.007711 | 0.323297 | 0.334637 | 0.367041 | -0.068821 | 0.063667 | -0.329990 | 0.232330 | -0.184768 | -0.000682 |
11646 | -0.091488 | -0.155708 | 0.032391 | 0.018314 | 0.524997 | 0.563933 | -0.080985 | 0.097982 | -0.535285 | -0.377194 | ... | 0.428013 | -0.144572 | 0.045297 | -0.107935 | -0.135673 | -0.290019 | -0.137200 | -0.503395 | -0.042567 | -0.282592 |
3582 | 0.185626 | 0.092904 | 0.097085 | -0.174650 | -0.193584 | 0.047294 | 0.098216 | 0.332670 | 0.163098 | -0.135101 | ... | 0.078530 | -0.030177 | 0.391598 | 0.073520 | -0.454037 | -0.244358 | -0.790682 | -0.607010 | -0.255162 | 0.029779 |
11712 rows × 768 columns
emb_test = embedder.encode(test_df["text"].tolist())
emb_test_df = pd.DataFrame(emb_test, index=test_df.index)
emb_test_df
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 758 | 759 | 760 | 761 | 762 | 763 | 764 | 765 | 766 | 767 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1671 | -0.002864 | 0.217326 | 0.124349 | -0.082548 | 0.709688 | -0.582441 | 0.257897 | 0.169356 | 0.248880 | -0.266686 | ... | 0.501767 | 0.095387 | 0.340173 | 0.087452 | -0.368359 | 0.276195 | 0.238676 | -0.219546 | 0.066603 | 0.256149 |
10951 | -0.141048 | 0.137934 | 0.131319 | 0.194773 | 0.868204 | 0.078791 | -0.131656 | 0.036244 | -0.215749 | -0.291946 | ... | -0.056256 | -0.056041 | 0.147341 | 0.189665 | -0.357366 | 0.061799 | -0.161923 | -0.278955 | -0.173722 | 0.065324 |
5382 | -0.252943 | 0.527507 | -0.065608 | 0.013467 | 0.207989 | 0.003881 | -0.066281 | 0.253166 | 0.021039 | 0.290957 | ... | 0.180686 | -0.042605 | -0.173794 | -0.079128 | -0.169160 | 0.001316 | -0.142593 | -0.070816 | -0.208826 | 0.400737 |
3954 | 0.054319 | 0.096738 | 0.113037 | 0.032039 | 0.493064 | -0.641102 | 0.078760 | 0.402187 | 0.189743 | -0.089538 | ... | 0.123879 | -0.285019 | -0.297771 | 0.557171 | 0.076169 | -0.029826 | -0.076095 | 0.225454 | 0.002135 | 0.235430 |
11193 | -0.065858 | 0.223270 | 0.507333 | 0.266193 | 0.104696 | -0.219555 | 0.146247 | 0.315649 | -0.126193 | -0.435461 | ... | 0.163994 | 0.207813 | -0.001871 | 0.109391 | -0.166778 | -0.249199 | -0.525419 | -0.413066 | 0.119939 | 0.064297 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
5861 | 0.077512 | 0.322276 | 0.026697 | -0.111393 | 0.174207 | 0.235201 | 0.053888 | 0.244942 | 0.181625 | -0.226870 | ... | 0.149843 | 0.311338 | 0.045975 | -0.572319 | -0.068256 | 0.217745 | -0.056509 | -0.355174 | -0.028610 | 0.090676 |
3627 | -0.173311 | -0.023604 | 0.190388 | -0.136543 | -0.360269 | -0.444686 | 0.056311 | 0.291941 | -0.399719 | -0.167930 | ... | 0.042209 | -0.161905 | -0.040535 | -0.050515 | -0.252020 | -0.133980 | 0.155001 | -0.154482 | -0.060201 | -0.126556 |
12559 | -0.124635 | -0.101799 | 0.129061 | 0.636907 | 0.681090 | 0.399300 | -0.078321 | 0.221824 | -0.277218 | -0.178589 | ... | 0.022364 | -0.109275 | -0.073540 | -0.153336 | -0.123705 | -0.238896 | 0.296446 | -0.116798 | 0.115076 | -0.345925 |
8123 | 0.063508 | 0.332506 | 0.119605 | -0.001363 | -0.161802 | -0.082302 | -0.025883 | 0.048027 | 0.126974 | -0.159802 | ... | 0.002221 | -0.093885 | 0.430285 | -0.088561 | 0.321488 | 0.447437 | 0.292395 | -0.188566 | -0.272767 | 0.126173 |
210 | 0.015537 | 0.425568 | 0.350672 | 0.113120 | -0.128615 | 0.098112 | 0.222081 | 0.101654 | 0.224073 | -0.341074 | ... | 0.100983 | -0.008055 | 0.202025 | 0.029846 | -0.019182 | 0.107064 | 0.002301 | 0.038213 | -0.139270 | -0.007586 |
2928 rows × 768 columns
lgr = LogisticRegression(max_iter=1000)
lgr.fit(emb_train, y_train)
print("Train accuracy {:.2f}".format(lgr.score(emb_train, y_train)))
print("Test accuracy {:.2f}".format(lgr.score(emb_test, y_test)))
Train accuracy 0.87
Test accuracy 0.83
Some improvement over bag of words and average embedding representations!
But much slower …
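To get a rough sense of the speed difference, you could time the two encoders on a small sample (a sketch; the exact numbers depend on your hardware and whether a GPU is available):
import time

sample = X_test.tolist()[:500]

start = time.time()
_ = [doc.vector for doc in nlp.pipe(sample)]
print("spaCy average embeddings: {:.1f}s".format(time.time() - start))

start = time.time()
_ = embedder.encode(sample)
print("sentence-transformers: {:.1f}s".format(time.time() - start))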
(Optional) Training LDA with gensim#
Previously we created an LDA model with sklearn. If you want more flexibility, you can use Gensim's LdaModel.
To train an LDA model with gensim, you need:
A document-term matrix
A dictionary (vocabulary)
The number of topics (\(K\)): num_topics
The number of passes: passes
Gensim's corpora.Dictionary#
Let's first create a dictionary using corpora.Dictionary.
import wikipedia
queries = [
"Artificial Intelligence",
"unsupervised learning",
"Supreme Court of Canada",
"Peace, Order, and Good Government",
"Canadian constitutional law",
"ice hockey",
]
wiki_dict = {"wiki query": [], "text": []}
for i in range(len(queries)):
wiki_dict["text"].append(wikipedia.page(queries[i]).content)
wiki_dict["wiki query"].append(queries[i])
wiki_df = pd.DataFrame(wiki_dict)
wiki_df
wiki query | text | |
---|---|---|
0 | Artificial Intelligence | Artificial intelligence (AI) is the intelligen... |
1 | unsupervised learning | Supervised learning (SL) is a paradigm in mach... |
2 | Supreme Court of Canada | The Supreme Court of Canada (SCC; French: Cour... |
3 | Peace, Order, and Good Government | In many Commonwealth jurisdictions, the phrase... |
4 | Canadian constitutional law | Canadian constitutional law (French: droit con... |
5 | ice hockey | Ice hockey (or simply hockey) is a team sport ... |
import spacy
nlp = spacy.load("en_core_web_md", disable=["parser", "ner"])
def preprocess(
doc,
min_token_len=2,
irrelevant_pos=["ADV", "PRON", "CCONJ", "PUNCT", "PART", "DET", "ADP", "SPACE"],
):
"""
Given text, min_token_len, and irrelevant_pos carry out preprocessing of the text
and return a preprocessed string.
Parameters
-------------
doc : (spaCy doc object)
the spacy doc object of the text
min_token_len : (int)
min_token_length required
irrelevant_pos : (list)
a list of irrelevant pos tags
Returns
-------------
(str) the preprocessed text
"""
clean_text = []
for token in doc:
if (
token.is_stop == False # Check if it's not a stopword
and len(token) > min_token_len # Check if the word meets minimum threshold
and token.pos_ not in irrelevant_pos
): # Check if the POS is in the acceptable POS tags
lemma = token.lemma_ # Take the lemma of the word
clean_text.append(lemma.lower())
return " ".join(clean_text)
wiki_df["text_pp"] = [preprocess(text) for text in nlp.pipe(wiki_df["text"])]
wiki_df
wiki query | text | text_pp | |
---|---|---|---|
0 | Artificial Intelligence | Artificial intelligence (AI) is the intelligen... | artificial intelligence intelligence machine s... |
1 | unsupervised learning | Supervised learning (SL) is a paradigm in mach... | supervised learning paradigm machine learning ... |
2 | Supreme Court of Canada | The Supreme Court of Canada (SCC; French: Cour... | supreme court canada scc french cour suprême c... |
3 | Peace, Order, and Good Government | In many Commonwealth jurisdictions, the phrase... | commonwealth jurisdiction phrase peace order g... |
4 | Canadian constitutional law | Canadian constitutional law (French: droit con... | canadian constitutional law french droit const... |
5 | ice hockey | Ice hockey (or simply hockey) is a team sport ... | ice hockey hockey team sport play ice skate ic... |
import gensim
import gensim.corpora as corpora
corpus = [doc.split() for doc in wiki_df["text_pp"].tolist()]
dictionary = corpora.Dictionary(corpus) # Create a vocabulary for the lda model
pd.DataFrame(
dictionary.token2id.keys(), index=dictionary.token2id.values(), columns=["Word"]
)
Word | |
---|---|
0 | "criticism |
1 | 0070087705.these |
2 | 0134610993 |
3 | 127 |
4 | 1863 |
... | ... |
3789 | works |
3790 | worldwide |
3791 | youth |
3792 | zhenskaya |
3793 | zonal |
3794 rows × 1 columns
Gensim's doc2bow#
Now let's convert our corpus into a document-term matrix for LDA using dictionary.doc2bow. For each document, it stores the frequency of each token in the document in the format (token_id, frequency).
doc_term_matrix = [dictionary.doc2bow(doc) for doc in corpus]
doc_term_matrix[1][:20]
[(61, 4),
(76, 1),
(81, 3),
(82, 1),
(88, 1),
(89, 1),
(92, 3),
(123, 51),
(130, 1),
(141, 4),
(143, 1),
(155, 1),
(156, 1),
(157, 4),
(158, 3),
(171, 1),
(174, 1),
(181, 2),
(190, 2),
(202, 3)]
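The integer ids are not very readable on their own. To inspect the bag-of-words representation, we can map the ids back to tokens through the dictionary (a quick sketch; the exact tokens and counts depend on the fetched Wikipedia content):
# Map (token_id, frequency) pairs back to (token, frequency) for readability.
[(dictionary[token_id], freq) for token_id, freq in doc_term_matrix[1][:10]]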
Now we are ready to train an LDA model.
from gensim.models import LdaModel
num_topics = 3
lda = gensim.models.LdaModel(
corpus=doc_term_matrix,
id2word=dictionary,
num_topics=num_topics,
random_state=42,
passes=10,
)
Examine the topics and topic distribution for a document in our LDA model#
lda.print_topics(num_words=4) # Topics
[(0, '0.031*"hockey" + 0.022*"ice" + 0.020*"player" + 0.019*"team"'),
(1,
'0.010*"learning" + 0.009*"algorithm" + 0.009*"machine" + 0.009*"intelligence"'),
(2, '0.029*"court" + 0.015*"law" + 0.012*"provincial" + 0.012*"government"')]
print("Document: ", wiki_df.iloc[0].iloc[0])
print("Topic assignment for document: ", lda[doc_term_matrix[0]]) # Topic distribution
Document: Artificial Intelligence
Topic assignment for document: [(1, 0.99986523)]
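We can examine the topic distribution for each of the six Wikipedia articles in the same way (a sketch; the exact probabilities depend on the fetched Wikipedia content):
# Dominant topic(s) for each document in our small corpus.
for i, bow in enumerate(doc_term_matrix):
    print(wiki_df["wiki query"].iloc[i], "->", lda[bow])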
You can also visualize the topics using pyLDAvis.
pip install pyLDAvis
Do not install it using conda. They have made some changes in the recent version and a conda build is not available for this version yet.
Visualize topics#
# import pyLDAvis
# import pyLDAvis.gensim_models as gensimvis
# vis = gensimvis.prepare(lda, doc_term_matrix, dictionary, sort_topics=False)
# vis