"
],
"text/plain": [
" UserName ScreenName Location TweetAt \\\n",
"1927 1928 46880 Seattle, WA 13-03-2020 \n",
"1068 1069 46021 NaN 13-03-2020 \n",
"803 804 45756 The Outer Limits 12-03-2020 \n",
"2846 2847 47799 Flagstaff, AZ 15-03-2020 \n",
"3768 3769 48721 Montreal, Canada 16-03-2020 \n",
"... ... ... ... ... \n",
"1122 1123 46075 NaN 13-03-2020 \n",
"1346 1347 46299 Toronto 13-03-2020 \n",
"3454 3455 48407 Houston, TX 16-03-2020 \n",
"3437 3438 48390 Washington, DC 16-03-2020 \n",
"3582 3583 48535 St James' Park, Newcastle 16-03-2020 \n",
"\n",
" OriginalTweet Sentiment \n",
"1927 While I don't like all of Amazon's choices, to... Positive \n",
"1068 Me: shit buckets, its time to do the weekly s... Negative \n",
"803 @SecPompeo @realDonaldTrump You mean the plan ... Neutral \n",
"2846 @lauvagrande People who are sick arent panic ... Extremely Negative \n",
"3768 Coronavirus Panic: Toilet Paper Is the People... Negative \n",
"... ... ... \n",
"1122 Photos of our local grocery store shelveswher... Extremely Positive \n",
"1346 Just went to the the grocery store (Highland F... Positive \n",
"3454 Real talk though. Am I the only one spending h... Neutral \n",
"3437 The supermarket business is booming! #COVID2019 Neutral \n",
"3582 Evening All Here s the story on the and the im... Positive \n",
"\n",
"[3038 rows x 6 columns]"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',\n",
" 'Sentiment'],\n",
" dtype='object')"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Location\n",
"United States 63\n",
"London, England 37\n",
"Los Angeles, CA 30\n",
"New York, NY 29\n",
"Washington, DC 29\n",
" ..\n",
"Suburb of Chicago 1\n",
"philippines 1\n",
"Dont ask for freedom, take it. 1\n",
"Windsor Heights, IA 1\n",
"St James' Park, Newcastle 1\n",
"Name: count, Length: 1441, dtype: int64"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_df['Location'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"X_train, y_train = train_df[['OriginalTweet', 'Location']], train_df['Sentiment']\n",
"X_test, y_test = test_df[['OriginalTweet', 'Location']], test_df['Sentiment']"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Sentiment\n",
"Negative 852\n",
"Positive 743\n",
"Neutral 501\n",
"Extremely Negative 472\n",
"Extremely Positive 470\n",
"Name: count, dtype: int64"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_train.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"scoring_metrics = 'accuracy'"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"results = {}"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):\n",
" \"\"\"\n",
" Returns mean and std of cross validation\n",
"\n",
" Parameters\n",
" ----------\n",
" model :\n",
" scikit-learn model\n",
" X_train : numpy array or pandas DataFrame\n",
" X in the training data\n",
" y_train :\n",
" y in the training data\n",
"\n",
" Returns\n",
" ----------\n",
" pandas Series with mean scores from cross_validation\n",
" \"\"\"\n",
"\n",
" scores = cross_validate(model, X_train, y_train, **kwargs)\n",
"\n",
" mean_scores = pd.DataFrame(scores).mean()\n",
" std_scores = pd.DataFrame(scores).std()\n",
" out_col = []\n",
"\n",
" for i in range(len(mean_scores)):\n",
" out_col.append((f\"%0.3f (+/- %0.3f)\" % (mean_scores[i], std_scores[i])))\n",
"\n",
" return pd.Series(data=out_col, index=mean_scores.index)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Dummy classifier"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fit_time
\n",
"
score_time
\n",
"
test_score
\n",
"
train_score
\n",
"
\n",
" \n",
" \n",
"
\n",
"
dummy
\n",
"
0.001 (+/- 0.001)
\n",
"
0.001 (+/- 0.001)
\n",
"
0.280 (+/- 0.001)
\n",
"
0.280 (+/- 0.000)
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fit_time score_time test_score \\\n",
"dummy 0.001 (+/- 0.001) 0.001 (+/- 0.001) 0.280 (+/- 0.001) \n",
"\n",
" train_score \n",
"dummy 0.280 (+/- 0.000) "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dummy = DummyClassifier()\n",
"results[\"dummy\"] = mean_std_cross_val_scores(\n",
" dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics\n",
")\n",
"pd.DataFrame(results).T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Bag-of-words model "
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
"
\n",
"
\n",
"
fit_time
\n",
"
score_time
\n",
"
test_score
\n",
"
train_score
\n",
"
\n",
" \n",
" \n",
"
\n",
"
dummy
\n",
"
0.001 (+/- 0.001)
\n",
"
0.001 (+/- 0.001)
\n",
"
0.280 (+/- 0.001)
\n",
"
0.280 (+/- 0.000)
\n",
"
\n",
"
\n",
"
logistic regression
\n",
"
0.278 (+/- 0.019)
\n",
"
0.008 (+/- 0.000)
\n",
"
0.414 (+/- 0.012)
\n",
"
0.999 (+/- 0.000)
\n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fit_time score_time test_score \\\n",
"dummy 0.001 (+/- 0.001) 0.001 (+/- 0.001) 0.280 (+/- 0.001) \n",
"logistic regression 0.278 (+/- 0.019) 0.008 (+/- 0.000) 0.414 (+/- 0.012) \n",
"\n",
" train_score \n",
"dummy 0.280 (+/- 0.000) \n",
"logistic regression 0.999 (+/- 0.000) "
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"pipe = make_pipeline(CountVectorizer(stop_words='english'), \n",
" LogisticRegression(max_iter=1000))\n",
"results[\"logistic regression\"] = mean_std_cross_val_scores(\n",
" pipe, X_train['OriginalTweet'], y_train, return_train_score=True, scoring=scoring_metrics\n",
")\n",
"pd.DataFrame(results).T"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Is it possible to further improve the scores?\n",
"\n",
"- How about adding new features based on our intuitions? Let's extract our own features that might be useful for this prediction task. In other words, let's carry out **feature engineering**. \n",
"\n",
"- The code below adds some very basic length-related and sentiment features. We will be using a popular library called `nltk` for this exercise. If you have successfully created the course `conda` environment on your machine, you should already have this package in the environment. "
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"- How do we extract interesting information from text?\n",
"- We use **pre-trained models**! "
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"- A couple of popular libraries which include such pre-trained models. \n",
"- `nltk`\n",
"```\n",
"conda install -c anaconda nltk \n",
"``` \n",
"- spaCy\n",
"```\n",
"conda install -c conda-forge spacy\n",
"```\n",
"\n",
"For emoji support: \n",
"```\n",
"pip install spacymoji\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- You also need to download the language model which contains all the pre-trained models. For that run the following in your course `conda` environment or here. "
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"# !python -m spacy download en_core_web_md"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /Users/kvarada/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"\n",
"nltk.download(\"punkt\")"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package vader_lexicon to\n",
"[nltk_data] /Users/kvarada/nltk_data...\n",
"[nltk_data] Package vader_lexicon is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /Users/kvarada/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
}
],
"source": [
"nltk.download(\"vader_lexicon\")\n",
"nltk.download(\"punkt\")\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"\n",
"sid = SentimentIntensityAnalyzer()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'neg': 0.0, 'neu': 0.368, 'pos': 0.632, 'compound': 0.8225}\n"
]
}
],
"source": [
"s = \"CPSC 330 students are smart, sweet, and funny.\"\n",
"print(sid.polarity_scores(s))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'neg': 0.249, 'neu': 0.751, 'pos': 0.0, 'compound': -0.5106}\n"
]
}
],
"source": [
"s = \"CPSC 330 students are tired because of all the hard work they have been doing.\"\n",
"print(sid.polarity_scores(s))"
]
},
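{
"cell_type": "markdown",
"metadata": {},
"source": [
"The compound score is a single number between -1 and +1. Here is a minimal sketch of turning it into discrete labels, assuming the conventional thresholds suggested in the VADER documentation (`compound_to_label` is our own helper name):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def compound_to_label(compound, pos_threshold=0.05, neg_threshold=-0.05):\n",
"    \"\"\"Map a VADER compound score to a coarse sentiment label.\n",
"\n",
"    The +/- 0.05 thresholds follow the convention suggested in the VADER documentation.\n",
"    \"\"\"\n",
"    if compound >= pos_threshold:\n",
"        return \"positive\"\n",
"    if compound <= neg_threshold:\n",
"        return \"negative\"\n",
"    return \"neutral\"\n",
"\n",
"compound_to_label(sid.polarity_scores(s)[\"compound\"])"
]
},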
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### [spaCy](https://spacy.io/) \n",
"\n",
"A useful package for text processing and feature extraction\n",
"- Active development: https://github.com/explosion/spaCy\n",
"- Interactive lessons by Ines Montani: https://course.spacy.io/en/\n",
"- Good documentation, easy to use, and customizable.\n",
"\n",
"To run the code below, you have to download the pretrained model in the course environment. \n",
"\n",
"> python -m spacy download en_core_web_md"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"import spacy\n",
"\n",
"nlp = spacy.load(\"en_core_web_md\")"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"sample_text = \"\"\"Dolly Parton is a gift to us all. \n",
"From writing all-time great songs like “Jolene” and “I Will Always Love You”, \n",
"to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine, \n",
"she’s given us so much. Now, Netflix bring us Dolly Parton’s Christmas on the Square, \n",
"an original musical that stars Christine Baranski as a Scrooge-like landowner \n",
"who threatens to evict an entire town on Christmas Eve to make room for a new mall. \n",
"Directed and choreographed by the legendary Debbie Allen and counting Jennifer Lewis \n",
"and Parton herself amongst its cast, Christmas on the Square seems like the perfect movie\n",
"to save Christmas 2020. 😻 👍🏿\"\"\"\n",
"\n",
"# [Adapted from here.](https://thepopbreak.com/2020/11/22/dolly-partons-christmas-on-the-square-review-not-quite-a-christmas-miracle/)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"source": [
"Spacy extracts all interesting information from text with this call."
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"slideshow": {
"slide_type": "-"
}
},
"outputs": [],
"source": [
"doc = nlp(sample_text)"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Let's look at part-of-speech tags. "
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(Dolly, 'PROPN'), (Parton, 'PROPN'), (is, 'AUX'), (a, 'DET'), (gift, 'NOUN'), (to, 'ADP'), (us, 'PRON'), (all, 'PRON'), (., 'PUNCT'), (\n",
", 'SPACE'), (From, 'ADP'), (writing, 'VERB'), (all, 'DET'), (-, 'PUNCT'), (time, 'NOUN'), (great, 'ADJ'), (songs, 'NOUN'), (like, 'ADP'), (“, 'PUNCT'), (Jolene, 'PROPN')]\n"
]
}
],
"source": [
"print([(token, token.pos_) for token in doc][:20])"
]
},
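{
"cell_type": "markdown",
"metadata": {},
"source": [
"Part-of-speech tags are only one of many token-level attributes. A small sketch of a few others that often come in handy for feature engineering (reusing the `doc` from above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lemma_ gives the base form, is_stop flags stopwords, like_num flags number-like tokens.\n",
"[(token.text, token.lemma_, token.is_stop, token.like_num) for token in doc[:10]]"
]
},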
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"- Often we want to know who did what to whom. \n",
"- **Named entities** give you this information. \n",
"- What are named entities in the text? "
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
" Dolly Parton\n",
" PERSON\n",
"\n",
" is a gift to us all. From writing all-time great songs like “\n",
"\n",
" Jolene\n",
" PERSON\n",
"\n",
"” and “I Will Always Love You”, to great performances in films like \n",
"\n",
" 9 to 5\n",
" DATE\n",
"\n",
", to helping fund a COVID-19 vaccine, she’s given us so much. Now, \n",
"\n",
" Netflix\n",
" ORG\n",
"\n",
" bring us \n",
"\n",
" Dolly Parton\n",
" PERSON\n",
"\n",
"’s \n",
"\n",
" Christmas\n",
" DATE\n",
"\n",
" on the \n",
"\n",
" Square\n",
" FAC\n",
"\n",
", an original musical that stars \n",
"\n",
" Christine Baranski\n",
" PERSON\n",
"\n",
" as a Scrooge-like landowner who threatens to evict an entire town on \n",
"\n",
" Christmas Eve\n",
" DATE\n",
"\n",
" to make room for a new mall. Directed and choreographed by the legendary \n",
"\n",
" Debbie Allen\n",
" PERSON\n",
"\n",
" and counting \n",
"\n",
" Jennifer Lewis\n",
" PERSON\n",
"\n",
" and \n",
"\n",
" Parton\n",
" PERSON\n",
"\n",
" herself amongst its cast, \n",
"\n",
" Christmas\n",
" DATE\n",
"\n",
" on the \n",
"\n",
" Square\n",
" FAC\n",
"\n",
" seems like the perfect movie to save \n",
"\n",
" Christmas 2020\n",
" DATE\n",
"\n",
". 😻 👍🏿
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from spacy import displacy\n",
"\n",
"displacy.render(doc, style=\"ent\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Named entities:\n",
" [('Dolly Parton', 'PERSON'), ('Jolene', 'PERSON'), ('9 to 5', 'DATE'), ('Netflix', 'ORG'), ('Dolly Parton', 'PERSON'), ('Christmas', 'DATE'), ('Square', 'FAC'), ('Christine Baranski', 'PERSON'), ('Christmas Eve', 'DATE'), ('Debbie Allen', 'PERSON'), ('Jennifer Lewis', 'PERSON'), ('Parton', 'PERSON'), ('Christmas', 'DATE'), ('Square', 'FAC'), ('Christmas 2020', 'DATE')]\n",
"\n",
"ORG means: Companies, agencies, institutions, etc.\n",
"\n",
"PERSON means: People, including fictional\n",
"\n",
"DATE means: Absolute or relative dates or periods\n"
]
}
],
"source": [
"print(\"Named entities:\\n\", [(ent.text, ent.label_) for ent in doc.ents])\n",
"print(\"\\nORG means: \", spacy.explain(\"ORG\"))\n",
"print(\"\\nPERSON means: \", spacy.explain(\"PERSON\"))\n",
"print(\"\\nDATE means: \", spacy.explain(\"DATE\"))"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"### An example from a project \n",
"\n",
"Goal: Extract and visualize inter-corporate relationships from disclosed annual 10-K reports of public companies. \n",
"\n",
"[Source for the text below.](https://www.bbc.com/news/business-39875417)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"text = (\n",
" \"Heavy hitters, including Microsoft and Google, \"\n",
" \"are competing for customers in cloud services with the likes of IBM and Salesforce.\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
Heavy hitters, including \n",
"\n",
" Microsoft\n",
" ORG\n",
"\n",
" and \n",
"\n",
" Google\n",
" ORG\n",
"\n",
", are competing for customers in cloud services with the likes of \n",
"\n",
" IBM\n",
" ORG\n",
"\n",
" and \n",
"\n",
" Salesforce\n",
" PRODUCT\n",
"\n",
".
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Named entities:\n",
" [('Microsoft', 'ORG'), ('Google', 'ORG'), ('IBM', 'ORG'), ('Salesforce', 'PRODUCT')]\n"
]
}
],
"source": [
"doc = nlp(text)\n",
"displacy.render(doc, style=\"ent\")\n",
"print(\"Named entities:\\n\", [(ent.text, ent.label_) for ent in doc.ents])"
]
},
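{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell above only finds the entities. One simple way to sketch inter-corporate relationships is to treat organizations mentioned in the same sentence as related. This is a rough, hypothetical sketch, not the project's actual method; note that the model tagged Salesforce as `PRODUCT` above, so it would be missed here, which is one reason real projects need more careful relation extraction."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from itertools import combinations\n",
"\n",
"# Hypothetical sketch: treat ORG entities co-occurring in a sentence as related.\n",
"edges = []\n",
"for sent in doc.sents:\n",
"    orgs = [ent.text for ent in sent.ents if ent.label_ == \"ORG\"]\n",
"    edges.extend(combinations(orgs, 2))\n",
"edges"
]
},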
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"If you want emoji identification support install [`spacymoji`](https://pypi.org/project/spacymoji/) in the course environment. \n",
"\n",
"```\n",
"pip install spacymoji\n",
"```\n",
"\n",
"After installing `spacymoji`, if it's still complaining about module not found, my guess is that you do not have `pip` installed in your `conda` environment. Go to your course `conda` environment install `pip` and install the `spacymoji` package in the environment using the `pip` you just installed in the current environment. \n",
"\n",
"```\n",
"conda install pip\n",
"YOUR_MINICONDA_PATH/miniconda3/envs/cpsc330/bin/pip install spacymoji\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"outputs": [],
"source": [
"from spacymoji import Emoji\n",
"\n",
"nlp.add_pipe(\"emoji\", first=True);"
]
},
{
"cell_type": "markdown",
"metadata": {
"slideshow": {
"slide_type": "slide"
}
},
"source": [
"Does the text have any emojis? If yes, extract the description. "
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('😻', 138, 'smiling cat with heart-eyes'),\n",
" ('👍🏿', 139, 'thumbs up dark skin tone')]"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doc = nlp(sample_text)\n",
"doc._.emoji"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple feature engineering for our problem. "
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"import en_core_web_md\n",
"import spacy\n",
"\n",
"nlp = en_core_web_md.load()\n",
"from spacymoji import Emoji\n",
"\n",
"nlp.add_pipe(\"emoji\", first=True)\n",
"\n",
"def get_relative_length(text, TWITTER_ALLOWED_CHARS=280.0):\n",
" \"\"\"\n",
" Returns the relative length of text.\n",
"\n",
" Parameters:\n",
" ------\n",
" text: (str)\n",
" the input text\n",
"\n",
" Keyword arguments:\n",
" ------\n",
" TWITTER_ALLOWED_CHARS: (float)\n",
" the denominator for finding relative length\n",
"\n",
" Returns:\n",
" -------\n",
" relative length of text: (float)\n",
"\n",
" \"\"\"\n",
" return len(text) / TWITTER_ALLOWED_CHARS\n",
"\n",
"\n",
"def get_length_in_words(text):\n",
" \"\"\"\n",
" Returns the length of the text in words.\n",
"\n",
" Parameters:\n",
" ------\n",
" text: (str)\n",
" the input text\n",
"\n",
" Returns:\n",
" -------\n",
" length of tokenized text: (int)\n",
"\n",
" \"\"\"\n",
" return len(nltk.word_tokenize(text))\n",
"\n",
"\n",
"def get_sentiment(text):\n",
" \"\"\"\n",
" Returns the compound score representing the sentiment: -1 (most extreme negative) and +1 (most extreme positive)\n",
" The compound score is a normalized score calculated by summing the valence scores of each word in the lexicon.\n",
"\n",
" Parameters:\n",
" ------\n",
" text: (str)\n",
" the input text\n",
"\n",
" Returns:\n",
" -------\n",
" sentiment of the text: (str)\n",
" \"\"\"\n",
" scores = sid.polarity_scores(text)\n",
" return scores[\"compound\"]\n",
"\n",
"def get_avg_word_length(text):\n",
" \"\"\"\n",
" Returns the average word length of the given text.\n",
"\n",
" Parameters:\n",
" text -- (str)\n",
" \"\"\"\n",
" words = text.split()\n",
" return sum(len(word) for word in words) / len(words)\n",
"\n",
"\n",
"def has_emoji(text):\n",
" \"\"\"\n",
" Returns the average word length of the given text.\n",
"\n",
" Parameters:\n",
" text -- (str)\n",
" \"\"\"\n",
" doc = nlp(text)\n",
" return 1 if doc._.has_emoji else 0"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt_tab to\n",
"[nltk_data] /Users/kvarada/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt_tab.zip.\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import nltk\n",
"nltk.download('punkt_tab')"
]
},
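{
"cell_type": "markdown",
"metadata": {},
"source": [
"With the helper functions defined and the tokenizer data downloaded, the engineered features can be added as new columns. A minimal sketch, assuming the functions above (the column names are our own choice):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: add the engineered features as new columns on a copy of X_train.\n",
"X_train_enhanced = X_train.assign(\n",
"    n_words=X_train[\"OriginalTweet\"].apply(get_length_in_words),\n",
"    vader_sentiment=X_train[\"OriginalTweet\"].apply(get_sentiment),\n",
"    rel_char_len=X_train[\"OriginalTweet\"].apply(get_relative_length),\n",
")"
]
},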
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.