{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Appendix A: Demo of feature engineering for text data " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Import " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We will be using [Covid tweets](https://www.kaggle.com/code/kerneler/starter-covid-19-nlp-text-d3a3baa6-e/data) dataset for this. " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "jupyter": { "source_hidden": true } }, "outputs": [], "source": [ "import os\n", "import sys\n", "\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import numpy.random as npr\n", "import pandas as pd\n", "from sklearn.compose import (\n", " ColumnTransformer,\n", " TransformedTargetRegressor,\n", " make_column_transformer,\n", ")\n", "sys.path.append(os.path.join(os.path.abspath(\"..\"), \"code\"))\n", "from sklearn.dummy import DummyClassifier, DummyRegressor\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeCV\n", "from sklearn.metrics import make_scorer, mean_squared_error, r2_score\n", "from sklearn.model_selection import cross_val_score, cross_validate, train_test_split\n", "from sklearn.pipeline import Pipeline, make_pipeline\n", "from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler\n", "DATA_DIR = os.path.join(os.path.abspath(\"..\"), \"data/\")\n", "from sklearn.svm import SVC" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sentiment\n", "Negative 1041\n", "Positive 947\n", "Neutral 619\n", "Extremely Positive 599\n", "Extremely Negative 592\n", "Name: count, dtype: int64" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(DATA_DIR + 'Corona_NLP_test.csv')\n", "df['Sentiment'].value_counts()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserNameScreenNameLocationTweetAtOriginalTweetSentiment
1927192846880Seattle, WA13-03-2020While I don't like all of Amazon's choices, to...Positive
1068106946021NaN13-03-2020Me: shit buckets, it’s time to do the weekly s...Negative
80380445756The Outer Limits12-03-2020@SecPompeo @realDonaldTrump You mean the plan ...Neutral
2846284747799Flagstaff, AZ15-03-2020@lauvagrande People who are sick aren’t panic ...Extremely Negative
3768376948721Montreal, Canada16-03-2020Coronavirus Panic: Toilet Paper Is the “People...Negative
.....................
1122112346075NaN13-03-2020Photos of our local grocery store shelves—wher...Extremely Positive
1346134746299Toronto13-03-2020Just went to the the grocery store (Highland F...Positive
3454345548407Houston, TX16-03-2020Real talk though. Am I the only one spending h...Neutral
3437343848390Washington, DC16-03-2020The supermarket business is booming! #COVID2019Neutral
3582358348535St James' Park, Newcastle16-03-2020Evening All Here s the story on the and the im...Positive
\n", "

3038 rows × 6 columns

\n", "
" ], "text/plain": [ " UserName ScreenName Location TweetAt \\\n", "1927 1928 46880 Seattle, WA 13-03-2020 \n", "1068 1069 46021 NaN 13-03-2020 \n", "803 804 45756 The Outer Limits 12-03-2020 \n", "2846 2847 47799 Flagstaff, AZ 15-03-2020 \n", "3768 3769 48721 Montreal, Canada 16-03-2020 \n", "... ... ... ... ... \n", "1122 1123 46075 NaN 13-03-2020 \n", "1346 1347 46299 Toronto 13-03-2020 \n", "3454 3455 48407 Houston, TX 16-03-2020 \n", "3437 3438 48390 Washington, DC 16-03-2020 \n", "3582 3583 48535 St James' Park, Newcastle 16-03-2020 \n", "\n", " OriginalTweet Sentiment \n", "1927 While I don't like all of Amazon's choices, to... Positive \n", "1068 Me: shit buckets, it’s time to do the weekly s... Negative \n", "803 @SecPompeo @realDonaldTrump You mean the plan ... Neutral \n", "2846 @lauvagrande People who are sick aren’t panic ... Extremely Negative \n", "3768 Coronavirus Panic: Toilet Paper Is the “People... Negative \n", "... ... ... \n", "1122 Photos of our local grocery store shelves—wher... Extremely Positive \n", "1346 Just went to the the grocery store (Highland F... Positive \n", "3454 Real talk though. Am I the only one spending h... Neutral \n", "3437 The supermarket business is booming! #COVID2019 Neutral \n", "3582 Evening All Here s the story on the and the im... Positive \n", "\n", "[3038 rows x 6 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet',\n", " 'Sentiment'],\n", " dtype='object')" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.columns" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Location\n", "United States 63\n", "London, England 37\n", "Los Angeles, CA 30\n", "New York, NY 29\n", "Washington, DC 29\n", " ..\n", "Suburb of Chicago 1\n", "philippines 1\n", "Dont ask for freedom, take it. 
1\n", "Windsor Heights, IA 1\n", "St James' Park, Newcastle 1\n", "Name: count, Length: 1441, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df['Location'].value_counts()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "X_train, y_train = train_df[['OriginalTweet', 'Location']], train_df['Sentiment']\n", "X_test, y_test = test_df[['OriginalTweet', 'Location']], test_df['Sentiment']" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sentiment\n", "Negative 852\n", "Positive 743\n", "Neutral 501\n", "Extremely Negative 472\n", "Extremely Positive 470\n", "Name: count, dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train.value_counts()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "scoring_metrics = 'accuracy'" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "results = {}" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):\n", " \"\"\"\n", " Returns mean and std of cross validation\n", "\n", " Parameters\n", " ----------\n", " model :\n", " scikit-learn model\n", " X_train : numpy array or pandas DataFrame\n", " X in the training data\n", " y_train :\n", " y in the training data\n", "\n", " Returns\n", " ----------\n", " pandas Series with mean scores from cross_validation\n", " \"\"\"\n", "\n", " scores = cross_validate(model, X_train, y_train, **kwargs)\n", "\n", " mean_scores = pd.DataFrame(scores).mean()\n", " std_scores = pd.DataFrame(scores).std()\n", " out_col = []\n", "\n", " for i in range(len(mean_scores)):\n", " out_col.append((f\"%0.3f (+/- %0.3f)\" % (mean_scores.iloc[i], std_scores.iloc[i])))\n", "\n", " return pd.Series(data=out_col, index=mean_scores.index)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dummy classifier" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.000 (+/- 0.000)0.280 (+/- 0.001)0.280 (+/- 0.000)
\n", "
" ], "text/plain": [ " fit_time score_time test_score \\\n", "dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) \n", "\n", " train_score \n", "dummy 0.280 (+/- 0.000) " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dummy = DummyClassifier()\n", "results[\"dummy\"] = mean_std_cross_val_scores(\n", " dummy, X_train, y_train, return_train_score=True, scoring=scoring_metrics\n", ")\n", "pd.DataFrame(results).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Bag-of-words model " ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.000 (+/- 0.000)0.280 (+/- 0.001)0.280 (+/- 0.000)
logistic regression0.306 (+/- 0.015)0.008 (+/- 0.000)0.414 (+/- 0.012)0.999 (+/- 0.000)
\n", "
" ], "text/plain": [ " fit_time score_time test_score \\\n", "dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) \n", "logistic regression 0.306 (+/- 0.015) 0.008 (+/- 0.000) 0.414 (+/- 0.012) \n", "\n", " train_score \n", "dummy 0.280 (+/- 0.000) \n", "logistic regression 0.999 (+/- 0.000) " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "pipe = make_pipeline(CountVectorizer(stop_words='english'), \n", " LogisticRegression(max_iter=1000))\n", "results[\"logistic regression\"] = mean_std_cross_val_scores(\n", " pipe, X_train['OriginalTweet'], y_train, return_train_score=True, scoring=scoring_metrics\n", ")\n", "pd.DataFrame(results).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Is it possible to further improve the scores?\n", "\n", "- How about adding new features based on our intuitions? Let's extract our own features that might be useful for this prediction task. In other words, let's carry out **feature engineering**. \n", "\n", "- The code below adds some very basic length-related and sentiment features. We will be using a popular library called `nltk` for this exercise. If you have successfully created the course `conda` environment on your machine, you should already have this package in the environment. " ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "- How do we extract interesting information from text?\n", "- We use **pre-trained models**! " ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "- A couple of popular libraries which include such pre-trained models. \n", "- `nltk`\n", "```\n", "conda install -c anaconda nltk \n", "``` \n", "- spaCy\n", "```\n", "conda install -c conda-forge spacy\n", "```\n", "\n", "For emoji support: \n", "```\n", "pip install spacymoji\n", "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- You also need to download the language model which contains all the pre-trained models. For that run the following in your course `conda` environment or here. " ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "\n", "A module that was compiled using NumPy 1.x cannot be run in\n", "NumPy 2.2.3 as it may crash. To support both 1.x and 2.x\n", "versions of NumPy, modules must be compiled with NumPy 2.0.\n", "Some module may need to rebuild instead e.g. 
with 'pybind11>=2.12'.\n", "\n", "If you are a user of the module, the easiest solution will be to\n", "downgrade to 'numpy<2' or try to upgrade the affected module.\n", "We expect that some modules will need time to support NumPy 2.\n", "\n", "Traceback (most recent call last): File \"\", line 198, in _run_module_as_main\n", " File \"\", line 88, in _run_code\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel_launcher.py\", line 18, in \n", " app.launch_new_instance()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/traitlets/config/application.py\", line 1075, in launch_instance\n", " app.start()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/kernelapp.py\", line 739, in start\n", " self.io_loop.start()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/tornado/platform/asyncio.py\", line 205, in start\n", " self.asyncio_loop.run_forever()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/asyncio/base_events.py\", line 640, in run_forever\n", " self._run_once()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/asyncio/base_events.py\", line 1992, in _run_once\n", " handle._run()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/asyncio/events.py\", line 88, in _run\n", " self._context.run(self._callback, *self._args)\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 545, in dispatch_queue\n", " await self.process_one()\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 534, in process_one\n", " await dispatch(*args)\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 437, in dispatch_shell\n", " await result\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/ipkernel.py\", line 362, in execute_request\n", " await super().execute_request(stream, ident, parent)\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/kernelbase.py\", line 778, in execute_request\n", " reply_content = await reply_content\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/ipkernel.py\", line 449, in do_execute\n", " res = shell.run_cell(\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/ipykernel/zmqshell.py\", line 549, in run_cell\n", " return super().run_cell(*args, **kwargs)\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3075, in run_cell\n", " result = self._run_cell(\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3130, in _run_cell\n", " result = runner(coro)\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/IPython/core/async_helpers.py\", line 128, in _pseudo_sync_runner\n", " coro.send(None)\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3334, in run_cell_async\n", " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3517, in run_ast_nodes\n", " if await self.run_code(code, result, async_=asy):\n", 
" File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/IPython/core/interactiveshell.py\", line 3577, in run_code\n", " exec(code_obj, self.user_global_ns, self.user_ns)\n", " File \"/var/folders/j6/dt88trtd17lf726d55bq16c40000gr/T/ipykernel_86208/456904786.py\", line 1, in \n", " import spacy\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/spacy/__init__.py\", line 6, in \n", " from .errors import setup_default_warnings\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/spacy/errors.py\", line 3, in \n", " from .compat import Literal\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/spacy/compat.py\", line 4, in \n", " from thinc.util import copy_array\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/thinc/__init__.py\", line 5, in \n", " from .config import registry\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/thinc/config.py\", line 5, in \n", " from .types import Decorator\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/thinc/types.py\", line 25, in \n", " from .compat import cupy, has_cupy\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/thinc/compat.py\", line 35, in \n", " import torch\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/torch/__init__.py\", line 1477, in \n", " from .functional import * # noqa: F403\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/torch/functional.py\", line 9, in \n", " import torch.nn.functional as F\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/torch/nn/__init__.py\", line 1, in \n", " from .modules import * # noqa: F403\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/torch/nn/modules/__init__.py\", line 35, in \n", " from .transformer import TransformerEncoder, TransformerDecoder, \\\n", " File \"/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/torch/nn/modules/transformer.py\", line 20, in \n", " device: torch.device = torch.device(torch._C._get_default_device()), # torch.device('cpu'),\n", "/Users/mathias/miniconda3/envs/cpsc330/lib/python3.12/site-packages/torch/nn/modules/transformer.py:20: UserWarning: Failed to initialize NumPy: _ARRAY_API not found (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1711403226120/work/torch/csrc/utils/tensor_numpy.cpp:84.)\n", " device: torch.device = torch.device(torch._C._get_default_device()), # torch.device('cpu'),\n" ] } ], "source": [ "import spacy\n", "\n", "# !python -m spacy download en_core_web_md" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /Users/mathias/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "\n", "nltk.download(\"punkt\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package vader_lexicon to\n", "[nltk_data] /Users/mathias/nltk_data...\n", "[nltk_data] Package vader_lexicon is already up-to-date!\n", "[nltk_data] 
Downloading package punkt to /Users/mathias/nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n" ] } ], "source": [ "nltk.download(\"vader_lexicon\")\n", "nltk.download(\"punkt\")\n", "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", "\n", "sid = SentimentIntensityAnalyzer()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'neg': 0.0, 'neu': 0.368, 'pos': 0.632, 'compound': 0.8225}\n" ] } ], "source": [ "s = \"CPSC 330 students are smart, sweet, and funny.\"\n", "print(sid.polarity_scores(s))" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'neg': 0.249, 'neu': 0.751, 'pos': 0.0, 'compound': -0.5106}\n" ] } ], "source": [ "s = \"CPSC 330 students are tired because of all the hard work they have been doing.\"\n", "print(sid.polarity_scores(s))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### [spaCy](https://spacy.io/) \n", "\n", "A useful package for text processing and feature extraction\n", "- Active development: https://github.com/explosion/spaCy\n", "- Interactive lessons by Ines Montani: https://course.spacy.io/en/\n", "- Good documentation, easy to use, and customizable.\n", "\n", "To run the code below, you have to download the pretrained model in the course environment. \n", "\n", "> python -m spacy download en_core_web_md" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "import spacy\n", "\n", "nlp = spacy.load(\"en_core_web_md\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "sample_text = \"\"\"Dolly Parton is a gift to us all. \n", "From writing all-time great songs like “Jolene” and “I Will Always Love You”, \n", "to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine, \n", "she’s given us so much. Now, Netflix bring us Dolly Parton’s Christmas on the Square, \n", "an original musical that stars Christine Baranski as a Scrooge-like landowner \n", "who threatens to evict an entire town on Christmas Eve to make room for a new mall. \n", "Directed and choreographed by the legendary Debbie Allen and counting Jennifer Lewis \n", "and Parton herself amongst its cast, Christmas on the Square seems like the perfect movie\n", "to save Christmas 2020. 😻 👍🏿\"\"\"\n", "\n", "# [Adapted from here.](https://thepopbreak.com/2020/11/22/dolly-partons-christmas-on-the-square-review-not-quite-a-christmas-miracle/)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "-" } }, "source": [ "Spacy extracts all interesting information from text with this call." ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "slideshow": { "slide_type": "-" } }, "outputs": [], "source": [ "doc = nlp(sample_text)" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Let's look at part-of-speech tags. 
" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(Dolly, 'PROPN'), (Parton, 'PROPN'), (is, 'AUX'), (a, 'DET'), (gift, 'NOUN'), (to, 'ADP'), (us, 'PRON'), (all, 'PRON'), (., 'PUNCT'), (\n", ", 'SPACE'), (From, 'ADP'), (writing, 'VERB'), (all, 'DET'), (-, 'PUNCT'), (time, 'NOUN'), (great, 'ADJ'), (songs, 'NOUN'), (like, 'ADP'), (“, 'PUNCT'), (Jolene, 'PROPN')]\n" ] } ], "source": [ "print([(token, token.pos_) for token in doc][:20])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "- Often we want to know who did what to whom. \n", "- **Named entities** give you this information. \n", "- What are named entities in the text? " ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " Dolly Parton\n", " PERSON\n", "\n", " is a gift to us all.
From writing all-time great songs like “\n", "\n", " Jolene\n", " PERSON\n", "\n", "” and “\n", "\n", " I Will Always Love You\n", " WORK_OF_ART\n", "\n", "”,
to great performances in films like 9 to 5, to helping fund a COVID-19 vaccine,
she’s given us so much. Now, \n", "\n", " Netflix\n", " ORG\n", "\n", " bring us \n", "\n", " Dolly Parton’s\n", " PERSON\n", "\n", " \n", "\n", " Christmas\n", " DATE\n", "\n", " on the Square,
an original musical that stars \n", "\n", " Christine Baranski\n", " PERSON\n", "\n", " as a Scrooge-like landowner
who threatens to evict an entire town on \n", "\n", " Christmas Eve\n", " DATE\n", "\n", " to make room for a new mall.
Directed and choreographed by the legendary \n", "\n", " Debbie Allen\n", " PERSON\n", "\n", " and counting \n", "\n", " Jennifer Lewis\n", " PERSON\n", "\n", "
and \n", "\n", " Parton\n", " PERSON\n", "\n", " herself amongst its cast, \n", "\n", " Christmas\n", " DATE\n", "\n", " on the Square seems like the perfect movie
to save \n", "\n", " Christmas 2020\n", " DATE\n", "\n", ". 😻 👍🏿
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from spacy import displacy\n", "\n", "displacy.render(doc, style=\"ent\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Named entities:\n", " [('Dolly Parton', 'PERSON'), ('Jolene', 'PERSON'), ('I Will Always Love You', 'WORK_OF_ART'), ('Netflix', 'ORG'), ('Dolly Parton’s', 'PERSON'), ('Christmas', 'DATE'), ('Christine Baranski', 'PERSON'), ('Christmas Eve', 'DATE'), ('Debbie Allen', 'PERSON'), ('Jennifer Lewis', 'PERSON'), ('Parton', 'PERSON'), ('Christmas', 'DATE'), ('Christmas 2020', 'DATE')]\n", "\n", "ORG means: Companies, agencies, institutions, etc.\n", "\n", "PERSON means: People, including fictional\n", "\n", "DATE means: Absolute or relative dates or periods\n" ] } ], "source": [ "print(\"Named entities:\\n\", [(ent.text, ent.label_) for ent in doc.ents])\n", "print(\"\\nORG means: \", spacy.explain(\"ORG\"))\n", "print(\"\\nPERSON means: \", spacy.explain(\"PERSON\"))\n", "print(\"\\nDATE means: \", spacy.explain(\"DATE\"))" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "### An example from a project \n", "\n", "Goal: Extract and visualize inter-corporate relationships from disclosed annual 10-K reports of public companies. \n", "\n", "[Source for the text below.](https://www.bbc.com/news/business-39875417)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "text = (\n", " \"Heavy hitters, including Microsoft and Google, \"\n", " \"are competing for customers in cloud services with the likes of IBM and Salesforce.\"\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
(displaCy entity rendering) Heavy hitters, including Microsoft [ORG] and Google [ORG], are competing for customers in cloud services with the likes of IBM [ORG] and Salesforce [PERSON].
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Named entities:\n", " [('Microsoft', 'ORG'), ('Google', 'ORG'), ('IBM', 'ORG'), ('Salesforce', 'PERSON')]\n" ] } ], "source": [ "doc = nlp(text)\n", "displacy.render(doc, style=\"ent\")\n", "print(\"Named entities:\\n\", [(ent.text, ent.label_) for ent in doc.ents])" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "If you want emoji identification support install [`spacymoji`](https://pypi.org/project/spacymoji/) in the course environment. \n", "\n", "```\n", "pip install spacymoji\n", "```\n", "\n", "After installing `spacymoji`, if it's still complaining about module not found, my guess is that you do not have `pip` installed in your `conda` environment. Go to your course `conda` environment install `pip` and install the `spacymoji` package in the environment using the `pip` you just installed in the current environment. \n", "\n", "```\n", "conda install pip\n", "YOUR_MINICONDA_PATH/miniconda3/envs/cpsc330/bin/pip install spacymoji\n", "```" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "slideshow": { "slide_type": "slide" } }, "outputs": [], "source": [ "from spacymoji import Emoji\n", "\n", "nlp.add_pipe(\"emoji\", first=True);" ] }, { "cell_type": "markdown", "metadata": { "slideshow": { "slide_type": "slide" } }, "source": [ "Does the text have any emojis? If yes, extract the description. " ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('😻', 138, 'smiling cat with heart-eyes'),\n", " ('👍🏿', 139, 'thumbs up dark skin tone')]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "doc = nlp(sample_text)\n", "doc._.emoji" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "



" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Simple feature engineering for our problem. " ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "import en_core_web_md\n", "import spacy\n", "\n", "nlp = en_core_web_md.load()\n", "from spacymoji import Emoji\n", "\n", "nlp.add_pipe(\"emoji\", first=True)\n", "\n", "def get_relative_length(text, TWITTER_ALLOWED_CHARS=280.0):\n", " \"\"\"\n", " Returns the relative length of text.\n", "\n", " Parameters:\n", " ------\n", " text: (str)\n", " the input text\n", "\n", " Keyword arguments:\n", " ------\n", " TWITTER_ALLOWED_CHARS: (float)\n", " the denominator for finding relative length\n", "\n", " Returns:\n", " -------\n", " relative length of text: (float)\n", "\n", " \"\"\"\n", " return len(text) / TWITTER_ALLOWED_CHARS\n", "\n", "\n", "def get_length_in_words(text):\n", " \"\"\"\n", " Returns the length of the text in words.\n", "\n", " Parameters:\n", " ------\n", " text: (str)\n", " the input text\n", "\n", " Returns:\n", " -------\n", " length of tokenized text: (int)\n", "\n", " \"\"\"\n", " return len(nltk.word_tokenize(text))\n", "\n", "\n", "def get_sentiment(text):\n", " \"\"\"\n", " Returns the compound score representing the sentiment: -1 (most extreme negative) and +1 (most extreme positive)\n", " The compound score is a normalized score calculated by summing the valence scores of each word in the lexicon.\n", "\n", " Parameters:\n", " ------\n", " text: (str)\n", " the input text\n", "\n", " Returns:\n", " -------\n", " sentiment of the text: (str)\n", " \"\"\"\n", " scores = sid.polarity_scores(text)\n", " return scores[\"compound\"]\n", "\n", "def get_avg_word_length(text):\n", " \"\"\"\n", " Returns the average word length of the given text.\n", "\n", " Parameters:\n", " text -- (str)\n", " \"\"\"\n", " words = text.split()\n", " return sum(len(word) for word in words) / len(words)\n", "\n", "\n", "def has_emoji(text):\n", " \"\"\"\n", " Returns the average word length of the given text.\n", "\n", " Parameters:\n", " text -- (str)\n", " \"\"\"\n", " doc = nlp(text)\n", " return 1 if doc._.has_emoji else 0" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt_tab to\n", "[nltk_data] /Users/mathias/nltk_data...\n", "[nltk_data] Package punkt_tab is already up-to-date!\n" ] }, { "data": { "text/plain": [ "True" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import nltk\n", "nltk.download('punkt_tab')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
UserNameScreenNameLocationTweetAtOriginalTweetSentimentn_wordsvader_sentimentrel_char_lenaverage_word_lengthall_capshas_emoji
1927192846880Seattle, WA13-03-2020While I don't like all of Amazon's choices, to...Positive31-0.10530.5892865.64000000
1068106946021NaN13-03-2020Me: shit buckets, it’s time to do the weekly s...Negative52-0.25000.9321434.63636400
80380445756The Outer Limits12-03-2020@SecPompeo @realDonaldTrump You mean the plan ...Neutral440.00000.9107146.74193500
2846284747799Flagstaff, AZ15-03-2020@lauvagrande People who are sick aren’t panic ...Extremely Negative46-0.84810.9071435.02381000
3768376948721Montreal, Canada16-03-2020Coronavirus Panic: Toilet Paper Is the “People...Negative21-0.51060.5000009.84615400
\n", "
" ], "text/plain": [ " UserName ScreenName Location TweetAt \\\n", "1927 1928 46880 Seattle, WA 13-03-2020 \n", "1068 1069 46021 NaN 13-03-2020 \n", "803 804 45756 The Outer Limits 12-03-2020 \n", "2846 2847 47799 Flagstaff, AZ 15-03-2020 \n", "3768 3769 48721 Montreal, Canada 16-03-2020 \n", "\n", " OriginalTweet Sentiment \\\n", "1927 While I don't like all of Amazon's choices, to... Positive \n", "1068 Me: shit buckets, it’s time to do the weekly s... Negative \n", "803 @SecPompeo @realDonaldTrump You mean the plan ... Neutral \n", "2846 @lauvagrande People who are sick aren’t panic ... Extremely Negative \n", "3768 Coronavirus Panic: Toilet Paper Is the “People... Negative \n", "\n", " n_words vader_sentiment rel_char_len average_word_length all_caps \\\n", "1927 31 -0.1053 0.589286 5.640000 0 \n", "1068 52 -0.2500 0.932143 4.636364 0 \n", "803 44 0.0000 0.910714 6.741935 0 \n", "2846 46 -0.8481 0.907143 5.023810 0 \n", "3768 21 -0.5106 0.500000 9.846154 0 \n", "\n", " has_emoji \n", "1927 0 \n", "1068 0 \n", "803 0 \n", "2846 0 \n", "3768 0 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df = train_df.assign(n_words=train_df[\"OriginalTweet\"].apply(get_length_in_words))\n", "train_df = train_df.assign(vader_sentiment=train_df[\"OriginalTweet\"].apply(get_sentiment))\n", "train_df = train_df.assign(rel_char_len=train_df[\"OriginalTweet\"].apply(get_relative_length))\n", "\n", "test_df = test_df.assign(n_words=test_df[\"OriginalTweet\"].apply(get_length_in_words))\n", "test_df = test_df.assign(vader_sentiment=test_df[\"OriginalTweet\"].apply(get_sentiment))\n", "test_df = test_df.assign(rel_char_len=test_df[\"OriginalTweet\"].apply(get_relative_length))\n", "\n", "\n", "train_df = train_df.assign(\n", " average_word_length=train_df[\"OriginalTweet\"].apply(get_avg_word_length)\n", ")\n", "test_df = test_df.assign(average_word_length=test_df[\"OriginalTweet\"].apply(get_avg_word_length))\n", "\n", "# whether all letters are uppercase or not (all_caps)\n", "train_df = train_df.assign(\n", " all_caps=train_df[\"OriginalTweet\"].apply(lambda x: 1 if x.isupper() else 0)\n", ")\n", "test_df = test_df.assign(\n", " all_caps=test_df[\"OriginalTweet\"].apply(lambda x: 1 if x.isupper() else 0)\n", ")\n", "\n", "train_df = train_df.assign(has_emoji=train_df[\"OriginalTweet\"].apply(has_emoji))\n", "test_df = test_df.assign(has_emoji=test_df[\"OriginalTweet\"].apply(has_emoji))\n", "\n", "train_df.head()" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(3038, 12)" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_df.shape" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "np.int64(0)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "(train_df['all_caps'] == 1).sum()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "X_train = train_df.drop(columns=['Sentiment'])" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "numeric_features = ['vader_sentiment', \n", " 'rel_char_len', \n", " 'average_word_length']\n", "passthrough_features = ['all_caps', 'has_emoji'] \n", "text_feature = 'OriginalTweet'\n", "drop_features = ['UserName', 'ScreenName', 'Location', 'TweetAt']" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], 
"source": [ "preprocessor = make_column_transformer(\n", " (StandardScaler(), numeric_features),\n", " (\"passthrough\", passthrough_features), \n", " (CountVectorizer(stop_words='english'), text_feature),\n", " (\"drop\", drop_features)\n", ")" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fit_timescore_timetest_scoretrain_score
dummy0.001 (+/- 0.000)0.000 (+/- 0.000)0.280 (+/- 0.001)0.280 (+/- 0.000)
logistic regression0.306 (+/- 0.015)0.008 (+/- 0.000)0.414 (+/- 0.012)0.999 (+/- 0.000)
LR (more feats)0.296 (+/- 0.021)0.010 (+/- 0.001)0.690 (+/- 0.007)0.998 (+/- 0.001)
\n", "
" ], "text/plain": [ " fit_time score_time test_score \\\n", "dummy 0.001 (+/- 0.000) 0.000 (+/- 0.000) 0.280 (+/- 0.001) \n", "logistic regression 0.306 (+/- 0.015) 0.008 (+/- 0.000) 0.414 (+/- 0.012) \n", "LR (more feats) 0.296 (+/- 0.021) 0.010 (+/- 0.001) 0.690 (+/- 0.007) \n", "\n", " train_score \n", "dummy 0.280 (+/- 0.000) \n", "logistic regression 0.999 (+/- 0.000) \n", "LR (more feats) 0.998 (+/- 0.001) " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))\n", "results[\"LR (more feats)\"] = mean_std_cross_val_scores(\n", " pipe, X_train, y_train, return_train_score=True, scoring=scoring_metrics\n", ")\n", "pd.DataFrame(results).T" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ "Pipeline(steps=[('columntransformer',\n", " ColumnTransformer(transformers=[('standardscaler',\n", " StandardScaler(),\n", " ['vader_sentiment',\n", " 'rel_char_len',\n", " 'average_word_length']),\n", " ('passthrough', 'passthrough',\n", " ['all_caps', 'has_emoji']),\n", " ('countvectorizer',\n", " CountVectorizer(stop_words='english'),\n", " 'OriginalTweet'),\n", " ('drop', 'drop',\n", " ['UserName', 'ScreenName',\n", " 'Location', 'TweetAt'])])),\n", " ('logisticregression', LogisticRegression(max_iter=1000))])" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "cv_feats = pipe.named_steps['columntransformer'].named_transformers_['countvectorizer'].get_feature_names_out().tolist()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "feat_names = numeric_features + passthrough_features + cv_feats" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "coefs = pipe.named_steps['logisticregression'].coef_[0]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featurescoefficients
0vader_sentiment-6.167241
11331won-1.384111
2551coronapocalypse-0.817034
2214closed-0.754165
8661retail-0.729109
.........
9862stupid1.157157
3299don1.162007
4879hell1.312696
3129die1.365420
7504panic1.539459
\n", "

11664 rows × 2 columns

\n", "
" ], "text/plain": [ " features coefficients\n", "0 vader_sentiment -6.167241\n", "11331 won -1.384111\n", "2551 coronapocalypse -0.817034\n", "2214 closed -0.754165\n", "8661 retail -0.729109\n", "... ... ...\n", "9862 stupid 1.157157\n", "3299 don 1.162007\n", "4879 hell 1.312696\n", "3129 die 1.365420\n", "7504 panic 1.539459\n", "\n", "[11664 rows x 2 columns]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(\n", " data={\n", " \"features\": feat_names,\n", " \"coefficients\": coefs,\n", " }\n", ")\n", "df.sort_values('coefficients')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We get some improvements with our engineered features! " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "



" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python [conda env:cpsc330]", "language": "python", "name": "conda-env-cpsc330-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.8" }, "vscode": { "interpreter": { "hash": "f821000d0c0da66e5bcde88c37d59c8e0de03b40667fb62009a8148ca49465a0" } } }, "nbformat": 4, "nbformat_minor": 4 }