{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Week12_Assignment.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "5vfSwXmsxAMo", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "fe68b61c-5f6e-48ff-c524-3026dc54d539" }, "source": [ "from google.colab import drive\n", "drive.mount('/data/')\n", "data_dir = '/data/My Drive/Colab Notebooks/Experiment'\n", "!ls '/data/My Drive/Colab Notebooks/Experiment'\n", "!pip install matplotlib" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Mounted at /data/\n", "diamonds.csv m_data.csv TSLA.csv\t w_data.csv\n", "Iris.csv news_data.csv USvideos.csv\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dkvvea3LO_S1", "outputId": "afa6c061-5ef1-481c-b689-8df472afee7d" }, "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(data_dir + '/USvideos.csv')\n", "print(df.shape)" ], "execution_count": null, "outputs": [ { 
"output_type": "stream", "text": [ "(40949, 16)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "of76mCxFPDFq" }, "source": [ " df['text'] = df['title']+df['tags']+df['channel_title']" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DJ7HlXeoWkqp", "outputId": "97b83fdb-a8dc-47b9-b853-8a1cdd30ad89" }, "source": [ "import nltk\n", "from nltk.stem import *\n", "nltk.download('punkt')\n", "from nltk.tokenize import RegexpTokenizer\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "tqdm.pandas()\n", "from functools import reduce\n", "import re" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Unzipping tokenizers/punkt.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ro6ryJpWXmTd", "outputId": "b58c0543-c9ce-4f33-9894-c33e00987568" }, "source": [ "import nltk\n", "nltk.download('stopwords')\n", "\n", "from nltk.corpus import stopwords\n", "stopwords.words('english')\n", "\n", "en_stops = set(stopwords.words('english'))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "-Ual81F5Xivs", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cc04fdb9-a4f1-4625-e5f9-b013de0e6388" }, "source": [ "df.isnull().sum()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "video_id 0\n", "trending_date 0\n", "title 0\n", "channel_title 0\n", "category_id 0\n", "publish_time 0\n", "tags 0\n", "views 0\n", "likes 0\n", "dislikes 0\n", "comment_count 0\n", "thumbnail_link 0\n", "comments_disabled 
0\n", "ratings_disabled 0\n", "video_error_or_removed 0\n", "description 570\n", "text 0\n", "dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bPCTgXhoc7wJ", "outputId": "7f2f55bb-0e3f-499f-f5d4-2e97e39925c4" }, "source": [ "from nltk.corpus import stopwords\n", "nltk.download('wordnet') \n", "\n", "from nltk.stem import WordNetLemmatizer\n", "wordnet_lemmatizer = WordNetLemmatizer()\n", "\n", "def process(line):\n", " return([wordnet_lemmatizer.lemmatize(t) for t in tokenizer.tokenize(line) if t not in en_stops])\n", " \n", "\n", "tokenizer = RegexpTokenizer(r'\\w+')\n", "df['tokens']=df['text'].str.lower().apply(process)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/wordnet.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "q049P2LVdHUR", "outputId": "4606b62d-5617-474e-fbe7-38784b9a5650" }, "source": [ "df.info()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "\n", "RangeIndex: 40949 entries, 0 to 40948\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 video_id 40949 non-null object\n", " 1 trending_date 40949 non-null object\n", " 2 title 40949 non-null object\n", " 3 channel_title 40949 non-null object\n", " 4 category_id 40949 non-null int64 \n", " 5 publish_time 40949 non-null object\n", " 6 tags 40949 non-null object\n", " 7 views 40949 non-null int64 \n", " 8 likes 40949 non-null int64 \n", " 9 dislikes 40949 non-null int64 \n", " 10 comment_count 40949 non-null int64 \n", " 11 thumbnail_link 40949 non-null object\n", " 12 comments_disabled 40949 non-null bool \n", " 13 ratings_disabled 40949 non-null bool \n", " 14 
video_error_or_removed 40949 non-null bool \n", " 15 description 40379 non-null object\n", " 16 text 40949 non-null object\n", " 17 tokens 40949 non-null object\n", "dtypes: bool(3), int64(5), object(10)\n", "memory usage: 4.8+ MB\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "bzpR0BsSdS9Q", "colab": { "base_uri": "https://localhost:8080/", "height": 439 }, "outputId": "790275cc-8928-4796-e1f0-2462e0eaa660" }, "source": [ "from nltk import FreqDist\n", "\n", "# Build one {term: count} record per video and construct the frame in a single call.\n", "# Growing a frame with DataFrame.append inside a loop re-copies it on every pass\n", "# (quadratic), and DataFrame.append was removed in pandas 2.0.\n", "# Keys are sorted so that the terms a video introduces appear in alphabetical order.\n", "term_counts = [dict(sorted(FreqDist(tokens).items())) for tokens in df.head(250)['tokens']]\n", "\n", "# Rows are videos, columns are terms; NaN means the term does not occur in that video.\n", "vectors = pd.DataFrame(term_counts)\n", "\n", "vectors" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
marriageshantellmartincaseyneistattalkwantdonaldhbojohnlastlastweektonightoliverpresidencytonighttrumpweekalessoanittaanwarbachbearblackbrazildrivergettinghannahstockinginannaiphonekingleleleleponslicenselovemancusomusicofficialpineappleponsponsracistpooracistrudy...sacredbezosceocommunityfestivalgatheringgrowingla17rarerichestsuccesssummitsummit17taylorreputationswifttargettaylorswifttaylurkingcantdunkfestfuckinguardreedsayingwillie100breathcanbreathholdholdingwatchcutearlgreylinecupcakesmacaronsmakridespastrypipescranscranlinevanilla
01.01.01.01.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaNNaNNaN2.01.02.03.01.02.02.03.05.03.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.01.01.02.01.01.01.01.01.01.01.01.02.02.01.01.01.04.01.01.01.01.01.01.02.04.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2.0NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
......................................................................................................................................................................................................................................................
245NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.0NaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
246NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.03.01.02.02.01.0NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
247NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
248NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.01.01.03.01.01.01.03.01.02.0
249NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

250 rows × 3135 columns

\n", "
" ], "text/plain": [ " marriageshantell martincaseyneistat talk ... scran scranline vanilla\n", "0 1.0 1.0 1.0 ... NaN NaN NaN\n", "1 NaN NaN NaN ... NaN NaN NaN\n", "2 NaN NaN NaN ... NaN NaN NaN\n", "3 NaN NaN NaN ... NaN NaN NaN\n", "4 NaN NaN NaN ... NaN NaN NaN\n", ".. ... ... ... ... ... ... ...\n", "245 NaN NaN NaN ... NaN NaN NaN\n", "246 NaN NaN NaN ... NaN NaN NaN\n", "247 NaN NaN NaN ... NaN NaN NaN\n", "248 NaN NaN NaN ... 3.0 1.0 2.0\n", "249 NaN NaN NaN ... NaN NaN NaN\n", "\n", "[250 rows x 3135 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qv97TQyi_lDu", "outputId": "8ee47fb9-82a5-4d03-bd52-8d1f3a6553ee" }, "source": [ "vectors.count()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "marriageshantell 2\n", "martincaseyneistat 2\n", "talk 19\n", "want 5\n", "donald 3\n", " ..\n", "pastry 1\n", "pipe 1\n", "scran 1\n", "scranline 1\n", "vanilla 1\n", "Length: 3135, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "code", "metadata": { "id": "kcxxEtnj-5uC" }, "source": [ "from math import log\n", "\n", "log_tf = pd.DataFrame()\n", "for c in vectors.columns:\n", " log_tf[c]=vectors[c].apply(lambda x: 1+log(x) if x>0 else 0)\n", "\n", "count = vectors.count().iloc[0]\n", "data = pd.DataFrame()\n", "for c in vectors.columns:\n", " data[c]=vectors[vectors[c]>0].count().apply(lambda x: 1+log(count/x) if x>0 else 0)\n", "data = data.iloc[0]" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "id": "LkwytAkt_5h9", "outputId": "03e4aef2-3c31-4d3a-b7a9-0360cb8f9445" }, "source": [ "data.hist()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 
15 }, { "output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAS9UlEQVR4nO3dcaxe9X3f8fcnOKQsZIGU9soz3sxUR6tTVMKuCFWm7VJWMFSKU62LjGhxUjRXLUzthqaR7g+yMKREmxMpiNI6wopT0TisbWYr9YZcyhXKNBNMQzGGUW7BKfZcvMbE7Q0rm7Pv/niO0yfevb7Pvc9zn5vb3/slXd1zfud3zvl97zWf59zfOc9DqgpJUhvestIDkCSNj6EvSQ0x9CWpIYa+JDXE0JekhqxZ6QGcz2WXXVYbNmxY8v7f+ta3ePvb3z66Aa0CrdXcWr1gza0Ypuann376z6rqB+ba9j0d+hs2bODQoUNL3n96epqpqanRDWgVaK3m1uoFa27FMDUn+fp825zekaSGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0JekhnxPvyN3WIePn+bDd//u2M979BM/OfZzStIgvNKXpIYY+pLUEENfkhpi6EtSQxYM/STfl+SrSf4wyZEk/7ZrvyLJk0lmknwxyYVd+9u69Zlu+4a+Y320a38xyY3LVZQkaW6DXOm/Cfx4Vf0ocBWwOcm1wCeBT1fVDwGvA7d3/W8HXu/aP931I8kmYCvwHmAz8KtJLhhlMZKk81sw9Ktntlt9a/dVwI8Dv9W17wY+2C1v6dbptl+fJF37nqp6s6peAWaAa0ZShSRpIAM9p99dkT8N/BDwAPDHwDer6kzX5RiwrlteB7wKUFVnkpwGvr9rP9h32P59+s+1HdgOMDExwfT09OIq6jNxEdx15ZmFO47YMGMe1uzs7Iqef9xaqxesuRXLVfNAoV9V3wauSnIJ8CXg7418JH91rp3AToDJycka5n+Rdv/De9lxePzvPzt669TYz3lWa/9budbqBWtuxXLVvKind6rqm8DjwI8BlyQ5m6iXA8e75ePAeoBu+zuBb/S3z7GPJGkMBnl65we6K3ySXAT8BPACvfD/6a7bNmBvt7yvW6fb/vtVVV371u7pniuAjcBXR1WIJGlhg8x9rAV2d/P6bwEeqaovJ3ke2JPk3wFfAx7q+j8E/EaSGeAUvSd2qKojSR4BngfOAHd000aSpDFZMPSr6lngvXO0v8wcT99U1V8C/3SeY90H3Lf4YUqSRsF35EpSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhqyYOgnWZ/k8STPJzmS5Je69o8lOZ7kme7r5r59PppkJsmLSW7sa9/ctc0kuXt5SpIkzWfNAH3OAHdV1R8keQfwdJID3bZPV9V/6O+cZBOwFXgP8LeA30vy7m7zA8BPAMeAp5Lsq6rnR1GIJGlhC4Z+VZ0ATnTLf5HkBWDdeXbZAuypqjeBV5LMANd022aq6mWAJHu6voa+JI3JIFf635FkA/Be4Eng/cCdSW4DDtH7a+B1ei8IB/t2O8ZfvUi8ek77++Y4x3ZgO8DExATT09OLGeJ3mbgI7rryzJL3X6phxjys2dnZFT3/uLVWL1hzK5ar5oFDP8nFwG8Dv1xVf57kQeBeoLrvO4CfG3ZAVbUT2AkwOTlZU1NTSz7W/Q/vZcfhRb2ujcTRW6fGfs6zpqenGeZnttq0Vi9YcyuWq+aBEjHJW+kF/sNV9TsAVfVa3/bPAl/uVo8D6/t2v7xr4zztkqQxGOTpnQAPA
S9U1af62tf2dfsp4LlueR+wNcnbklwBbAS+CjwFbExyRZIL6d3s3TeaMiRJgxjkSv/9wM8Ch5M807X9CnBLkqvoTe8cBX4eoKqOJHmE3g3aM8AdVfVtgCR3Ao8CFwC7qurICGuRJC1gkKd3vgJkjk37z7PPfcB9c7TvP99+kqTl5TtyJakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDVkw9JOsT/J4kueTHEnyS137u5IcSPJS9/3Srj1JPpNkJsmzSa7uO9a2rv9LSbYtX1mSpLkMcqV/BrirqjYB1wJ3JNkE3A08VlUbgce6dYCbgI3d13bgQei9SAD3AO8DrgHuOftCIUkajwVDv6pOVNUfdMt/AbwArAO2ALu7bruBD3bLW4DPV89B4JIka4EbgQNVdaqqXgcOAJtHWo0k6bwWNaefZAPwXuBJYKKqTnSb/hSY6JbXAa/27Xasa5uvXZI0JmsG7ZjkYuC3gV+uqj9P8p1tVVVJahQDSrKd3rQQExMTTE9PL/lYExfBXVeeGcWwFmWYMQ9rdnZ2Rc8/bq3VC9bciuWqeaDQT/JWeoH/cFX9Ttf8WpK1VXWim7452bUfB9b37X5513YcmDqnffrcc1XVTmAnwOTkZE1NTZ3bZWD3P7yXHYcHfl0bmaO3To39nGdNT08zzM9stWmtXrDmVixXzYM8vRPgIeCFqvpU36Z9wNkncLYBe/vab+ue4rkWON1NAz0K3JDk0u4G7g1dmyRpTAa5DH4/8LPA4STPdG2/AnwCeCTJ7cDXgQ912/YDNwMzwBvARwCq6lSSe4Gnun4fr6pTI6lCkjSQBUO/qr4CZJ7N18/Rv4A75jnWLmDXYgYoSRod35ErSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyIKhn2RXkpNJnutr+1iS40me6b5u7tv20SQzSV5McmNf++aubSbJ3aMvRZK0kEGu9D8HbJ6j/dNVdVX3tR8gySZgK/Cebp9fTXJBkguAB4CbgE3ALV1fSdIYrVmoQ1U9kWTDgMfbAuypqjeBV5LMANd022aq6mWAJHu6vs8vesSSpCUbZk7/ziTPdtM/l3Zt64BX+/oc69rma5ckjdGCV/rzeBC4F6ju+w7g50YxoCTbge0AExMTTE9PL/lYExfBXVeeGcWwFmWYMQ9rdnZ2Rc8/bq3VC9bciuWqeUmhX1WvnV1O8lngy93qcWB9X9fLuzbO037usXcCOwEmJydrampqKUME4P6H97Lj8FJf15bu6K1TYz/nWdPT0wzzM1ttWqsXrLkVy1XzkqZ3kqztW/0p4OyTPfuArUneluQKYCPwVeApYGOSK5JcSO9m776lD1uStBQLXgYn+QIwBVyW5BhwDzCV5Cp60ztHgZ8HqKojSR6hd4P2DHBHVX27O86dwKPABcCuqjoy8mokSec1yNM7t8zR/NB5+t8H3DdH+35g/6JGJ0kaKd+RK0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JasiCoZ9kV5KTSZ7ra3tXkgNJXuq+X9q1J8lnkswkeTbJ1X37bOv6v5Rk2/KUI0k6n0Gu9D8HbD6n7W7gsaraCDzWrQPcBGzsvrYDD0LvRQK4B3gfcA1wz9kXCknS+CwY+lX1BHDqnOYtwO5ueTfwwb72z1fPQeCSJGuBG4EDV
XWqql4HDvD/v5BIkpbZUuf0J6rqRLf8p8BEt7wOeLWv37Gubb52SdIYrRn2AFVVSWoUgwFIsp3e1BATExNMT08v+VgTF8FdV54Z0cgGN8yYhzU7O7ui5x+31uoFa27FctW81NB/LcnaqjrRTd+c7NqPA+v7+l3etR0Hps5pn57rwFW1E9gJMDk5WVNTU3N1G8j9D+9lx+GhX9cW7eitU2M/51nT09MM8zNbbVqrF6y5FctV81Knd/YBZ5/A2Qbs7Wu/rXuK51rgdDcN9ChwQ5JLuxu4N3RtkqQxWvAyOMkX6F2lX5bkGL2ncD4BPJLkduDrwIe67vuBm4EZ4A3gIwBVdSrJvcBTXb+PV9W5N4clSctswdCvqlvm2XT9HH0LuGOe4+wCdi1qdJKkkfIduZLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUkKFCP8nRJIeTPJPkUNf2riQHkrzUfb+0a0+SzySZSfJskqtHUYAkaXCjuNK/rqquqqrJbv1u4LGq2gg81q0D3ARs7L62Aw+O4NySpEVYjumdLcDubnk38MG+9s9Xz0HgkiRrl+H8kqR5pKqWvnPyCvA6UMCvV9XOJN+sqku67QFer6pLknwZ+ERVfaXb9hjwr6vq0DnH3E7vLwEmJib+/p49e5Y8vpOnTvPa/1ry7kt25bp3jv+kndnZWS6++OIVO/+4tVYvWHMrhqn5uuuue7pv9uW7rBlqVPAPqup4kh8EDiT57/0bq6qSLOpVpap2AjsBJicna2pqasmDu//hvew4PGyJi3f01qmxn/Os6elphvmZrTat1QvW3Irlqnmo6Z2qOt59Pwl8CbgGeO3stE33/WTX/Tiwvm/3y7s2SdKYLDn0k7w9yTvOLgM3AM8B+4BtXbdtwN5ueR9wW/cUz7XA6ao6seSRS5IWbZi5jwngS71pe9YAv1lV/yXJU8AjSW4Hvg58qOu/H7gZmAHeAD4yxLklSUuw5NCvqpeBH52j/RvA9XO0F3DHUs8nSRqe78iVpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1ZOyhn2RzkheTzCS5e9znl6SWjTX0k1wAPADcBGwCbkmyaZxjkKSWjftK/xpgpqperqr/DewBtox5DJLUrDVjPt864NW+9WPA+/o7JNkObO9WZ5O8OMT5LgP+bIj9lySfHPcZv8uK1LyCWqsXrLkVw9T8d+bbMO7QX1BV7QR2juJYSQ5V1eQojrVatFZza/WCNbdiuWoe9/TOcWB93/rlXZskaQzGHfpPARuTXJHkQmArsG/MY5CkZo11eqeqziS5E3gUuADYVVVHlvGUI5kmWmVaq7m1esGaW7EsNaeqluO4kqTvQb4jV5IaYuhLUkNWfegv9LEOSd6W5Ivd9ieTbBj/KEdrgJr/ZZLnkzyb5LEk8z6zu1oM+vEdSf5Jkkqy6h/vG6TmJB/qftdHkvzmuMc4agP82/7bSR5P8rXu3/fNKzHOUUmyK8nJJM/Nsz1JPtP9PJ5NcvXQJ62qVftF72bwHwN/F7gQ+ENg0zl9fhH4tW55K/DFlR73GGq+Dvgb3fIvtFBz1+8dwBPAQWBypcc9ht/zRuBrwKXd+g+u9LjHUPNO4Be65U3A0ZUe95A1/0PgauC5ebbfDPxnIMC1wJPDnnO1X+kP8rEOW4Dd3fJvAdcnyRjHOGoL1lxVj1fVG93qQXrvh1jNBv34jnuBTwJ/Oc7BLZNBav5nwANV9TpAVZ0c8xhHbZCaC/ib3fI7gf8xxvGNXFU9AZw6T
5ctwOer5yBwSZK1w5xztYf+XB/rsG6+PlV1BjgNfP9YRrc8Bqm53+30rhRWswVr7v7sXV9VvzvOgS2jQX7P7wbeneS/JjmYZPPYRrc8Bqn5Y8DPJDkG7Af++XiGtmIW+9/7gr7nPoZBo5PkZ4BJ4B+t9FiWU5K3AJ8CPrzCQxm3NfSmeKbo/TX3RJIrq+qbKzqq5XUL8Lmq2pHkx4DfSPIjVfV/V3pgq8Vqv9If5GMdvtMnyRp6fxJ+YyyjWx4DfZRFkn8M/BvgA1X15pjGtlwWqvkdwI8A00mO0pv73LfKb+YO8ns+Buyrqv9TVa8Af0TvRWC1GqTm24FHAKrqvwHfR++Dyf66GvlH16z20B/kYx32Adu65Z8Gfr+6OySr1II1J3kv8Ov0An+1z/PCAjVX1emquqyqNlTVBnr3MT5QVYdWZrgjMci/7f9E7yqfJJfRm+55eZyDHLFBav4T4HqAJD9ML/T/51hHOV77gNu6p3iuBU5X1YlhDriqp3dqno91SPJx4FBV7QMeovcn4Ay9GyZbV27Ewxuw5n8PXAz8x+6e9Z9U1QdWbNBDGrDmv1YGrPlR4IYkzwPfBv5VVa3av2IHrPku4LNJ/gW9m7ofXs0XcUm+QO+F+7LuPsU9wFsBqurX6N23uBmYAd4APjL0OVfxz0uStEirfXpHkrQIhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyP8Dvlr/cjC7XHoAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "id": "-x92m5SxAtyi", "outputId": "9917e1eb-5855-4b00-d703-dda07267e1d5" }, "source": [ "tf_count = pd.DataFrame()\n", "for c in vectors.columns:\n", " tf_count=vectors[vectors[c]>0].sum()\n", "tf_count.hist()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 16 }, { "output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAR3klEQVR4nO3dcYxd5Xnn8e+vQFJE0oWU7MhrWzVSvbsiRSXRiLBKtZpNFDCkKlTajUBs4qZI7h8gJVpLK9J/aJsipdKSrIJSJHexSna9oahJZCtBpV7KVRRpCeCUYAylTIkjbBGsFkIyiTYrp0//mHfSWzr2zNx750487/cjXd1zn/Oec95nEL97OPfcS6oKSVIffmajJyBJmh5DX5I6YuhLUkcMfUnqiKEvSR05f6MncDaXXnpp7dixY+Ttf/CDH3DRRRdNbkLngN567q1fsOdejNPzkSNH/raq3r7cup/q0N+xYwdPPvnkyNsPBgPm5uYmN6FzQG8999Yv2HMvxuk5ybfPtM7LO5LUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JGf6m/kjuvoydf5jTu+MvXjHv/kB6Z+TElaDc/0Jakjhr4kdcTQl6SOrBj6SX42yeNJvpnkWJLfbfXLknw9yXySP0nyplZ/c3s939bvGNrXx1v9+STXrldTkqTlreZM/0fAe6vql4ErgV1Jrgb+APh0Vf0i8Bpwaxt/K/Baq3+6jSPJ5cBNwDuAXcAfJjlvks1Iks5uxdCvRQvt5QXtUcB7gT9t9fuBG9vyDe01bf37kqTVH6iqH1XVt4B54KqJdCFJWpVV3bLZzsiPAL8IfBb4G+C7VXW6DTkBbG3LW4GXAKrqdJLXgZ9v9ceGdju8zfCx9gB7AGZmZhgMBmvraMjMhbD3itMrD5ywceY8roWFhQ09/rT11i/Ycy/Wq+dVhX5V/Ri4MsnFwJeAfzvxmfzjsfYB+wBmZ2drnP9bzj0HDnL30el/FeH4LXNTP+aS3v4PQ731C/bci/XqeU1371TVd4FHgX8HXJxkKVG3ASfb8klgO0Bb/y+AvxuuL7ONJGkKVnP3ztvbGT5JLgTeDzzHYvj/xzZsN3CwLR9qr2nr/6KqqtVvanf3XAbsBB6fVCOSpJWt5trHFuD+dl3/Z4AHq+rLSZ4FHkjy+8BfAve18fcB/zPJPPAqi3fsUFXHkjwIPAucBm5rl40kSVOyYuhX1dPAO5epv8gyd99U1f8D/tMZ9nUXcNfapylJmgS/kStJHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oi
hL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SerIiqGfZHuSR5M8m+RYko+2+u8kOZnkqfa4fmibjyeZT/J8kmuH6rtabT7JHevTkiTpTM5fxZjTwN6q+kaStwJHkhxu6z5dVf9teHCSy4GbgHcA/wr4P0n+dVv9WeD9wAngiSSHqurZSTQiSVrZiqFfVS8DL7fl7yd5Dth6lk1uAB6oqh8B30oyD1zV1s1X1YsASR5oYw19SZqSNV3TT7IDeCfw9Va6PcnTSfYnuaTVtgIvDW12otXOVJckTclqLu8AkOQtwBeAj1XV95LcC3wCqPZ8N/Cb404oyR5gD8DMzAyDwWDkfc1cCHuvOD3ulNZsnDmPa2FhYUOPP2299Qv23Iv16nlVoZ/kAhYD/0BVfRGgql4ZWv9HwJfby5PA9qHNt7UaZ6n/RFXtA/YBzM7O1tzc3GqmuKx7Dhzk7qOrfl+bmOO3zE39mEsGgwHj/M3ONb31C/bci/XqeTV37wS4D3iuqj41VN8yNOzXgWfa8iHgpiRvTnIZsBN4HHgC2JnksiRvYvHD3kOTaUOStBqrOQ1+D/Ah4GiSp1rtt4Gbk1zJ4uWd48BvAVTVsSQPsvgB7Wngtqr6MUCS24GHgfOA/VV1bIK9SJJWsJq7d74GZJlVD51lm7uAu5apP3S27SRJ68tv5EpSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR1YM/STbkzya5Nkkx5J8tNXfluRwkhfa8yWtniSfSTKf5Okk7xra1+42/oUku9evLUnSclZzpn8a2FtVlwNXA7cluRy4A3ikqnYCj7TXANcBO9tjD3AvLL5JAHcC7wauAu5ceqOQJE3HiqFfVS9X1Tfa8veB54CtwA3A/W3Y/cCNbfkG4HO16DHg4iRbgGuBw1X1alW9BhwGdk20G0nSWZ2/lsFJdgDvBL4OzFTVy23Vd4CZtrwVeGlosxOtdqb6G4+xh8X/QmBmZobBYLCWKf4TMxfC3itOj7z9qMaZ87gWFhY29PjT1lu/YM+9WK+eVx36Sd4CfAH4WFV9L8lP1lVVJalJTKiq9gH7AGZnZ2tubm7kfd1z4CB3H13T+9pEHL9lburHXDIYDBjnb3au6a1fsOderFfPq7p7J8kFLAb+gar6Yiu/0i7b0J5PtfpJYPvQ5tta7Ux1SdKUrObunQD3Ac9V1aeGVh0Clu7A2Q0cHKp/uN3FczXwersM9DBwTZJL2ge417SaJGlKVnPt4z3Ah4CjSZ5qtd8GPgk8mORW4NvAB9u6h4DrgXngh8BHAKrq1SSfAJ5o436vql6dSBeSpFVZMfSr6mtAzrD6fcuML+C2M+xrP7B/LROUJE2O38iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1ZMXQT7I/yakkzwzVfifJySRPtcf1Q+s+nmQ+yfNJrh2q72q1+SR3TL4VSdJKVnOm/8fArmXqn66qK9vjIYAklwM3Ae9o2/xhkvOSnAd8FrgOuBy4uY2VJE3R+SsNqKqvJtmxyv3dADxQVT8CvpVkHriqrZuvqhcBkjzQxj675hlLkka2Yuifxe1JPgw8CeytqteArcBjQ2NOtBrAS2+ov3u5nSbZA+wBmJmZYTAYjDzBmQth7xWnR95+VOPMeVwLCwsbevxp661fsOderFfPo4b+vcAngGrPdwO/OYkJVdU+YB/A7Oxszc3Njbyvew4c5O6j47yvjeb4LXNTP+aSwWDAOH+zc01v/YI992K9eh4pEav
qlaXlJH8EfLm9PAlsHxq6rdU4S12SNCUj3bKZZMvQy18Hlu7sOQTclOTNSS4DdgKPA08AO5NcluRNLH7Ye2j0aUuSRrHimX6SzwNzwKVJTgB3AnNJrmTx8s5x4LcAqupYkgdZ/ID2NHBbVf247ed24GHgPGB/VR2beDeSpLNazd07Ny9Tvu8s4+8C7lqm/hDw0JpmJ0maKL+RK0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6siKoZ9kf5JTSZ4Zqr0tyeEkL7TnS1o9ST6TZD7J00neNbTN7jb+hSS716cdSdLZrOZM/4+BXW+o3QE8UlU7gUfaa4DrgJ3tsQe4FxbfJIA7gXcDVwF3Lr1RSJKmZ8XQr6qvAq++oXwDcH9bvh+4caj+uVr0GHBxki3AtcDhqnq1ql4DDvPP30gkSevs/BG3m6mql9vyd4CZtrwVeGlo3IlWO1P9n0myh8X/SmBmZobBYDDiFGHmQth7xemRtx/VOHMe18LCwoYef9p66xfsuRfr1fOoof8TVVVJahKTafvbB+wDmJ2drbm5uZH3dc+Bg9x9dOwW1+z4LXNTP+aSwWDAOH+zc01v/YI992K9eh717p1X2mUb2vOpVj8JbB8at63VzlSXJE3RqKF/CFi6A2c3cHCo/uF2F8/VwOvtMtDDwDVJLmkf4F7TapKkKVrx2keSzwNzwKVJTrB4F84ngQeT3Ap8G/hgG/4QcD0wD/wQ+AhAVb2a5BPAE23c71XVGz8cliStsxVDv6puPsOq9y0ztoDbzrCf/cD+Nc1OkjRRfiNXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkbFCP8nxJEeTPJXkyVZ7W5LDSV5oz5e0epJ8Jsl8kqeTvGsSDUiSVm8SZ/r/oaqurKrZ9voO4JGq2gk80l4DXAfsbI89wL0TOLYkaQ3W4/LODcD9bfl+4Mah+udq0WPAxUm2rMPxJUlnMG7oF/DnSY4k2dNqM1X1clv+DjDTlrcCLw1te6LVJElTcv6Y2/9KVZ1M8i+Bw0n+anhlVVWSWssO25vHHoCZmRkGg8HIk5u5EPZecXrk7Uc1zpzHtbCwsKHHn7be+gV77sV69TxW6FfVyfZ8KsmXgKuAV5JsqaqX2+WbU234SWD70ObbWu2N+9wH7AOYnZ2tubm5ked3z4GD3H103Pe1tTt+y9zUj7lkMBgwzt/sXNNbv2DPvVivnke+vJPkoiRvXVoGrgGeAQ4Bu9uw3cDBtnwI+HC7i+dq4PWhy0CSpCkY5zR4BvhSkqX9/O+q+rMkTwAPJrkV+DbwwTb+IeB6YB74IfCRMY4tSRrByKFfVS8Cv7xM/e+A9y1TL+C2UY8nSRqf38iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI5MPfST7EryfJL5JHdM+/iS1LOphn6S84DPAtcBlwM3J7l8mnOQpJ6dP+XjXQXMV9WLAEkeAG4Anp3yPDatoydf5zfu+MrUj3v8kx+Y+jElrd20Q38r8NLQ6xPAu4cHJNkD7GkvF5I8P8bxLgX+doztR5I/mPYR/4neet6QfjeYPfdhnJ5/4Uwrph36K6qqfcC+SewryZNVNTuJfZ0reuu5t37BnnuxXj1P+4Pck8D2odfbWk2SNAXTDv0ngJ1JLkvyJuAm4NC
U5yBJ3Zrq5Z2qOp3kduBh4Dxgf1UdW8dDTuQy0Tmmt5576xfsuRfr0nOqaj32K0n6KeQ3ciWpI4a+JHVkU4Z+bz/1kGR/klNJntnouUxLku1JHk3ybJJjST660XNab0l+NsnjSb7Zev7djZ7TNCQ5L8lfJvnyRs9lWpIcT3I0yVNJnpzovjfbNf32Uw9/DbyfxS9/PQHcXFWb9lu/Sf49sAB8rqp+aaPnMw1JtgBbquobSd4KHAFu3OT/nANcVFULSS4AvgZ8tKoe2+Cprask/wWYBX6uqn51o+czDUmOA7NVNfEvpG3GM/2f/NRDVf1/YOmnHjatqvoq8OpGz2OaqurlqvpGW/4+8ByL3/jetGrRQnt5QXtsrrO2N0iyDfgA8D82ei6bxWYM/eV+6mFTh0HvkuwA3gl8fWNnsv7apY6ngFPA4ara7D3/d+C/An+/0ROZsgL+PMmR9tM0E7MZQ18dSfIW4AvAx6rqexs9n/VWVT+uqitZ/Db7VUk27eW8JL8KnKqqIxs9lw3wK1X1LhZ/kfi2dgl3IjZj6PtTD51o17W/AByoqi9u9Hymqaq+CzwK7Nrouayj9wC/1q5vPwC8N8n/2tgpTUdVnWzPp4AvsXjZeiI2Y+j7Uw8daB9q3gc8V1Wf2uj5TEOStye5uC1fyOLNCn+1sbNaP1X18araVlU7WPz3+C+q6j9v8LTWXZKL2s0JJLkIuAaY2J15my70q+o0sPRTD88BD67zTz1suCSfB/4v8G+SnEhy60bPaQreA3yIxbO/p9rj+o2e1DrbAjya5GkWT24OV1U3tzF2ZAb4WpJvAo8DX6mqP5vUzjfdLZuSpDPbdGf6kqQzM/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR/4B7YmFZUWg8TsAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "id": "fBtA8AL0A7X_", "outputId": "9788036c-dafc-45f1-ac52-cc0b9508b03c" }, "source": [ "tf_weighed = pd.DataFrame()\n", "for c in log_tf.columns:\n", " tf_weighed[c] = log_tf[c]*data[c]\n", "\n", "for c in vectors.columns:\n", " tf_count=tf_weighed[tf_weighed[c]>0].sum()\n", "tf_count.hist()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 18 }, { "output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASkklEQVR4nO3df4xl5X3f8ffHrHGRcQw27oQu2y5SNqpwabAzwkRpm3Gw+VXJEDV1QTSsHaSNFJAchUrFyR+kpkikLaYydaxswirriGZDnFi7smnpZsPIslQwkFDWC6VM8FrsFrOKwSRjGqp1v/1jnrWuyezO3Zk7Z3b2eb+kq3vOc557z/c7l/3cM2fOvaSqkCT14S1rXYAkaTiGviR1xNCXpI4Y+pLUEUNfkjqyYa0LOJHzzjuvNm/evNZlnLTvfve7vP3tb1/rMgZlz32w5/XhySef/Iuqes9i207p0N+8eTNPPPHEWpdx0mZnZ5mZmVnrMgZlz32w5/UhyTePt83TO5LUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JFT+hO50qls/+HX+NjtXx58vwfv/qeD71OnD4/0Jakjhr4kdcTQl6SOGPqS1JElQz/J30rytST/I8mBJP+mjV+Y5LEkc0l+P8mZbfxtbX2ubd888lyfbOPPJblytZqSJC1unCP9N4CfrqofAy4BrkpyGfDrwL1V9SPAq8DNbf7NwKtt/N42jyQXAdcD7wWuAn4jyRmTbEaSdGJLhn4tmG+rb223An4a+EIb3wlc15avbeu07ZcnSRvfVVVvVNU3gDng0ol0IUkay1jn9JOckeQp4AiwF/hz4DtVdbRNOQRsbMsbgRcB2vbXgHePji/yGEnSAMb6cFZVfQ+4JMk5wBeBv79aBSXZBmwDmJqaYnZ2drV2tWrm5+fXZd0r0WPPU2fBbRcfXXrihK3lz7nH1/l06/mkPpFbVd9J8gjwE8A5STa0o/kLgMNt2mFgE3AoyQbgncC3R8aPGX3M6D62A9sBpqena739vylhff4/NVeqx57ve2A39+wf/kPtB2+cGXyfx/T4Op9uPY9z9c572hE+Sc4CPgw8CzwC/GybthXY3Zb3tHXa9j+pqmrj17erey4EtgBfm1QjkqSljXOYcj6ws11p8xbgwar6UpJngF1J/i3wZ8D9bf79wO8mmQNeYeGKHarqQJIHgWeAo8At7bSRJGkgS4Z+VT0NvG+R8RdY5Oqbqvpr4J8f57nuAu46+TIlSZPgJ3IlqSOGviR1xNC
XpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdWTL0k2xK8kiSZ5IcSPKJNv5rSQ4neardrhl5zCeTzCV5LsmVI+NXtbG5JLevTkuSpOPZMMaco8BtVfWnSd4BPJlkb9t2b1X9h9HJSS4CrgfeC/wd4I+T/Gjb/Fngw8Ah4PEke6rqmUk0Ikla2pKhX1UvAS+15b9K8iyw8QQPuRbYVVVvAN9IMgdc2rbNVdULAEl2tbmGviQN5KTO6SfZDLwPeKwN3Zrk6SQ7kpzbxjYCL4487FAbO964JGkg45zeASDJ2cAfAr9UVX+Z5HPAnUC1+3uAn19pQUm2AdsApqammJ2dXelTDm5+fn5d1r0SPfY8dRbcdvHRwfe7lj/nHl/n063nsUI/yVtZCPwHquqPAKrq5ZHtvwV8qa0eBjaNPPyCNsYJxr+vqrYD2wGmp6drZmZmnBJPKbOzs6zHuleix57ve2A39+wf+7hpYg7eODP4Po/p8XU+3Xoe5+qdAPcDz1bVp0fGzx+Z9jPA19vyHuD6JG9LciGwBfga8DiwJcmFSc5k4Y+9eybThiRpHOMcpvwk8HPA/iRPtbFfAW5IcgkLp3cOAr8AUFUHkjzIwh9ojwK3VNX3AJLcCjwMnAHsqKoDE+xFkrSEca7e+SqQRTY9dILH3AXctcj4Qyd6nCRpdfmJXEnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqyJKhn2RTkkeSPJPkQJJPtPF3Jdmb5Pl2f24bT5LPJJlL8nSS948819Y2//kkW1evLUnSYsY50j8K3FZVFwGXAbckuQi4HdhXVVuAfW0d4GpgS7ttAz4HC28SwB3AB4BLgTuOvVFIkoaxZOhX1UtV9adt+a+AZ4GNwLXAzjZtJ3BdW74W+HwteBQ4J8n5wJXA3qp6papeBfYCV020G0nSCW04mclJNgPvAx4DpqrqpbbpW8BUW94IvDjysENt7Hjjb97HNhZ+Q2BqaorZ2dmTKfGUMD8/vy7rXokee546C267+Ojg+13Ln3OPr/Pp1vPYoZ/kbOAPgV+qqr9M8v1tVVVJahIFVdV2YDvA9PR0zczMTOJpBzU7O8t6rHsleuz5vgd2c8/+kzpumoiDN84Mvs9jenydT7eex7p6J8lbWQj8B6rqj9rwy+20De3+SBs/DGwaefgFbex445KkgYxz9U6A+4Fnq+rTI5v2AMeuwNkK7B4Zv6ldxXMZ8Fo7DfQwcEWSc9sfcK9oY5KkgYzzu+lPAj8H7E/yVBv7FeBu4MEkNwPfBD7atj0EXAPMAa8DHweoqleS3Ak83uZ9qqpemUgXkqSxLBn6VfVVIMfZfPki8wu45TjPtQPYcTIFSpImx0/kSlJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOrJk6CfZkeRIkq+PjP1aksNJnmq3a0a2fTLJXJLnklw5Mn5VG5tLcvvkW5EkLWWcI/3fAa5aZPzeqrqk3R4CSHIRcD3w3vaY30hyRpIzgM8CVwMXATe0uZKkAW1YakJVfSXJ5jGf71pgV1W9AXwjyRxwads2V1UvACTZ1eY+c9IVS5KWbcnQP4Fbk9wEPAHcVlWvAhuBR0fmHGpjAC++afwDiz1pkm3ANoCpqSlmZ2dXUOLamJ+fX5d1r0SPPU+dBbddfHTw/a7lz7nH1/l063m
5of854E6g2v09wM9PoqCq2g5sB5ienq6ZmZlJPO2gZmdnWY91r0SPPd/3wG7u2b+S46blOXjjzOD7PKbH1/l063lZ/8VW1cvHlpP8FvCltnoY2DQy9YI2xgnGJUkDWdYlm0nOH1n9GeDYlT17gOuTvC3JhcAW4GvA48CWJBcmOZOFP/buWX7ZkqTlWPJIP8nvATPAeUkOAXcAM0kuYeH0zkHgFwCq6kCSB1n4A+1R4Jaq+l57nluBh4EzgB1VdWDi3UiSTmicq3duWGT4/hPMvwu4a5Hxh4CHTqo6SdJE+YlcSeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR5YM/SQ7khxJ8vWRsXcl2Zvk+XZ/bhtPks8kmUvydJL3jzxma5v/fJKtq9OOJOlExjnS/x3gqjeN3Q7sq6otwL62DnA1sKXdtgGfg4U3CeAO4APApcAdx94oJEnDWTL0q+orwCtvGr4W2NmWdwLXjYx/vhY8CpyT5HzgSmBvVb1SVa8Ce/mbbySSpFW2YZmPm6qql9ryt4CptrwReHFk3qE2drzxvyHJNhZ+S2BqaorZ2dlllrh25ufn12XdK9Fjz1NnwW0XHx18v2v5c+7xdT7del5u6H9fVVWSmkQx7fm2A9sBpqena2ZmZlJPPZjZ2VnWY90r0WPP9z2wm3v2r/if0Ek7eOPM4Ps8psfX+XTreblX77zcTtvQ7o+08cPAppF5F7Sx441Lkga03NDfAxy7AmcrsHtk/KZ2Fc9lwGvtNNDDwBVJzm1/wL2ijUmSBrTk76ZJfg+YAc5LcoiFq3DuBh5McjPwTeCjbfpDwDXAHPA68HGAqnolyZ3A423ep6rqzX8cliStsiVDv6puOM6myxeZW8Atx3meHcCOk6pOkjRRfiJXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkRWFfpKDSfYneSrJE23sXUn2Jnm+3Z/bxpPkM0nmkjyd5P2TaECSNL5JHOl/sKouqarptn47sK+qtgD72jrA1cCWdtsGfG4C+5YknYTVOL1zLbCzLe8ErhsZ/3wteBQ4J8n5q7B/SdJxpKqW/+DkG8CrQAG/WVXbk3ynqs5p2wO8WlXnJPkScHdVfbVt2wf866p64k3PuY2F3wSYmpr68V27di27vrUyPz/P2WefvdZlDKrHno+88hov/5/h93vxxncOv9Omx9d5Pfb8wQ9+8MmRsy8/YMMKn/sfVdXhJH8b2Jvkf45urKpKclLvKlW1HdgOMD09XTMzMysscXizs7Osx7pXosee73tgN/fsX+k/oZN38MaZwfd5TI+v8+nW84pO71TV4XZ/BPgicCnw8rHTNu3+SJt+GNg08vAL2pgkaSDLDv0kb0/yjmPLwBXA14E9wNY2bSuwuy3vAW5qV/FcBrxWVS8tu3JJ0klbye+mU8AXF07bswH4z1X1X5M8DjyY5Gbgm8BH2/yHgGuAOeB14OMr2LckaRmWHfpV9QLwY4uMfxu4fJHxAm5Z7v4kSSvnJ3IlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdGTz0k1yV5Lkkc0luH3r/ktSzQUM/yRnAZ4GrgYuAG5JcNGQNktSzoY/0LwXmquqFqvq/wC7g2oFrkKRubRh4fxuBF0fWDwEfGJ2QZBuwra3OJ3luoNom6Tz
gL9a6iIHZ80Dy60Pv8Qf4Oq8Pf+94G4YO/SVV1XZg+1rXsRJJnqiq6bWuY0j23Ad7Xv+GPr1zGNg0sn5BG5MkDWDo0H8c2JLkwiRnAtcDewauQZK6Nejpnao6muRW4GHgDGBHVR0YsoaBrOvTU8tkz32w53UuVbXWNUiSBuInciWpI4a+JHXE0J+AJO9KsjfJ8+3+3BPM/aEkh5L8pyFrnLRxek5ySZL/nuRAkqeT/Iu1qHWllvrqkCRvS/L7bftjSTYPX+XkjNHvLyd5pr2m+5Ic95rw9WLcr4dJ8s+SVJJ1ewmnoT8ZtwP7qmoLsK+tH8+dwFcGqWp1jdPz68BNVfVe4CrgPyY5Z8AaV2zMrw65GXi1qn4EuBdY249PrcCY/f4ZMF1V/xD4AvDvhq1yssb9epgk7wA+ATw2bIWTZehPxrXAzra8E7husUlJfhyYAv7bQHWtpiV7rqr/VVXPt+X/DRwB3jNYhZMxzleHjP4svgBcniQD1jhJS/ZbVY9U1ett9VEWPm+zno379TB3svCG/tdDFjdphv5kTFXVS235WywE+w9I8hbgHuBfDVnYKlqy51FJLgXOBP58tQubsMW+OmTj8eZU1VHgNeDdg1Q3eeP0O+pm4L+sakWrb8mek7wf2FRVXx6ysNVwyn0Nw6kqyR8DP7zIpl8dXamqSrLYdbC/CDxUVYfWy0HgBHo+9jznA78LbK2q/zfZKrVWkvxLYBr4qbWuZTW1A7ZPAx9b41ImwtAfU1V96Hjbkryc5PyqeqkF3JFFpv0E8I+T/CJwNnBmkvmqOmX/nwIT6JkkPwR8GfjVqnp0lUpdTeN8dcixOYeSbADeCXx7mPImbqyvSknyIRbe/H+qqt4YqLbVslTP7wD+ATDbDth+GNiT5CNV9cRgVU6Ip3cmYw+wtS1vBXa/eUJV3VhVf7eqNrNwiufzp3Lgj2HJnttXbXyRhV6/MGBtkzTOV4eM/ix+FviTWr+felyy3yTvA34T+EhVLfpmv86csOeqeq2qzquqze3f76Ms9L7uAh8M/Um5G/hwkueBD7V1kkwn+e01rWz1jNPzR4F/AnwsyVPtdsnalLs87Rz9sa8OeRZ4sKoOJPlUko+0afcD704yB/wyJ75665Q2Zr//noXfVv+gvabr+vuzxuz5tOHXMEhSRzzSl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI/8fdczOZxEND9UAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 513 }, "id": "OzVHi0peDNFY", "outputId": "921fbade-d784-4bd8-e463-0870d24d7dcc" }, "source": [
 "from collections import Counter\n",
 "\n",
 "\n",
 "def most_frequent(List):\n",
 "    \"\"\"Return the element that occurs most often in `List`.\n",
 "\n",
 "    Ties go to the element that appears first in `List`. An empty `List`\n",
 "    yields None instead of raising IndexError.\n",
 "    \"\"\"\n",
 "    if len(List) == 0:\n",
 "        return None\n",
 "\n",
 "    # Count every element once up front instead of calling List.count()\n",
 "    # per element, which rescans the whole list on each iteration.\n",
 "    counts = Counter(List)\n",
 "\n",
 "    # max() returns the first maximal item it meets; iterating over List\n",
 "    # itself keeps the first-occurrence tie-breaking of the original loop.\n",
 "    return max(List, key=counts.get)\n",
 "\n",
 "\n",
 "df['most_common'] = df['tokens'].apply(most_frequent)\n",
 "\n",
 "df.head()"
 ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
video_idtrending_datetitlechannel_titlecategory_idpublish_timetagsviewslikesdislikescomment_countthumbnail_linkcomments_disabledratings_disabledvideo_error_or_removeddescriptiontexttokensmost_common
02kyS6SvSYSE17.14.11WE WANT TO TALK ABOUT OUR MARRIAGECaseyNeistat222017-11-13T17:13:01.000ZSHANtell martin74837457527296615954https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpgFalseFalseFalseSHANTELL'S CHANNEL - https://www.youtube.com/s...WE WANT TO TALK ABOUT OUR MARRIAGESHANtell mar...[want, talk, marriageshantell, martincaseyneis...want
11ZAPwfrtAFY17.14.11The Trump Presidency: Last Week Tonight with J...LastWeekTonight242017-11-13T07:30:00.000Zlast week tonight trump presidency|\"last week ...241878397185614612703https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpgFalseFalseFalseOne year after the presidential election, John...The Trump Presidency: Last Week Tonight with J...[trump, presidency, last, week, tonight, john,...trump
25qpjK5DgCt417.14.11Racist Superman | Rudy Mancuso, King Bach & Le...Rudy Mancuso232017-11-12T19:05:24.000Zracist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"...319143414603353398181https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpgFalseFalseFalseWATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► http...Racist Superman | Rudy Mancuso, King Bach & Le...[racist, superman, rudy, mancuso, king, bach, ...rudy
3puqaWrEC7tY17.14.11Nickelback Lyrics: Real or Fake?Good Mythical Morning242017-11-13T11:00:04.000Zrhett and link|\"gmm\"|\"good mythical morning\"|\"...343168101726662146https://i.ytimg.com/vi/puqaWrEC7tY/default.jpgFalseFalseFalseToday we find out if Link is a Nickelback amat...Nickelback Lyrics: Real or Fake?rhett and link...[nickelback, lyric, real, fake, rhett, link, g...nickelback
4d380meD0W0M17.14.11I Dare You: GOING BALD!?nigahiga242017-11-12T18:01:41.000Zryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"...2095731132235198917518https://i.ytimg.com/vi/d380meD0W0M/default.jpgFalseFalseFalseI know it's been a while since we did this sho...I Dare You: GOING BALD!?ryan|\"higa\"|\"higatv\"|\"...[dare, going, bald, ryan, higa, higatv, nigahi...dare
\n", "
" ], "text/plain": [ " video_id ... most_common\n", "0 2kyS6SvSYSE ... want\n", "1 1ZAPwfrtAFY ... trump\n", "2 5qpjK5DgCt4 ... rudy\n", "3 puqaWrEC7tY ... nickelback\n", "4 d380meD0W0M ... dare\n", "\n", "[5 rows x 19 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "liU74o_DGYAR", "outputId": "924602e2-575d-4908-d877-6d1bf1d32df9" }, "source": [ "print(df.dtypes)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "video_id object\n", "trending_date object\n", "title object\n", "channel_title object\n", "category_id int64\n", "publish_time object\n", "tags object\n", "views int64\n", "likes int64\n", "dislikes int64\n", "comment_count int64\n", "thumbnail_link object\n", "comments_disabled bool\n", "ratings_disabled bool\n", "video_error_or_removed bool\n", "description object\n", "text object\n", "tokens object\n", "most_common object\n", "dtype: object\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JtvJsnOGGo8H", "outputId": "b06f679a-6816-42b2-872a-504832244000" }, "source": [ "from collections import Counter\n", "one = df['most_common'].to_list()\n", "\n", "rslt = pd.DataFrame(Counter(one).most_common(10),\n", " columns=['Word', 'Frequency']).set_index('Word')\n", "print(rslt)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " Frequency\n", "Word \n", "makeup 732\n", "late 342\n", "cat 316\n", "trailer 257\n", "show 219\n", "news 206\n", "movie 202\n", "react 188\n", "star 188\n", "food 165\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wPL-ckQVHiR1", "outputId": "2c746f52-850d-4206-b558-671a8f3fa178" }, "source": [
 "def popular(List_1):\n",
 "    \"\"\"Return True when the word `List_1` is in the index of `rslt`.\n",
 "\n",
 "    `rslt` is the module-level frame holding the ten most common\n",
 "    `most_common` words, indexed by word.\n",
 "    \"\"\"\n",
 "    # Test membership against the index directly so it is not copied into\n",
 "    # a fresh list on every call (this runs once per row of df).\n",
 "    return List_1 in rslt.index\n",
 "\n",
 "\n",
 "df['popular_word'] = df['most_common'].apply(popular)\n",
 "\n",
 "df['popular_word'].value_counts()"
 ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "False 38134\n", "True 2815\n", "Name: popular_word, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 23 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 513 }, "id": "m4eqJ3kBH6eW", "outputId": "e16bb1f5-180c-48e7-f103-c7b554de2f2e" }, "source": [ "df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
video_idtrending_datetitlechannel_titlecategory_idpublish_timetagsviewslikesdislikescomment_countthumbnail_linkcomments_disabledratings_disabledvideo_error_or_removeddescriptiontexttokensmost_commonpopular_word
02kyS6SvSYSE17.14.11WE WANT TO TALK ABOUT OUR MARRIAGECaseyNeistat222017-11-13T17:13:01.000ZSHANtell martin74837457527296615954https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpgFalseFalseFalseSHANTELL'S CHANNEL - https://www.youtube.com/s...WE WANT TO TALK ABOUT OUR MARRIAGESHANtell mar...[want, talk, marriageshantell, martincaseyneis...wantFalse
11ZAPwfrtAFY17.14.11The Trump Presidency: Last Week Tonight with J...LastWeekTonight242017-11-13T07:30:00.000Zlast week tonight trump presidency|\"last week ...241878397185614612703https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpgFalseFalseFalseOne year after the presidential election, John...The Trump Presidency: Last Week Tonight with J...[trump, presidency, last, week, tonight, john,...trumpFalse
25qpjK5DgCt417.14.11Racist Superman | Rudy Mancuso, King Bach & Le...Rudy Mancuso232017-11-12T19:05:24.000Zracist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"...319143414603353398181https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpgFalseFalseFalseWATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► http...Racist Superman | Rudy Mancuso, King Bach & Le...[racist, superman, rudy, mancuso, king, bach, ...rudyFalse
3puqaWrEC7tY17.14.11Nickelback Lyrics: Real or Fake?Good Mythical Morning242017-11-13T11:00:04.000Zrhett and link|\"gmm\"|\"good mythical morning\"|\"...343168101726662146https://i.ytimg.com/vi/puqaWrEC7tY/default.jpgFalseFalseFalseToday we find out if Link is a Nickelback amat...Nickelback Lyrics: Real or Fake?rhett and link...[nickelback, lyric, real, fake, rhett, link, g...nickelbackFalse
4d380meD0W0M17.14.11I Dare You: GOING BALD!?nigahiga242017-11-12T18:01:41.000Zryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"...2095731132235198917518https://i.ytimg.com/vi/d380meD0W0M/default.jpgFalseFalseFalseI know it's been a while since we did this sho...I Dare You: GOING BALD!?ryan|\"higa\"|\"higatv\"|\"...[dare, going, bald, ryan, higa, higatv, nigahi...dareFalse
\n", "
" ], "text/plain": [ " video_id trending_date ... most_common popular_word\n", "0 2kyS6SvSYSE 17.14.11 ... want False\n", "1 1ZAPwfrtAFY 17.14.11 ... trump False\n", "2 5qpjK5DgCt4 17.14.11 ... rudy False\n", "3 puqaWrEC7tY 17.14.11 ... nickelback False\n", "4 d380meD0W0M 17.14.11 ... dare False\n", "\n", "[5 rows x 20 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 24 } ] }, { "cell_type": "code", "metadata": { "id": "_2jB31r5H-DN" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }