{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Week12_Assignment.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "5vfSwXmsxAMo", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "fe68b61c-5f6e-48ff-c524-3026dc54d539" }, "source": [ "from google.colab import drive\n", "drive.mount('/data/')\n", "data_dir = '/data/My Drive/Colab Notebooks/Experiment'\n", "!ls '/data/My Drive/Colab Notebooks/Experiment'\n", "!pip install matplotlib" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Mounted at /data/\n", "diamonds.csv m_data.csv TSLA.csv\t w_data.csv\n", "Iris.csv news_data.csv USvideos.csv\n", "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)\n", "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)\n", "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dkvvea3LO_S1", "outputId": "afa6c061-5ef1-481c-b689-8df472afee7d" }, "source": [ "import pandas as pd\n", "\n", "df = pd.read_csv(data_dir + '/USvideos.csv')\n", "print(df.shape)" ], "execution_count": null, "outputs": [ { 
"output_type": "stream", "text": [ "(40949, 16)\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "of76mCxFPDFq" }, "source": [ " df['text'] = df['title']+df['tags']+df['channel_title']" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "DJ7HlXeoWkqp", "outputId": "97b83fdb-a8dc-47b9-b853-8a1cdd30ad89" }, "source": [ "import nltk\n", "from nltk.stem import *\n", "nltk.download('punkt')\n", "from nltk.tokenize import RegexpTokenizer\n", "import pandas as pd\n", "from tqdm.notebook import tqdm\n", "tqdm.pandas()\n", "from functools import reduce\n", "import re" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package punkt to /root/nltk_data...\n", "[nltk_data] Unzipping tokenizers/punkt.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ro6ryJpWXmTd", "outputId": "b58c0543-c9ce-4f33-9894-c33e00987568" }, "source": [ "import nltk\n", "nltk.download('stopwords')\n", "\n", "from nltk.corpus import stopwords\n", "stopwords.words('english')\n", "\n", "en_stops = set(stopwords.words('english'))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/stopwords.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "-Ual81F5Xivs", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "cc04fdb9-a4f1-4625-e5f9-b013de0e6388" }, "source": [ "df.isnull().sum()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "video_id 0\n", "trending_date 0\n", "title 0\n", "channel_title 0\n", "category_id 0\n", "publish_time 0\n", "tags 0\n", "views 0\n", "likes 0\n", "dislikes 0\n", "comment_count 0\n", "thumbnail_link 0\n", "comments_disabled 
0\n", "ratings_disabled 0\n", "video_error_or_removed 0\n", "description 570\n", "text 0\n", "dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 6 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bPCTgXhoc7wJ", "outputId": "7f2f55bb-0e3f-499f-f5d4-2e97e39925c4" }, "source": [ "from nltk.corpus import stopwords\n", "nltk.download('wordnet') \n", "\n", "from nltk.stem import WordNetLemmatizer\n", "wordnet_lemmatizer = WordNetLemmatizer()\n", "tokenizer = RegexpTokenizer(r'\\w+')\n", "\n", "def process(line):\n", "    \"\"\"Split `line` into word tokens, drop English stop words, and lemmatize what is left.\"\"\"\n", "    kept = []\n", "    for token in tokenizer.tokenize(line):\n", "        if token in en_stops:\n", "            continue\n", "        kept.append(wordnet_lemmatizer.lemmatize(token))\n", "    return kept\n", "\n", "df['tokens']=df['text'].str.lower().apply(process)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Unzipping corpora/wordnet.zip.\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "q049P2LVdHUR", "outputId": "4606b62d-5617-474e-fbe7-38784b9a5650" }, "source": [ "df.info()" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "<class 'pandas.core.frame.DataFrame'>\n", "RangeIndex: 40949 entries, 0 to 40948\n", "Data columns (total 18 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 video_id 40949 non-null object\n", " 1 trending_date 40949 non-null object\n", " 2 title 40949 non-null object\n", " 3 channel_title 40949 non-null object\n", " 4 category_id 40949 non-null int64 \n", " 5 publish_time 40949 non-null object\n", " 6 tags 40949 non-null object\n", " 7 views 40949 non-null int64 \n", " 8 likes 40949 non-null int64 \n", " 9 dislikes 40949 non-null int64 \n", " 10 comment_count 40949 non-null int64 \n", " 11 thumbnail_link 40949 non-null object\n", " 12 comments_disabled 40949 non-null bool \n", " 13 ratings_disabled 40949 
non-null bool \n", " 14 video_error_or_removed 40949 non-null bool \n", " 15 description 40379 non-null object\n", " 16 text 40949 non-null object\n", " 17 tokens 40949 non-null object\n", "dtypes: bool(3), int64(5), object(10)\n", "memory usage: 4.8+ MB\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "bzpR0BsSdS9Q", "colab": { "base_uri": "https://localhost:8080/", "height": 439 }, "outputId": "790275cc-8928-4796-e1f0-2462e0eaa660" }, "source": [ "from nltk import FreqDist\n", "\n", "# One term-count dict per document, turned into a frame in a single step.\n", "# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.\n", "# Columns appear in order of first occurrence; terms absent from a document are NaN.\n", "vectors = pd.DataFrame([dict(FreqDist(row)) for row in df.head(250)['tokens']])\n", "\n", "vectors" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>marriageshantell</th>\n", " <th>martincaseyneistat</th>\n", " <th>talk</th>\n", " <th>want</th>\n", " <th>donald</th>\n", " <th>hbo</th>\n", " <th>john</th>\n", " <th>last</th>\n", " <th>lastweektonight</th>\n", " <th>oliver</th>\n", " <th>presidency</th>\n", " <th>tonight</th>\n", " <th>trump</th>\n", " <th>week</th>\n", " <th>alesso</th>\n", " <th>anitta</th>\n", " <th>anwar</th>\n", " <th>bach</th>\n", " <th>bear</th>\n", " <th>black</th>\n", " <th>brazil</th>\n", " <th>driver</th>\n", " <th>getting</th>\n", " <th>hannahstocking</th>\n", " <th>inanna</th>\n", " <th>iphone</th>\n", " <th>king</th>\n", " <th>lele</th>\n", " <th>lelepons</th>\n", " <th>license</th>\n", " <th>love</th>\n", " <th>mancuso</th>\n", " <th>music</th>\n", " <th>official</th>\n", " <th>pineapple</th>\n", " <th>pons</th>\n", " <th>ponsracist</th>\n", " 
<th>poo</th>\n", " <th>racist</th>\n", " <th>rudy</th>\n", " <th>...</th>\n", " <th>sacred</th>\n", " <th>bezos</th>\n", " <th>ceo</th>\n", " <th>community</th>\n", " <th>festival</th>\n", " <th>gathering</th>\n", " <th>growing</th>\n", " <th>la17</th>\n", " <th>rare</th>\n", " <th>richest</th>\n", " <th>successsummit</th>\n", " <th>summit</th>\n", " <th>17taylor</th>\n", " <th>reputationswift</th>\n", " <th>target</th>\n", " <th>taylorswift</th>\n", " <th>taylurking</th>\n", " <th>cant</th>\n", " <th>dunkfest</th>\n", " <th>fuckin</th>\n", " <th>guard</th>\n", " <th>reed</th>\n", " <th>saying</th>\n", " <th>willie</th>\n", " <th>100</th>\n", " <th>breath</th>\n", " <th>canbreath</th>\n", " <th>hold</th>\n", " <th>holding</th>\n", " <th>watchcut</th>\n", " <th>earl</th>\n", " <th>grey</th>\n", " <th>linecupcakes</th>\n", " <th>macarons</th>\n", " <th>makrides</th>\n", " <th>pastry</th>\n", " <th>pipe</th>\n", " <th>scran</th>\n", " <th>scranline</th>\n", " <th>vanilla</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>3.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>2.0</td>\n", " <td>3.0</td>\n", " <td>5.0</td>\n", " <td>3.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>4.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>4.0</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>2.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " 
<td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>245</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>246</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.0</td>\n", " <td>3.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " <td>2.0</td>\n", " <td>1.0</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>247</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>248</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>3.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>1.0</td>\n", " <td>3.0</td>\n", " <td>1.0</td>\n", " <td>2.0</td>\n", " </tr>\n", " <tr>\n", " <th>249</th>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>...</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " 
<td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>250 rows × 3135 columns</p>\n", "</div>" ], "text/plain": [ " marriageshantell martincaseyneistat talk ... scran scranline vanilla\n", "0 1.0 1.0 1.0 ... NaN NaN NaN\n", "1 NaN NaN NaN ... NaN NaN NaN\n", "2 NaN NaN NaN ... NaN NaN NaN\n", "3 NaN NaN NaN ... NaN NaN NaN\n", "4 NaN NaN NaN ... NaN NaN NaN\n", ".. ... ... ... ... ... ... ...\n", "245 NaN NaN NaN ... NaN NaN NaN\n", "246 NaN NaN NaN ... NaN NaN NaN\n", "247 NaN NaN NaN ... NaN NaN NaN\n", "248 NaN NaN NaN ... 3.0 1.0 2.0\n", "249 NaN NaN NaN ... NaN NaN NaN\n", "\n", "[250 rows x 3135 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qv97TQyi_lDu", "outputId": "8ee47fb9-82a5-4d03-bd52-8d1f3a6553ee" }, "source": [ "vectors.count()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "marriageshantell 2\n", "martincaseyneistat 2\n", "talk 19\n", "want 5\n", "donald 3\n", " ..\n", "pastry 1\n", "pipe 1\n", "scran 1\n", "scranline 1\n", "vanilla 1\n", "Length: 3135, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "code", "metadata": { "id": "kcxxEtnj-5uC" }, "source": [ "from math import log\n", "\n", "log_tf = pd.DataFrame()\n", "for c in vectors.columns:\n", " log_tf[c]=vectors[c].apply(lambda x: 1+log(x) if x>0 else 0)\n", "\n", "count = vectors.count().iloc[0]\n", "data = pd.DataFrame()\n", "for c in vectors.columns:\n", " 
data[c]=vectors[vectors[c]>0].count().apply(lambda x: 1+log(count/x) if x>0 else 0)\n", "data = data.iloc[0]" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "id": "LkwytAkt_5h9", "outputId": "03e4aef2-3c31-4d3a-b7a9-0360cb8f9445" }, "source": [ "data.hist()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<matplotlib.axes._subplots.AxesSubplot at 0x7f9508b69da0>" ] }, "metadata": { "tags": [] }, "execution_count": 15 }, { "output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAS9UlEQVR4nO3dcaxe9X3f8fcnOKQsZIGU9soz3sxUR6tTVMKuCFWm7VJWMFSKU62LjGhxUjRXLUzthqaR7g+yMKREmxMpiNI6wopT0TisbWYr9YZcyhXKNBNMQzGGUW7BKfZcvMbE7Q0rm7Pv/niO0yfevb7Pvc9zn5vb3/slXd1zfud3zvl97zWf59zfOc9DqgpJUhvestIDkCSNj6EvSQ0x9CWpIYa+JDXE0JekhqxZ6QGcz2WXXVYbNmxY8v7f+ta3ePvb3z66Aa0CrdXcWr1gza0Ypuann376z6rqB+ba9j0d+hs2bODQoUNL3n96epqpqanRDWgVaK3m1uoFa27FMDUn+fp825zekaSGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0JekhnxPvyN3WIePn+bDd//u2M979BM/OfZzStIgvNKXpIYY+pLUEENfkhpi6EtSQxYM/STfl+SrSf4wyZEk/7ZrvyLJk0lmknwxyYVd+9u69Zlu+4a+Y320a38xyY3LVZQkaW6DXOm/Cfx4Vf0ocBWwOcm1wCeBT1fVDwGvA7d3/W8HXu/aP931I8kmYCvwHmAz8KtJLhhlMZKk81sw9Ktntlt9a/dVwI8Dv9W17wY+2C1v6dbptl+fJF37nqp6s6peAWaAa0ZShSRpIAM9p99dkT8N/BDwAPDHwDer6kzX5RiwrlteB7wKUFVnkpwGvr9rP9h32P59+s+1HdgOMDExwfT09OIq6jNxEdx15ZmFO47YMGMe1uzs7Iqef9xaqxesuRXLVfNAoV9V3wauSnIJ8CXg7418JH91rp3AToDJycka5n+Rdv/De9lxePzvPzt669TYz3lWa/9budbqBWtuxXLVvKind6rqm8DjwI8BlyQ5m6iXA8e75ePAeoBu+zuBb/S3z7GPJGkMBnl65we6K3ySXAT8BPACvfD/6a7bNmBvt7yvW6fb/vtVVV371u7pniuAjcBXR1WIJGlhg8x9rAV2d/P6bwEeqaovJ3ke2JPk3wFfAx7q+j8E/EaSGeAUvSd2qKojSR4BngfOAHd000aSpDFZMPSr6lngvXO0v8wcT99U1V8C/3SeY90H3Lf4YUqSRsF35EpSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUk
MMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhqyYOgnWZ/k8STPJzmS5Je69o8lOZ7kme7r5r59PppkJsmLSW7sa9/ctc0kuXt5SpIkzWfNAH3OAHdV1R8keQfwdJID3bZPV9V/6O+cZBOwFXgP8LeA30vy7m7zA8BPAMeAp5Lsq6rnR1GIJGlhC4Z+VZ0ATnTLf5HkBWDdeXbZAuypqjeBV5LMANd022aq6mWAJHu6voa+JI3JIFf635FkA/Be4Eng/cCdSW4DDtH7a+B1ei8IB/t2O8ZfvUi8ek77++Y4x3ZgO8DExATT09OLGeJ3mbgI7rryzJL3X6phxjys2dnZFT3/uLVWL1hzK5ar5oFDP8nFwG8Dv1xVf57kQeBeoLrvO4CfG3ZAVbUT2AkwOTlZU1NTSz7W/Q/vZcfhRb2ujcTRW6fGfs6zpqenGeZnttq0Vi9YcyuWq+aBEjHJW+kF/sNV9TsAVfVa3/bPAl/uVo8D6/t2v7xr4zztkqQxGOTpnQAPAS9U1af62tf2dfsp4LlueR+wNcnbklwBbAS+CjwFbExyRZIL6d3s3TeaMiRJgxjkSv/9wM8Ch5M807X9CnBLkqvoTe8cBX4eoKqOJHmE3g3aM8AdVfVtgCR3Ao8CFwC7qurICGuRJC1gkKd3vgJkjk37z7PPfcB9c7TvP99+kqTl5TtyJakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDVkw9JOsT/J4kueTHEnyS137u5IcSPJS9/3Srj1JPpNkJsmzSa7uO9a2rv9LSbYtX1mSpLkMcqV/BrirqjYB1wJ3JNkE3A08VlUbgce6dYCbgI3d13bgQei9SAD3AO8DrgHuOftCIUkajwVDv6pOVNUfdMt/AbwArAO2ALu7bruBD3bLW4DPV89B4JIka4EbgQNVdaqqXgcOAJtHWo0k6bwWNaefZAPwXuBJYKKqTnSb/hSY6JbXAa/27Xasa5uvXZI0JmsG7ZjkYuC3gV+uqj9P8p1tVVVJahQDSrKd3rQQExMTTE9PL/lYExfBXVeeGcWwFmWYMQ9rdnZ2Rc8/bq3VC9bciuWqeaDQT/JWeoH/cFX9Ttf8WpK1VXWim7452bUfB9b37X5513YcmDqnffrcc1XVTmAnwOTkZE1NTZ3bZWD3P7yXHYcHfl0bmaO3To39nGdNT08zzM9stWmtXrDmVixXzYM8vRPgIeCFqvpU36Z9wNkncLYBe/vab+ue4rkWON1NAz0K3JDk0u4G7g1dmyRpTAa5DH4/8LPA4STPdG2/AnwCeCTJ7cDXgQ912/YDNwMzwBvARwCq6lSSe4Gnun4fr6pTI6lCkjSQBUO/qr4CZJ7N18/Rv4A75jnWLmDXYgYoSRod35ErSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyIKhn2RXkpNJnutr+1iS40me6b5u7tv20SQzSV5McmNf++aubSbJ3aMvRZK0kEGu9D8HbJ6j/dNVdVX3tR8gySZgK/Cebp9fTXJBkguAB4CbgE3ALV1fSdIYrVmoQ1U9kWTDgMfbAuypqjeBV5LMANd022aq6mWAJHu6vs8vesSSpCUbZk7/ziTPdtM/l3Zt64BX+/oc69rma5ckjdGCV/rzeBC4F6ju+w7g50YxoCTbge0AExMTTE9PL/lYExfBXVeeGcWwFmWYMQ9rdnZ2Rc8/bq3VC9bciuWqeUmhX1WvnV1O8l
ngy93qcWB9X9fLuzbO037usXcCOwEmJydrampqKUME4P6H97Lj8FJf15bu6K1TYz/nWdPT0wzzM1ttWqsXrLkVy1XzkqZ3kqztW/0p4OyTPfuArUneluQKYCPwVeApYGOSK5JcSO9m776lD1uStBQLXgYn+QIwBVyW5BhwDzCV5Cp60ztHgZ8HqKojSR6hd4P2DHBHVX27O86dwKPABcCuqjoy8mokSec1yNM7t8zR/NB5+t8H3DdH+35g/6JGJ0kaKd+RK0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JasiCoZ9kV5KTSZ7ra3tXkgNJXuq+X9q1J8lnkswkeTbJ1X37bOv6v5Rk2/KUI0k6n0Gu9D8HbD6n7W7gsaraCDzWrQPcBGzsvrYDD0LvRQK4B3gfcA1wz9kXCknS+CwY+lX1BHDqnOYtwO5ueTfwwb72z1fPQeCSJGuBG4EDVXWqql4HDvD/v5BIkpbZUuf0J6rqRLf8p8BEt7wOeLWv37Gubb52SdIYrRn2AFVVSWoUgwFIsp3e1BATExNMT08v+VgTF8FdV54Z0cgGN8yYhzU7O7ui5x+31uoFa27FctW81NB/LcnaqjrRTd+c7NqPA+v7+l3etR0Hps5pn57rwFW1E9gJMDk5WVNTU3N1G8j9D+9lx+GhX9cW7eitU2M/51nT09MM8zNbbVqrF6y5FctV81Knd/YBZ5/A2Qbs7Wu/rXuK51rgdDcN9ChwQ5JLuxu4N3RtkqQxWvAyOMkX6F2lX5bkGL2ncD4BPJLkduDrwIe67vuBm4EZ4A3gIwBVdSrJvcBTXb+PV9W5N4clSctswdCvqlvm2XT9HH0LuGOe4+wCdi1qdJKkkfIduZLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUkKFCP8nRJIeTPJPkUNf2riQHkrzUfb+0a0+SzySZSfJskqtHUYAkaXCjuNK/rqquqqrJbv1u4LGq2gg81q0D3ARs7L62Aw+O4NySpEVYjumdLcDubnk38MG+9s9Xz0HgkiRrl+H8kqR5pKqWvnPyCvA6UMCvV9XOJN+sqku67QFer6pLknwZ+ERVfaXb9hjwr6vq0DnH3E7vLwEmJib+/p49e5Y8vpOnTvPa/1ry7kt25bp3jv+kndnZWS6++OIVO/+4tVYvWHMrhqn5uuuue7pv9uW7rBlqVPAPqup4kh8EDiT57/0bq6qSLOpVpap2AjsBJicna2pqasmDu//hvew4PGyJi3f01qmxn/Os6elphvmZrTat1QvW3Irlqnmo6Z2qOt59Pwl8CbgGeO3stE33/WTX/Tiwvm/3y7s2SdKYLDn0k7w9yTvOLgM3AM8B+4BtXbdtwN5ueR9wW/cUz7XA6ao6seSRS5IWbZi5jwngS71pe9YAv1lV/yXJU8AjSW4Hvg58qOu/H7gZmAHeAD4yxLklSUuw5NCvqpeBH52j/RvA9XO0F3DHUs8nSRqe78iVpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1ZOyhn2RzkheTzCS5e9znl6SWjTX0k1wAPADcBGwCbkmyaZxjkKSWjftK/xpgpqperqr/DewBtox5DJLUrDVjPt864NW+9WPA+/o7JNkObO9WZ5O8OMT5LgP+bIj9lySfHPcZv8uK1LyCWqsXrL
kVw9T8d+bbMO7QX1BV7QR2juJYSQ5V1eQojrVatFZza/WCNbdiuWoe9/TOcWB93/rlXZskaQzGHfpPARuTXJHkQmArsG/MY5CkZo11eqeqziS5E3gUuADYVVVHlvGUI5kmWmVaq7m1esGaW7EsNaeqluO4kqTvQb4jV5IaYuhLUkNWfegv9LEOSd6W5Ivd9ieTbBj/KEdrgJr/ZZLnkzyb5LEk8z6zu1oM+vEdSf5Jkkqy6h/vG6TmJB/qftdHkvzmuMc4agP82/7bSR5P8rXu3/fNKzHOUUmyK8nJJM/Nsz1JPtP9PJ5NcvXQJ62qVftF72bwHwN/F7gQ+ENg0zl9fhH4tW55K/DFlR73GGq+Dvgb3fIvtFBz1+8dwBPAQWBypcc9ht/zRuBrwKXd+g+u9LjHUPNO4Be65U3A0ZUe95A1/0PgauC5ebbfDPxnIMC1wJPDnnO1X+kP8rEOW4Dd3fJvAdcnyRjHOGoL1lxVj1fVG93qQXrvh1jNBv34jnuBTwJ/Oc7BLZNBav5nwANV9TpAVZ0c8xhHbZCaC/ib3fI7gf8xxvGNXFU9AZw6T5ctwOer5yBwSZK1w5xztYf+XB/rsG6+PlV1BjgNfP9YRrc8Bqm53+30rhRWswVr7v7sXV9VvzvOgS2jQX7P7wbeneS/JjmYZPPYRrc8Bqn5Y8DPJDkG7Af++XiGtmIW+9/7gr7nPoZBo5PkZ4BJ4B+t9FiWU5K3AJ8CPrzCQxm3NfSmeKbo/TX3RJIrq+qbKzqq5XUL8Lmq2pHkx4DfSPIjVfV/V3pgq8Vqv9If5GMdvtMnyRp6fxJ+YyyjWx4DfZRFkn8M/BvgA1X15pjGtlwWqvkdwI8A00mO0pv73LfKb+YO8ns+Buyrqv9TVa8Af0TvRWC1GqTm24FHAKrqvwHfR++Dyf66GvlH16z20B/kYx32Adu65Z8Gfr+6OySr1II1J3kv8Ov0An+1z/PCAjVX1emquqyqNlTVBnr3MT5QVYdWZrgjMci/7f9E7yqfJJfRm+55eZyDHLFBav4T4HqAJD9ML/T/51hHOV77gNu6p3iuBU5X1YlhDriqp3dqno91SPJx4FBV7QMeovcn4Ay9GyZbV27Ewxuw5n8PXAz8x+6e9Z9U1QdWbNBDGrDmv1YGrPlR4IYkzwPfBv5VVa3av2IHrPku4LNJ/gW9m7ofXs0XcUm+QO+F+7LuPsU9wFsBqurX6N23uBmYAd4APjL0OVfxz0uStEirfXpHkrQIhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyP8Dvlr/cjC7XHoAAAAASUVORK5CYII=\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "id": "-x92m5SxAtyi", "outputId": "9917e1eb-5855-4b00-d703-dda07267e1d5" }, "source": [ "tf_count = pd.DataFrame()\n", "for c in vectors.columns:\n", " tf_count=vectors[vectors[c]>0].sum()\n", "tf_count.hist()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<matplotlib.axes._subplots.AxesSubplot at 0x7f95085f89e8>" ] }, "metadata": { "tags": [] }, "execution_count": 16 }, { 
"output_type": "display_data", "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAR3klEQVR4nO3dcYxd5Xnn8e+vQFJE0oWU7MhrWzVSvbsiRSXRiLBKtZpNFDCkKlTajUBs4qZI7h8gJVpLK9J/aJsipdKSrIJSJHexSna9oahJZCtBpV7KVRRpCeCUYAylTIkjbBGsFkIyiTYrp0//mHfSWzr2zNx750487/cjXd1zn/Oec95nEL97OPfcS6oKSVIffmajJyBJmh5DX5I6YuhLUkcMfUnqiKEvSR05f6MncDaXXnpp7dixY+Ttf/CDH3DRRRdNbkLngN567q1fsOdejNPzkSNH/raq3r7cup/q0N+xYwdPPvnkyNsPBgPm5uYmN6FzQG8999Yv2HMvxuk5ybfPtM7LO5LUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JGf6m/kjuvoydf5jTu+MvXjHv/kB6Z+TElaDc/0Jakjhr4kdcTQl6SOrBj6SX42yeNJvpnkWJLfbfXLknw9yXySP0nyplZ/c3s939bvGNrXx1v9+STXrldTkqTlreZM/0fAe6vql4ErgV1Jrgb+APh0Vf0i8Bpwaxt/K/Baq3+6jSPJ5cBNwDuAXcAfJjlvks1Iks5uxdCvRQvt5QXtUcB7gT9t9fuBG9vyDe01bf37kqTVH6iqH1XVt4B54KqJdCFJWpVV3bLZzsiPAL8IfBb4G+C7VXW6DTkBbG3LW4GXAKrqdJLXgZ9v9ceGdju8zfCx9gB7AGZmZhgMBmvraMjMhbD3itMrD5ywceY8roWFhQ09/rT11i/Ycy/Wq+dVhX5V/Ri4MsnFwJeAfzvxmfzjsfYB+wBmZ2drnP9bzj0HDnL30el/FeH4LXNTP+aS3v4PQ731C/bci/XqeU1371TVd4FHgX8HXJxkKVG3ASfb8klgO0Bb/y+AvxuuL7ONJGkKVnP3ztvbGT5JLgTeDzzHYvj/xzZsN3CwLR9qr2nr/6KqqtVvanf3XAbsBB6fVCOSpJWt5trHFuD+dl3/Z4AHq+rLSZ4FHkjy+8BfAve18fcB/zPJPPAqi3fsUFXHkjwIPAucBm5rl40kSVOyYuhX1dPAO5epv8gyd99U1f8D/tMZ9nUXcNfapylJmgS/kStJHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SerIiqGfZHuSR5M8m+RYko+2+u8kOZnkqfa4fmibjyeZT/J8kmuH6rtabT7JHevTkiTpTM5fxZjTwN6q+kaStwJHkhxu6z5dVf9teHCSy4GbgHcA/wr4P0n+dVv9WeD9wAngiSSHqurZSTQiSVrZiqFfVS8DL7fl7yd5Dth6lk1uAB6oqh8B30oyD1zV1s1X1YsASR5oYw19SZqSNV3TT7IDeCfw9Va6PcnTSfYnuaTVtgIvDW12otXOVJckTclqLu8AkOQtwBeAj1XV95LcC3wCqPZ8N/Cb404oyR5gD8DMzAyDwWDkfc1cCHuvOD3ulNZsnDmPa2FhYUOPP2299Qv23Iv16nlVoZ/kAhYD/0BVfRGgql4ZWv9HwJfby5PA9qHNt7UaZ6n/RFXtA/YBzM7O1tzc3GqmuKx7Dhzk7qOrfl+bmOO3zE39mEsGgwHj/M3ONb31C/bci/XqeTV37wS4D3iuqj41VN8yNOzXgWfa8iHgpiRvTnIZs
BN4HHgC2JnksiRvYvHD3kOTaUOStBqrOQ1+D/Ah4GiSp1rtt4Gbk1zJ4uWd48BvAVTVsSQPsvgB7Wngtqr6MUCS24GHgfOA/VV1bIK9SJJWsJq7d74GZJlVD51lm7uAu5apP3S27SRJ68tv5EpSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR1YM/STbkzya5Nkkx5J8tNXfluRwkhfa8yWtniSfSTKf5Okk7xra1+42/oUku9evLUnSclZzpn8a2FtVlwNXA7cluRy4A3ikqnYCj7TXANcBO9tjD3AvLL5JAHcC7wauAu5ceqOQJE3HiqFfVS9X1Tfa8veB54CtwA3A/W3Y/cCNbfkG4HO16DHg4iRbgGuBw1X1alW9BhwGdk20G0nSWZ2/lsFJdgDvBL4OzFTVy23Vd4CZtrwVeGlosxOtdqb6G4+xh8X/QmBmZobBYLCWKf4TMxfC3itOj7z9qMaZ87gWFhY29PjT1lu/YM+9WK+eVx36Sd4CfAH4WFV9L8lP1lVVJalJTKiq9gH7AGZnZ2tubm7kfd1z4CB3H13T+9pEHL9lburHXDIYDBjnb3au6a1fsOderFfPq7p7J8kFLAb+gar6Yiu/0i7b0J5PtfpJYPvQ5tta7Ux1SdKUrObunQD3Ac9V1aeGVh0Clu7A2Q0cHKp/uN3FczXwersM9DBwTZJL2ge417SaJGlKVnPt4z3Ah4CjSZ5qtd8GPgk8mORW4NvAB9u6h4DrgXngh8BHAKrq1SSfAJ5o436vql6dSBeSpFVZMfSr6mtAzrD6fcuML+C2M+xrP7B/LROUJE2O38iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1ZMXQT7I/yakkzwzVfifJySRPtcf1Q+s+nmQ+yfNJrh2q72q1+SR3TL4VSdJKVnOm/8fArmXqn66qK9vjIYAklwM3Ae9o2/xhkvOSnAd8FrgOuBy4uY2VJE3R+SsNqKqvJtmxyv3dADxQVT8CvpVkHriqrZuvqhcBkjzQxj675hlLkka2Yuifxe1JPgw8CeytqteArcBjQ2NOtBrAS2+ov3u5nSbZA+wBmJmZYTAYjDzBmQth7xWnR95+VOPMeVwLCwsbevxp661fsOderFfPo4b+vcAngGrPdwO/OYkJVdU+YB/A7Oxszc3Njbyvew4c5O6j47yvjeb4LXNTP+aSwWDAOH+zc01v/YI992K9eh4pEavqlaXlJH8EfLm9PAlsHxq6rdU4S12SNCUj3bKZZMvQy18Hlu7sOQTclOTNSS4DdgKPA08AO5NcluRNLH7Ye2j0aUuSRrHimX6SzwNzwKVJTgB3AnNJrmTx8s5x4LcAqupYkgdZ/ID2NHBbVf247ed24GHgPGB/VR2beDeSpLNazd07Ny9Tvu8s4+8C7lqm/hDw0JpmJ0maKL+RK0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6siKoZ9kf5JTSZ4Zqr0tyeEkL7TnS1o9ST6TZD7J00neNbTN7jb+hSS716cdSdLZrOZM/4+BXW+o3QE8UlU7gUfaa4DrgJ3tsQe4FxbfJIA7gXcDVwF3Lr1RSJKmZ8XQr6qvAq++oXwDcH9bvh+4caj+uVr0GHBxki3AtcDhqnq1ql4DDvPP30gkSevs/BG3m6mql9vyd
4CZtrwVeGlo3IlWO1P9n0myh8X/SmBmZobBYDDiFGHmQth7xemRtx/VOHMe18LCwoYef9p66xfsuRfr1fOoof8TVVVJahKTafvbB+wDmJ2drbm5uZH3dc+Bg9x9dOwW1+z4LXNTP+aSwWDAOH+zc01v/YI992K9eh717p1X2mUb2vOpVj8JbB8at63VzlSXJE3RqKF/CFi6A2c3cHCo/uF2F8/VwOvtMtDDwDVJLmkf4F7TapKkKVrx2keSzwNzwKVJTrB4F84ngQeT3Ap8G/hgG/4QcD0wD/wQ+AhAVb2a5BPAE23c71XVGz8cliStsxVDv6puPsOq9y0ztoDbzrCf/cD+Nc1OkjRRfiNXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkbFCP8nxJEeTPJXkyVZ7W5LDSV5oz5e0epJ8Jsl8kqeTvGsSDUiSVm8SZ/r/oaqurKrZ9voO4JGq2gk80l4DXAfsbI89wL0TOLYkaQ3W4/LODcD9bfl+4Mah+udq0WPAxUm2rMPxJUlnMG7oF/DnSY4k2dNqM1X1clv+DjDTlrcCLw1te6LVJElTcv6Y2/9KVZ1M8i+Bw0n+anhlVVWSWssO25vHHoCZmRkGg8HIk5u5EPZecXrk7Uc1zpzHtbCwsKHHn7be+gV77sV69TxW6FfVyfZ8KsmXgKuAV5JsqaqX2+WbU234SWD70ObbWu2N+9wH7AOYnZ2tubm5ked3z4GD3H103Pe1tTt+y9zUj7lkMBgwzt/sXNNbv2DPvVivnke+vJPkoiRvXVoGrgGeAQ4Bu9uw3cDBtnwI+HC7i+dq4PWhy0CSpCkY5zR4BvhSkqX9/O+q+rMkTwAPJrkV+DbwwTb+IeB6YB74IfCRMY4tSRrByKFfVS8Cv7xM/e+A9y1TL+C2UY8nSRqf38iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI5MPfST7EryfJL5JHdM+/iS1LOphn6S84DPAtcBlwM3J7l8mnOQpJ6dP+XjXQXMV9WLAEkeAG4Anp3yPDatoydf5zfu+MrUj3v8kx+Y+jElrd20Q38r8NLQ6xPAu4cHJNkD7GkvF5I8P8bxLgX+doztR5I/mPYR/4neet6QfjeYPfdhnJ5/4Uwrph36K6qqfcC+SewryZNVNTuJfZ0reuu5t37BnnuxXj1P+4Pck8D2odfbWk2SNAXTDv0ngJ1JLkvyJuAm4NCU5yBJ3Zrq5Z2qOp3kduBh4Dxgf1UdW8dDTuQy0Tmmt5576xfsuRfr0nOqaj32K0n6KeQ3ciWpI4a+JHVkU4Z+bz/1kGR/klNJntnouUxLku1JHk3ybJJjST660XNab0l+NsnjSb7Zev7djZ7TNCQ5L8lfJvnyRs9lWpIcT3I0yVNJnpzovjfbNf32Uw9/DbyfxS9/PQHcXFWb9lu/Sf49sAB8rqp+aaPnMw1JtgBbquobSd4KHAFu3OT/nANcVFULSS4AvgZ8tKoe2+Cprask/wWYBX6uqn51o+czDUmOA7NVNfEvpG3GM/2f/NRDVf1/YOmnHjatqvoq8OpGz2OaqurlqvpGW/4+8ByL3/jetGrRQnt5QXtsrrO2N0iyDfgA8D82ei6bxWYM/eV+6mFTh0HvkuwA3gl8fWNnsv7apY6ngFPA4ara7D3/d+C/An+/0ROZsgL+PMmR9tM0E7MZQ18dSfIW4AvAx6rqexs9n/VWVT+uqitZ/Db7VUk27eW8JL8KnKqqIxs9lw3wK1X1LhZ/kfi2dgl3IjZj6
PtTD51o17W/AByoqi9u9Hymqaq+CzwK7Nrouayj9wC/1q5vPwC8N8n/2tgpTUdVnWzPp4AvsXjZeiI2Y+j7Uw8daB9q3gc8V1Wf2uj5TEOStye5uC1fyOLNCn+1sbNaP1X18araVlU7WPz3+C+q6j9v8LTWXZKL2s0JJLkIuAaY2J15my70q+o0sPRTD88BD67zTz1suCSfB/4v8G+SnEhy60bPaQreA3yIxbO/p9rj+o2e1DrbAjya5GkWT24OV1U3tzF2ZAb4WpJvAo8DX6mqP5vUzjfdLZuSpDPbdGf6kqQzM/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR/4B7YmFZUWg8TsAAAAASUVORK5CYII=\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 282 }, "id": "fBtA8AL0A7X_", "outputId": "9788036c-dafc-45f1-ac52-cc0b9508b03c" }, "source": [ "tf_weighed = pd.DataFrame()\n", "for c in log_tf.columns:\n", " tf_weighed[c] = log_tf[c]*data[c]\n", "\n", "for c in vectors.columns:\n", " tf_count=tf_weighed[tf_weighed[c]>0].sum()\n", "tf_count.hist()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "<matplotlib.axes._subplots.AxesSubplot at 0x7f9507bca9e8>" ] }, "metadata": { "tags": [] }, "execution_count": 18 }, { "output_type": "display_data", "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASkklEQVR4nO3df4xl5X3f8ffHrHGRcQw27oQu2y5SNqpwabAzwkRpm3Gw+VXJEDV1QTSsHaSNFJAchUrFyR+kpkikLaYydaxswirriGZDnFi7smnpZsPIslQwkFDWC6VM8FrsFrOKwSRjGqp1v/1jnrWuyezO3Zk7Z3b2eb+kq3vOc557z/c7l/3cM2fOvaSqkCT14S1rXYAkaTiGviR1xNCXpI4Y+pLUEUNfkjqyYa0LOJHzzjuvNm/evNZlnLTvfve7vP3tb1/rMgZlz32w5/XhySef/Iuqes9i207p0N+8eTNPPPHEWpdx0mZnZ5mZmVnrMgZlz32w5/UhyTePt83TO5LUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JFT+hO50qls/+HX+NjtXx58vwfv/qeD71OnD4/0Jakjhr4kdcTQl6SOGPqS1JElQz/J30rytST/I8mBJP+mjV+Y5LEkc0l+P8mZbfxtbX2ubd888lyfbOPPJblytZqSJC1unCP9N4CfrqofAy4BrkpyGfDrwL1V9SPAq8DNbf7NwKtt/N42jyQXAdcD7wWuAn4jyRmTbEaSdGJLhn4tmG+rb223An4a+EIb3wlc15avbeu07ZcnSRvfVVVvVNU3gDng0ol0IUkay1jn9JOckeQp4AiwF/hz4DtVdbRNOQRsbMsbgRcB2vbXgHePji/yGEnSAMb6cFZVfQ+4JMk5wBeBv79aBSXZBmwDmJqaYnZ2drV2tWrm5+fXZd0r0WPPU2fBbRcfXXrihK3lz7nH1/l06/mkPpFbVd9J8gjwE8A5STa0o/kLgMNt2mFgE3AoyQbgncC3R8aPGX3M6D62A9sBpqena739vylhff4/NVeqx57ve2A39+wf/kPtB2+cGXyfx/T4Op9uPY9z9c572hE+Sc4CPgw8CzwC/GybthXY3Zb3tHXa9j+pqmrj17erey4EtgBfm1QjkqSljXOYcj6ws11p8xbgwar6UpJngF1J/i3wZ8D9bf79wO8mmQNeYeGKHarqQJIHgWeAo8At7bSRJGkgS4Z+VT0NvG+R8RdY5Oqbqvpr4J8f57nuAu46+TIlSZPgJ3IlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdWTL0k2xK8kiSZ5IcSPKJNv5rSQ4neardrhl5zCeTzCV5LsmVI+NXtbG5JLevTkuSpOPZMMaco8BtVfWnSd4BPJlkb9t2b1X9h9HJSS4CrgfeC/wd4I+T/Gjb/Fngw8Ah4PEke6rqmUk0Ikla2pKhX1UvAS+15b9K8iyw8QQPuRbYVVVvAN9IMgdc2rbNVdULAEl2tbmGviQN5KTO6SfZDLwPeKwN3Zrk6SQ7kpzbxjYCL4487FAbO964JGkg45zeASDJ2cAfAr9UVX+Z5HPAnUC1+3uAn19pQUm2AdsApqammJ2dXelTDm5+fn5d1r0SPfY8dRbcdvHRwfe7lj/nHl/n063nsUI/yVtZCPwHquqPAKrq5ZHtvwV8qa0eBjaNPPyCNsYJxr+vqrYD2wGmp6drZmZmnBJPKbOzs6zHuleix57ve2A39+wf+7hpYg7eODP4Po/p8XU+3Xoe5+qdAPcDz1bVp0fGzx+Z9jPA19vyHuD6JG9LciGwBfga8DiwJcmFSc5k4Y+9eybThiRpHOMcpvwk8HPA/iRPtbFfAW5
IcgkLp3cOAr8AUFUHkjzIwh9ojwK3VNX3AJLcCjwMnAHsqKoDE+xFkrSEca7e+SqQRTY9dILH3AXctcj4Qyd6nCRpdfmJXEnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqyJKhn2RTkkeSPJPkQJJPtPF3Jdmb5Pl2f24bT5LPJJlL8nSS948819Y2//kkW1evLUnSYsY50j8K3FZVFwGXAbckuQi4HdhXVVuAfW0d4GpgS7ttAz4HC28SwB3AB4BLgTuOvVFIkoaxZOhX1UtV9adt+a+AZ4GNwLXAzjZtJ3BdW74W+HwteBQ4J8n5wJXA3qp6papeBfYCV020G0nSCW04mclJNgPvAx4DpqrqpbbpW8BUW94IvDjysENt7Hjjb97HNhZ+Q2BqaorZ2dmTKfGUMD8/vy7rXokee546C267+Ojg+13Ln3OPr/Pp1vPYoZ/kbOAPgV+qqr9M8v1tVVVJahIFVdV2YDvA9PR0zczMTOJpBzU7O8t6rHsleuz5vgd2c8/+kzpumoiDN84Mvs9jenydT7eex7p6J8lbWQj8B6rqj9rwy+20De3+SBs/DGwaefgFbex445KkgYxz9U6A+4Fnq+rTI5v2AMeuwNkK7B4Zv6ldxXMZ8Fo7DfQwcEWSc9sfcK9oY5KkgYzzu+lPAj8H7E/yVBv7FeBu4MEkNwPfBD7atj0EXAPMAa8DHweoqleS3Ak83uZ9qqpemUgXkqSxLBn6VfVVIMfZfPki8wu45TjPtQPYcTIFSpImx0/kSlJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOrJk6CfZkeRIkq+PjP1aksNJnmq3a0a2fTLJXJLnklw5Mn5VG5tLcvvkW5EkLWWcI/3fAa5aZPzeqrqk3R4CSHIRcD3w3vaY30hyRpIzgM8CVwMXATe0uZKkAW1YakJVfSXJ5jGf71pgV1W9AXwjyRxwads2V1UvACTZ1eY+c9IVS5KWbcnQP4Fbk9wEPAHcVlWvAhuBR0fmHGpjAC++afwDiz1pkm3ANoCpqSlmZ2dXUOLamJ+fX5d1r0SPPU+dBbddfHTw/a7lz7nH1/l063m5of854E6g2v09wM9PoqCq2g5sB5ienq6ZmZlJPO2gZmdnWY91r0SPPd/3wG7u2b+S46blOXjjzOD7PKbH1/l063lZ/8VW1cvHlpP8FvCltnoY2DQy9YI2xgnGJUkDWdYlm0nOH1n9GeDYlT17gOuTvC3JhcAW4GvA48CWJBcmOZOFP/buWX7ZkqTlWPJIP8nvATPAeUkOAXcAM0kuYeH0zkHgFwCq6kCSB1n4A+1R4Jaq+l57nluBh4EzgB1VdWDi3UiSTmicq3duWGT4/hPMvwu4a5Hxh4CHTqo6SdJE+YlcSeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR5YM/SQ7khxJ8vWRsXcl2Zvk+XZ/bhtPks8kmUvydJL3jzxma5v/fJKtq9OOJOlExjnS/x3gqjeN3Q7sq6otwL62DnA1sKXdtgGfg4U3CeAO4APApcAdx94oJEnDWTL0q+orwCtvGr4W2NmWdwLXjYx/vhY8CpyT5HzgSmBvVb1SVa8Ce/mbbySSpFW2YZmPm6qql9ryt4CptrwReHFk3qE2drzxvyHJNhZ+S2BqaorZ2dlllrh
25ufn12XdK9Fjz1NnwW0XHx18v2v5c+7xdT7del5u6H9fVVWSmkQx7fm2A9sBpqena2ZmZlJPPZjZ2VnWY90r0WPP9z2wm3v2r/if0Ek7eOPM4Ps8psfX+XTreblX77zcTtvQ7o+08cPAppF5F7Sx441Lkga03NDfAxy7AmcrsHtk/KZ2Fc9lwGvtNNDDwBVJzm1/wL2ijUmSBrTk76ZJfg+YAc5LcoiFq3DuBh5McjPwTeCjbfpDwDXAHPA68HGAqnolyZ3A423ep6rqzX8cliStsiVDv6puOM6myxeZW8Atx3meHcCOk6pOkjRRfiJXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkRWFfpKDSfYneSrJE23sXUn2Jnm+3Z/bxpPkM0nmkjyd5P2TaECSNL5JHOl/sKouqarptn47sK+qtgD72jrA1cCWdtsGfG4C+5YknYTVOL1zLbCzLe8ErhsZ/3wteBQ4J8n5q7B/SdJxpKqW/+DkG8CrQAG/WVXbk3ynqs5p2wO8WlXnJPkScHdVfbVt2wf866p64k3PuY2F3wSYmpr68V27di27vrUyPz/P2WefvdZlDKrHno+88hov/5/h93vxxncOv9Omx9d5Pfb8wQ9+8MmRsy8/YMMKn/sfVdXhJH8b2Jvkf45urKpKclLvKlW1HdgOMD09XTMzMysscXizs7Osx7pXosee73tgN/fsX+k/oZN38MaZwfd5TI+v8+nW84pO71TV4XZ/BPgicCnw8rHTNu3+SJt+GNg08vAL2pgkaSDLDv0kb0/yjmPLwBXA14E9wNY2bSuwuy3vAW5qV/FcBrxWVS8tu3JJ0klbye+mU8AXF07bswH4z1X1X5M8DjyY5Gbgm8BH2/yHgGuAOeB14OMr2LckaRmWHfpV9QLwY4uMfxu4fJHxAm5Z7v4kSSvnJ3IlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdGTz0k1yV5Lkkc0luH3r/ktSzQUM/yRnAZ4GrgYuAG5JcNGQNktSzoY/0LwXmquqFqvq/wC7g2oFrkKRubRh4fxuBF0fWDwEfGJ2QZBuwra3OJ3luoNom6TzgL9a6iIHZ80Dy60Pv8Qf4Oq8Pf+94G4YO/SVV1XZg+1rXsRJJnqiq6bWuY0j23Ad7Xv+GPr1zGNg0sn5BG5MkDWDo0H8c2JLkwiRnAtcDewauQZK6Nejpnao6muRW4GHgDGBHVR0YsoaBrOvTU8tkz32w53UuVbXWNUiSBuInciWpI4a+JHXE0J+AJO9KsjfJ8+3+3BPM/aEkh5L8pyFrnLRxek5ySZL/nuRAkqeT/Iu1qHWllvrqkCRvS/L7bftjSTYPX+XkjNHvLyd5pr2m+5Ic95rw9WLcr4dJ8s+SVJJ1ewmnoT8ZtwP7qmoLsK+tH8+dwFcGqWp1jdPz68BNVfVe4CrgPyY5Z8AaV2zMrw65GXi1qn4EuBdY249PrcCY/f4ZMF1V/xD4AvDvhq1yssb9epgk7wA+ATw2bIWTZehPxrXAzra8E7husUlJfhyYAv7bQHWtpiV7rqr/VVXPt+X/DRwB3jNYhZMxzleHjP4svgBcniQD1jhJS/ZbVY9U1ett9VEWPm+zno379TB3svCG/tdDFjdphv5kTFXVS235WywE+w9I8hbgHuBfDVnYKlqy51FJLgXOBP58tQubsMW+OmTj8eZU1VHgNeDdg1Q3eeP0O+pm4L+sakWrb8mek7wf2FRVXx6ysNVwyn0Nw6kqyR8DP7zIpl8dXamqSrLYdbC
/CDxUVYfWy0HgBHo+9jznA78LbK2q/zfZKrVWkvxLYBr4qbWuZTW1A7ZPAx9b41ImwtAfU1V96Hjbkryc5PyqeqkF3JFFpv0E8I+T/CJwNnBmkvmqOmX/nwIT6JkkPwR8GfjVqnp0lUpdTeN8dcixOYeSbADeCXx7mPImbqyvSknyIRbe/H+qqt4YqLbVslTP7wD+ATDbDth+GNiT5CNV9cRgVU6Ip3cmYw+wtS1vBXa/eUJV3VhVf7eqNrNwiufzp3Lgj2HJnttXbXyRhV6/MGBtkzTOV4eM/ix+FviTWr+felyy3yTvA34T+EhVLfpmv86csOeqeq2qzquqze3f76Ms9L7uAh8M/Um5G/hwkueBD7V1kkwn+e01rWz1jNPzR4F/AnwsyVPtdsnalLs87Rz9sa8OeRZ4sKoOJPlUko+0afcD704yB/wyJ75665Q2Zr//noXfVv+gvabr+vuzxuz5tOHXMEhSRzzSl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI/8fdczOZxEND9UAAAAASUVORK5CYII=\n", "text/plain": [ "<Figure size 432x288 with 1 Axes>" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 513 }, "id": "OzVHi0peDNFY", "outputId": "921fbade-d784-4bd8-e463-0870d24d7dcc" }, "source": [ "def most_frequent(List):\n", "    \"\"\"Return the item that occurs most often in `List`.\n", "\n", "    Ties go to the item that appears first in `List`. An empty list yields\n", "    None instead of raising IndexError. Items must be hashable.\n", "    \"\"\"\n", "    if len(List) == 0:\n", "        return None\n", "    # Count in one pass instead of calling List.count() for every element (O(n^2)).\n", "    counts = {}\n", "    for item in List:\n", "        counts[item] = counts.get(item, 0) + 1\n", "    # dicts keep insertion order and max() returns the first maximum it meets,\n", "    # so the earliest item wins a tie.\n", "    return max(counts, key=counts.get)\n", "\n", "df['most_common'] = df['tokens'].apply(most_frequent)\n", "\n", "df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>video_id</th>\n", " <th>trending_date</th>\n", " <th>title</th>\n", " <th>channel_title</th>\n", " <th>category_id</th>\n", " <th>publish_time</th>\n", " <th>tags</th>\n", " <th>views</th>\n", " <th>likes</th>\n", " <th>dislikes</th>\n", " <th>comment_count</th>\n", " <th>thumbnail_link</th>\n", " 
<th>comments_disabled</th>\n", " <th>ratings_disabled</th>\n", " <th>video_error_or_removed</th>\n", " <th>description</th>\n", " <th>text</th>\n", " <th>tokens</th>\n", " <th>most_common</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2kyS6SvSYSE</td>\n", " <td>17.14.11</td>\n", " <td>WE WANT TO TALK ABOUT OUR MARRIAGE</td>\n", " <td>CaseyNeistat</td>\n", " <td>22</td>\n", " <td>2017-11-13T17:13:01.000Z</td>\n", " <td>SHANtell martin</td>\n", " <td>748374</td>\n", " <td>57527</td>\n", " <td>2966</td>\n", " <td>15954</td>\n", " <td>https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>SHANTELL'S CHANNEL - https://www.youtube.com/s...</td>\n", " <td>WE WANT TO TALK ABOUT OUR MARRIAGESHANtell mar...</td>\n", " <td>[want, talk, marriageshantell, martincaseyneis...</td>\n", " <td>want</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1ZAPwfrtAFY</td>\n", " <td>17.14.11</td>\n", " <td>The Trump Presidency: Last Week Tonight with J...</td>\n", " <td>LastWeekTonight</td>\n", " <td>24</td>\n", " <td>2017-11-13T07:30:00.000Z</td>\n", " <td>last week tonight trump presidency|\"last week ...</td>\n", " <td>2418783</td>\n", " <td>97185</td>\n", " <td>6146</td>\n", " <td>12703</td>\n", " <td>https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>One year after the presidential election, John...</td>\n", " <td>The Trump Presidency: Last Week Tonight with J...</td>\n", " <td>[trump, presidency, last, week, tonight, john,...</td>\n", " <td>trump</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>5qpjK5DgCt4</td>\n", " <td>17.14.11</td>\n", " <td>Racist Superman | Rudy Mancuso, King Bach & Le...</td>\n", " <td>Rudy Mancuso</td>\n", " <td>23</td>\n", " <td>2017-11-12T19:05:24.000Z</td>\n", " <td>racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"...</td>\n", " <td>3191434</td>\n", " 
<td>146033</td>\n", " <td>5339</td>\n", " <td>8181</td>\n", " <td>https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>WATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► http...</td>\n", " <td>Racist Superman | Rudy Mancuso, King Bach & Le...</td>\n", " <td>[racist, superman, rudy, mancuso, king, bach, ...</td>\n", " <td>rudy</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>puqaWrEC7tY</td>\n", " <td>17.14.11</td>\n", " <td>Nickelback Lyrics: Real or Fake?</td>\n", " <td>Good Mythical Morning</td>\n", " <td>24</td>\n", " <td>2017-11-13T11:00:04.000Z</td>\n", " <td>rhett and link|\"gmm\"|\"good mythical morning\"|\"...</td>\n", " <td>343168</td>\n", " <td>10172</td>\n", " <td>666</td>\n", " <td>2146</td>\n", " <td>https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>Today we find out if Link is a Nickelback amat...</td>\n", " <td>Nickelback Lyrics: Real or Fake?rhett and link...</td>\n", " <td>[nickelback, lyric, real, fake, rhett, link, g...</td>\n", " <td>nickelback</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>d380meD0W0M</td>\n", " <td>17.14.11</td>\n", " <td>I Dare You: GOING BALD!?</td>\n", " <td>nigahiga</td>\n", " <td>24</td>\n", " <td>2017-11-12T18:01:41.000Z</td>\n", " <td>ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"...</td>\n", " <td>2095731</td>\n", " <td>132235</td>\n", " <td>1989</td>\n", " <td>17518</td>\n", " <td>https://i.ytimg.com/vi/d380meD0W0M/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>I know it's been a while since we did this sho...</td>\n", " <td>I Dare You: GOING BALD!?ryan|\"higa\"|\"higatv\"|\"...</td>\n", " <td>[dare, going, bald, ryan, higa, higatv, nigahi...</td>\n", " <td>dare</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " video_id ... most_common\n", "0 2kyS6SvSYSE ... want\n", "1 1ZAPwfrtAFY ... 
trump\n", "2 5qpjK5DgCt4 ... rudy\n", "3 puqaWrEC7tY ... nickelback\n", "4 d380meD0W0M ... dare\n", "\n", "[5 rows x 19 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "liU74o_DGYAR", "outputId": "924602e2-575d-4908-d877-6d1bf1d32df9" }, "source": [ "print(df.dtypes)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "video_id object\n", "trending_date object\n", "title object\n", "channel_title object\n", "category_id int64\n", "publish_time object\n", "tags object\n", "views int64\n", "likes int64\n", "dislikes int64\n", "comment_count int64\n", "thumbnail_link object\n", "comments_disabled bool\n", "ratings_disabled bool\n", "video_error_or_removed bool\n", "description object\n", "text object\n", "tokens object\n", "most_common object\n", "dtype: object\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JtvJsnOGGo8H", "outputId": "b06f679a-6816-42b2-872a-504832244000" }, "source": [ "from collections import Counter\n", "one = df['most_common'].to_list()\n", "\n", "rslt = pd.DataFrame(Counter(one).most_common(10),\n", " columns=['Word', 'Frequency']).set_index('Word')\n", "print(rslt)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " Frequency\n", "Word \n", "makeup 732\n", "late 342\n", "cat 316\n", "trailer 257\n", "show 219\n", "news 206\n", "movie 202\n", "react 188\n", "star 188\n", "food 165\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wPL-ckQVHiR1", "outputId": "2c746f52-850d-4206-b558-671a8f3fa178" }, "source": [ "def popular(List_1):\n", "    \"\"\"Return True when the word `List_1` is one of the index labels of `rslt`.\"\"\"\n", "    # Test membership on the index directly rather than rebuilding a list on every call.\n", "    return List_1 in rslt.index\n", "\n", "df['popular_word'] = df['most_common'].apply(popular)\n", 
"\n", "df['popular_word'].value_counts()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "False 38134\n", "True 2815\n", "Name: popular_word, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 23 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 513 }, "id": "m4eqJ3kBH6eW", "outputId": "e16bb1f5-180c-48e7-f103-c7b554de2f2e" }, "source": [ "df.head()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>video_id</th>\n", " <th>trending_date</th>\n", " <th>title</th>\n", " <th>channel_title</th>\n", " <th>category_id</th>\n", " <th>publish_time</th>\n", " <th>tags</th>\n", " <th>views</th>\n", " <th>likes</th>\n", " <th>dislikes</th>\n", " <th>comment_count</th>\n", " <th>thumbnail_link</th>\n", " <th>comments_disabled</th>\n", " <th>ratings_disabled</th>\n", " <th>video_error_or_removed</th>\n", " <th>description</th>\n", " <th>text</th>\n", " <th>tokens</th>\n", " <th>most_common</th>\n", " <th>popular_word</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2kyS6SvSYSE</td>\n", " <td>17.14.11</td>\n", " <td>WE WANT TO TALK ABOUT OUR MARRIAGE</td>\n", " <td>CaseyNeistat</td>\n", " <td>22</td>\n", " <td>2017-11-13T17:13:01.000Z</td>\n", " <td>SHANtell martin</td>\n", " <td>748374</td>\n", " <td>57527</td>\n", " <td>2966</td>\n", " <td>15954</td>\n", " <td>https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " 
<td>SHANTELL'S CHANNEL - https://www.youtube.com/s...</td>\n", " <td>WE WANT TO TALK ABOUT OUR MARRIAGESHANtell mar...</td>\n", " <td>[want, talk, marriageshantell, martincaseyneis...</td>\n", " <td>want</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>1ZAPwfrtAFY</td>\n", " <td>17.14.11</td>\n", " <td>The Trump Presidency: Last Week Tonight with J...</td>\n", " <td>LastWeekTonight</td>\n", " <td>24</td>\n", " <td>2017-11-13T07:30:00.000Z</td>\n", " <td>last week tonight trump presidency|\"last week ...</td>\n", " <td>2418783</td>\n", " <td>97185</td>\n", " <td>6146</td>\n", " <td>12703</td>\n", " <td>https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>One year after the presidential election, John...</td>\n", " <td>The Trump Presidency: Last Week Tonight with J...</td>\n", " <td>[trump, presidency, last, week, tonight, john,...</td>\n", " <td>trump</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>5qpjK5DgCt4</td>\n", " <td>17.14.11</td>\n", " <td>Racist Superman | Rudy Mancuso, King Bach & Le...</td>\n", " <td>Rudy Mancuso</td>\n", " <td>23</td>\n", " <td>2017-11-12T19:05:24.000Z</td>\n", " <td>racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"...</td>\n", " <td>3191434</td>\n", " <td>146033</td>\n", " <td>5339</td>\n", " <td>8181</td>\n", " <td>https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>WATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► http...</td>\n", " <td>Racist Superman | Rudy Mancuso, King Bach & Le...</td>\n", " <td>[racist, superman, rudy, mancuso, king, bach, ...</td>\n", " <td>rudy</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>puqaWrEC7tY</td>\n", " <td>17.14.11</td>\n", " <td>Nickelback Lyrics: Real or Fake?</td>\n", " <td>Good Mythical Morning</td>\n", " <td>24</td>\n", " <td>2017-11-13T11:00:04.000Z</td>\n", " 
<td>rhett and link|\"gmm\"|\"good mythical morning\"|\"...</td>\n", " <td>343168</td>\n", " <td>10172</td>\n", " <td>666</td>\n", " <td>2146</td>\n", " <td>https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>Today we find out if Link is a Nickelback amat...</td>\n", " <td>Nickelback Lyrics: Real or Fake?rhett and link...</td>\n", " <td>[nickelback, lyric, real, fake, rhett, link, g...</td>\n", " <td>nickelback</td>\n", " <td>False</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>d380meD0W0M</td>\n", " <td>17.14.11</td>\n", " <td>I Dare You: GOING BALD!?</td>\n", " <td>nigahiga</td>\n", " <td>24</td>\n", " <td>2017-11-12T18:01:41.000Z</td>\n", " <td>ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"...</td>\n", " <td>2095731</td>\n", " <td>132235</td>\n", " <td>1989</td>\n", " <td>17518</td>\n", " <td>https://i.ytimg.com/vi/d380meD0W0M/default.jpg</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>False</td>\n", " <td>I know it's been a while since we did this sho...</td>\n", " <td>I Dare You: GOING BALD!?ryan|\"higa\"|\"higatv\"|\"...</td>\n", " <td>[dare, going, bald, ryan, higa, higatv, nigahi...</td>\n", " <td>dare</td>\n", " <td>False</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " video_id trending_date ... most_common popular_word\n", "0 2kyS6SvSYSE 17.14.11 ... want False\n", "1 1ZAPwfrtAFY 17.14.11 ... trump False\n", "2 5qpjK5DgCt4 17.14.11 ... rudy False\n", "3 puqaWrEC7tY 17.14.11 ... nickelback False\n", "4 d380meD0W0M 17.14.11 ... dare False\n", "\n", "[5 rows x 20 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 24 } ] }, { "cell_type": "code", "metadata": { "id": "_2jB31r5H-DN" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }