{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Week12_Assignment.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "5vfSwXmsxAMo",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "fe68b61c-5f6e-48ff-c524-3026dc54d539"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/data/')\n",
        "data_dir = '/data/My Drive/Colab Notebooks/Experiment'\n",
        "!ls '/data/My Drive/Colab Notebooks/Experiment'\n",
        "!pip install matplotlib"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mounted at /data/\n",
            "diamonds.csv  m_data.csv     TSLA.csv\t   w_data.csv\n",
            "Iris.csv      news_data.csv  USvideos.csv\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.6/dist-packages (3.2.2)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.3.1)\n",
            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.4.7)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (0.10.0)\n",
            "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (2.8.1)\n",
            "Requirement already satisfied: numpy>=1.11 in /usr/local/lib/python3.6/dist-packages (from matplotlib) (1.18.5)\n",
            "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from cycler>=0.10->matplotlib) (1.15.0)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "dkvvea3LO_S1",
        "outputId": "afa6c061-5ef1-481c-b689-8df472afee7d"
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "df = pd.read_csv(data_dir + '/USvideos.csv')\n",
        "print(df.shape)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "(40949, 16)\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "of76mCxFPDFq"
      },
      "source": [
        " df['text'] = df['title']+df['tags']+df['channel_title']"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "DJ7HlXeoWkqp",
        "outputId": "97b83fdb-a8dc-47b9-b853-8a1cdd30ad89"
      },
      "source": [
        "import nltk\n",
        "from nltk.stem import *\n",
        "nltk.download('punkt')\n",
        "from nltk.tokenize import RegexpTokenizer\n",
        "import pandas as pd\n",
        "from tqdm.notebook import tqdm\n",
        "tqdm.pandas()\n",
        "from functools import reduce\n",
        "import re"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Unzipping tokenizers/punkt.zip.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ro6ryJpWXmTd",
        "outputId": "b58c0543-c9ce-4f33-9894-c33e00987568"
      },
      "source": [
        "import nltk\n",
        "nltk.download('stopwords')\n",
        "\n",
        "from nltk.corpus import stopwords\n",
        "stopwords.words('english')\n",
        "\n",
        "en_stops = set(stopwords.words('english'))"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/stopwords.zip.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "-Ual81F5Xivs",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "cc04fdb9-a4f1-4625-e5f9-b013de0e6388"
      },
      "source": [
        "df.isnull().sum()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "video_id                    0\n",
              "trending_date               0\n",
              "title                       0\n",
              "channel_title               0\n",
              "category_id                 0\n",
              "publish_time                0\n",
              "tags                        0\n",
              "views                       0\n",
              "likes                       0\n",
              "dislikes                    0\n",
              "comment_count               0\n",
              "thumbnail_link              0\n",
              "comments_disabled           0\n",
              "ratings_disabled            0\n",
              "video_error_or_removed      0\n",
              "description               570\n",
              "text                        0\n",
              "dtype: int64"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 6
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bPCTgXhoc7wJ",
        "outputId": "7f2f55bb-0e3f-499f-f5d4-2e97e39925c4"
      },
      "source": [
        "from nltk.corpus import stopwords\n",
        "nltk.download('wordnet') \n",
        "\n",
        "from nltk.stem import WordNetLemmatizer\n",
        "wordnet_lemmatizer = WordNetLemmatizer()\n",
        "\n",
        "def process(line):\n",
        "    return([wordnet_lemmatizer.lemmatize(t) for t in tokenizer.tokenize(line) if t not in en_stops])\n",
        "    \n",
        "\n",
        "tokenizer = RegexpTokenizer(r'\\w+')\n",
        "df['tokens']=df['text'].str.lower().apply(process)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
            "[nltk_data]   Unzipping corpora/wordnet.zip.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "q049P2LVdHUR",
        "outputId": "4606b62d-5617-474e-fbe7-38784b9a5650"
      },
      "source": [
        "df.info()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "<class 'pandas.core.frame.DataFrame'>\n",
            "RangeIndex: 40949 entries, 0 to 40948\n",
            "Data columns (total 18 columns):\n",
            " #   Column                  Non-Null Count  Dtype \n",
            "---  ------                  --------------  ----- \n",
            " 0   video_id                40949 non-null  object\n",
            " 1   trending_date           40949 non-null  object\n",
            " 2   title                   40949 non-null  object\n",
            " 3   channel_title           40949 non-null  object\n",
            " 4   category_id             40949 non-null  int64 \n",
            " 5   publish_time            40949 non-null  object\n",
            " 6   tags                    40949 non-null  object\n",
            " 7   views                   40949 non-null  int64 \n",
            " 8   likes                   40949 non-null  int64 \n",
            " 9   dislikes                40949 non-null  int64 \n",
            " 10  comment_count           40949 non-null  int64 \n",
            " 11  thumbnail_link          40949 non-null  object\n",
            " 12  comments_disabled       40949 non-null  bool  \n",
            " 13  ratings_disabled        40949 non-null  bool  \n",
            " 14  video_error_or_removed  40949 non-null  bool  \n",
            " 15  description             40379 non-null  object\n",
            " 16  text                    40949 non-null  object\n",
            " 17  tokens                  40949 non-null  object\n",
            "dtypes: bool(3), int64(5), object(10)\n",
            "memory usage: 4.8+ MB\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "bzpR0BsSdS9Q",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 439
        },
        "outputId": "790275cc-8928-4796-e1f0-2462e0eaa660"
      },
      "source": [
        "from nltk import FreqDist\n",
        "\n",
        "vectors = pd.DataFrame()\n",
        "for row in df.head(250)['tokens']:\n",
        "    vectors = vectors.append(dict(FreqDist(row)),ignore_index=True)\n",
        "\n",
        "vectors"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>marriageshantell</th>\n",
              "      <th>martincaseyneistat</th>\n",
              "      <th>talk</th>\n",
              "      <th>want</th>\n",
              "      <th>donald</th>\n",
              "      <th>hbo</th>\n",
              "      <th>john</th>\n",
              "      <th>last</th>\n",
              "      <th>lastweektonight</th>\n",
              "      <th>oliver</th>\n",
              "      <th>presidency</th>\n",
              "      <th>tonight</th>\n",
              "      <th>trump</th>\n",
              "      <th>week</th>\n",
              "      <th>alesso</th>\n",
              "      <th>anitta</th>\n",
              "      <th>anwar</th>\n",
              "      <th>bach</th>\n",
              "      <th>bear</th>\n",
              "      <th>black</th>\n",
              "      <th>brazil</th>\n",
              "      <th>driver</th>\n",
              "      <th>getting</th>\n",
              "      <th>hannahstocking</th>\n",
              "      <th>inanna</th>\n",
              "      <th>iphone</th>\n",
              "      <th>king</th>\n",
              "      <th>lele</th>\n",
              "      <th>lelepons</th>\n",
              "      <th>license</th>\n",
              "      <th>love</th>\n",
              "      <th>mancuso</th>\n",
              "      <th>music</th>\n",
              "      <th>official</th>\n",
              "      <th>pineapple</th>\n",
              "      <th>pons</th>\n",
              "      <th>ponsracist</th>\n",
              "      <th>poo</th>\n",
              "      <th>racist</th>\n",
              "      <th>rudy</th>\n",
              "      <th>...</th>\n",
              "      <th>sacred</th>\n",
              "      <th>bezos</th>\n",
              "      <th>ceo</th>\n",
              "      <th>community</th>\n",
              "      <th>festival</th>\n",
              "      <th>gathering</th>\n",
              "      <th>growing</th>\n",
              "      <th>la17</th>\n",
              "      <th>rare</th>\n",
              "      <th>richest</th>\n",
              "      <th>successsummit</th>\n",
              "      <th>summit</th>\n",
              "      <th>17taylor</th>\n",
              "      <th>reputationswift</th>\n",
              "      <th>target</th>\n",
              "      <th>taylorswift</th>\n",
              "      <th>taylurking</th>\n",
              "      <th>cant</th>\n",
              "      <th>dunkfest</th>\n",
              "      <th>fuckin</th>\n",
              "      <th>guard</th>\n",
              "      <th>reed</th>\n",
              "      <th>saying</th>\n",
              "      <th>willie</th>\n",
              "      <th>100</th>\n",
              "      <th>breath</th>\n",
              "      <th>canbreath</th>\n",
              "      <th>hold</th>\n",
              "      <th>holding</th>\n",
              "      <th>watchcut</th>\n",
              "      <th>earl</th>\n",
              "      <th>grey</th>\n",
              "      <th>linecupcakes</th>\n",
              "      <th>macarons</th>\n",
              "      <th>makrides</th>\n",
              "      <th>pastry</th>\n",
              "      <th>pipe</th>\n",
              "      <th>scran</th>\n",
              "      <th>scranline</th>\n",
              "      <th>vanilla</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>5.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>4.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>4.0</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>2.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>...</th>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "      <td>...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>245</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>246</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>2.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>247</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>248</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>1.0</td>\n",
              "      <td>2.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>249</th>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>...</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "<p>250 rows × 3135 columns</p>\n",
              "</div>"
            ],
            "text/plain": [
              "     marriageshantell  martincaseyneistat  talk  ...  scran  scranline  vanilla\n",
              "0                 1.0                 1.0   1.0  ...    NaN        NaN      NaN\n",
              "1                 NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "2                 NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "3                 NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "4                 NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "..                ...                 ...   ...  ...    ...        ...      ...\n",
              "245               NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "246               NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "247               NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "248               NaN                 NaN   NaN  ...    3.0        1.0      2.0\n",
              "249               NaN                 NaN   NaN  ...    NaN        NaN      NaN\n",
              "\n",
              "[250 rows x 3135 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qv97TQyi_lDu",
        "outputId": "8ee47fb9-82a5-4d03-bd52-8d1f3a6553ee"
      },
      "source": [
        "vectors.count()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "marriageshantell       2\n",
              "martincaseyneistat     2\n",
              "talk                  19\n",
              "want                   5\n",
              "donald                 3\n",
              "                      ..\n",
              "pastry                 1\n",
              "pipe                   1\n",
              "scran                  1\n",
              "scranline              1\n",
              "vanilla                1\n",
              "Length: 3135, dtype: int64"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kcxxEtnj-5uC"
      },
      "source": [
        "from math import log\n",
        "\n",
        "log_tf = pd.DataFrame()\n",
        "for c in vectors.columns:\n",
        "    log_tf[c]=vectors[c].apply(lambda x: 1+log(x) if x>0 else 0)\n",
        "\n",
        "count = vectors.count().iloc[0]\n",
        "data = pd.DataFrame()\n",
        "for c in vectors.columns:\n",
        "    data[c]=vectors[vectors[c]>0].count().apply(lambda x: 1+log(count/x) if x>0 else 0)\n",
        "data = data.iloc[0]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 282
        },
        "id": "LkwytAkt_5h9",
        "outputId": "03e4aef2-3c31-4d3a-b7a9-0360cb8f9445"
      },
      "source": [
        "data.hist()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<matplotlib.axes._subplots.AxesSubplot at 0x7f9508b69da0>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 15
        },
        {
          "output_type": "display_data",
          "data": {
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAS9UlEQVR4nO3dcaxe9X3f8fcnOKQsZIGU9soz3sxUR6tTVMKuCFWm7VJWMFSKU62LjGhxUjRXLUzthqaR7g+yMKREmxMpiNI6wopT0TisbWYr9YZcyhXKNBNMQzGGUW7BKfZcvMbE7Q0rm7Pv/niO0yfevb7Pvc9zn5vb3/slXd1zfud3zvl97zWf59zfOc9DqgpJUhvestIDkCSNj6EvSQ0x9CWpIYa+JDXE0JekhqxZ6QGcz2WXXVYbNmxY8v7f+ta3ePvb3z66Aa0CrdXcWr1gza0Ypuann376z6rqB+ba9j0d+hs2bODQoUNL3n96epqpqanRDWgVaK3m1uoFa27FMDUn+fp825zekaSGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0JekhnxPvyN3WIePn+bDd//u2M979BM/OfZzStIgvNKXpIYY+pLUEENfkhpi6EtSQxYM/STfl+SrSf4wyZEk/7ZrvyLJk0lmknwxyYVd+9u69Zlu+4a+Y320a38xyY3LVZQkaW6DXOm/Cfx4Vf0ocBWwOcm1wCeBT1fVDwGvA7d3/W8HXu/aP931I8kmYCvwHmAz8KtJLhhlMZKk81sw9Ktntlt9a/dVwI8Dv9W17wY+2C1v6dbptl+fJF37nqp6s6peAWaAa0ZShSRpIAM9p99dkT8N/BDwAPDHwDer6kzX5RiwrlteB7wKUFVnkpwGvr9rP9h32P59+s+1HdgOMDExwfT09OIq6jNxEdx15ZmFO47YMGMe1uzs7Iqef9xaqxesuRXLVfNAoV9V3wauSnIJ8CXg7418JH91rp3AToDJycka5n+Rdv/De9lxePzvPzt669TYz3lWa/9budbqBWtuxXLVvKind6rqm8DjwI8BlyQ5m6iXA8e75ePAeoBu+zuBb/S3z7GPJGkMBnl65we6K3ySXAT8BPACvfD/6a7bNmBvt7yvW6fb/vtVVV371u7pniuAjcBXR1WIJGlhg8x9rAV2d/P6bwEeqaovJ3ke2JPk3wFfAx7q+j8E/EaSGeAUvSd2qKojSR4BngfOAHd000aSpDFZMPSr6lngvXO0v8wcT99U1V8C/3SeY90H3Lf4YUqSRsF35EpSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhqyYOgnWZ/k8STPJzmS5Je69o8lOZ7kme7r5r59PppkJsmLSW7sa9/ctc0kuXt5SpIkzWfNAH3OAHdV1R8keQfwdJID3bZPV9V/6O+cZBOwFXgP8LeA30vy7m7zA8BPAMeAp5Lsq6rnR1GIJGlhC4Z+VZ0ATnTLf5HkBWDdeXbZAuypqjeBV5LMANd022aq6mWAJHu6voa+JI3JIFf635FkA/Be4Eng/cCdSW4DDtH7a+B1ei8IB/t2O8ZfvUi8ek77++Y4x3ZgO8DExATT09OLGeJ3mbgI7rryzJL3X6phxjys2dnZFT3/uLVWL1hzK5ar5oFDP8nFwG8Dv1xVf57kQeBeoLrvO4CfG3ZAVbUT2AkwOTlZU1NTSz7W/Q/vZcfhRb2ujcTRW6fGfs6zpqenGeZnttq0Vi9YcyuWq+aBEjHJW+kF/sNV9TsAVfVa3/bPAl/uVo8D6/t2v7xr4zztkqQxGOTpnQAPAS9U1af62tf2dfsp4LlueR+wNcnbklwBbAS+Cj
wFbExyRZIL6d3s3TeaMiRJgxjkSv/9wM8Ch5M807X9CnBLkqvoTe8cBX4eoKqOJHmE3g3aM8AdVfVtgCR3Ao8CFwC7qurICGuRJC1gkKd3vgJkjk37z7PPfcB9c7TvP99+kqTl5TtyJakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDVkw9JOsT/J4kueTHEnyS137u5IcSPJS9/3Srj1JPpNkJsmzSa7uO9a2rv9LSbYtX1mSpLkMcqV/BrirqjYB1wJ3JNkE3A08VlUbgce6dYCbgI3d13bgQei9SAD3AO8DrgHuOftCIUkajwVDv6pOVNUfdMt/AbwArAO2ALu7bruBD3bLW4DPV89B4JIka4EbgQNVdaqqXgcOAJtHWo0k6bwWNaefZAPwXuBJYKKqTnSb/hSY6JbXAa/27Xasa5uvXZI0JmsG7ZjkYuC3gV+uqj9P8p1tVVVJahQDSrKd3rQQExMTTE9PL/lYExfBXVeeGcWwFmWYMQ9rdnZ2Rc8/bq3VC9bciuWqeaDQT/JWeoH/cFX9Ttf8WpK1VXWim7452bUfB9b37X5513YcmDqnffrcc1XVTmAnwOTkZE1NTZ3bZWD3P7yXHYcHfl0bmaO3To39nGdNT08zzM9stWmtXrDmVixXzYM8vRPgIeCFqvpU36Z9wNkncLYBe/vab+ue4rkWON1NAz0K3JDk0u4G7g1dmyRpTAa5DH4/8LPA4STPdG2/AnwCeCTJ7cDXgQ912/YDNwMzwBvARwCq6lSSe4Gnun4fr6pTI6lCkjSQBUO/qr4CZJ7N18/Rv4A75jnWLmDXYgYoSRod35ErSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyIKhn2RXkpNJnutr+1iS40me6b5u7tv20SQzSV5McmNf++aubSbJ3aMvRZK0kEGu9D8HbJ6j/dNVdVX3tR8gySZgK/Cebp9fTXJBkguAB4CbgE3ALV1fSdIYrVmoQ1U9kWTDgMfbAuypqjeBV5LMANd022aq6mWAJHu6vs8vesSSpCUbZk7/ziTPdtM/l3Zt64BX+/oc69rma5ckjdGCV/rzeBC4F6ju+w7g50YxoCTbge0AExMTTE9PL/lYExfBXVeeGcWwFmWYMQ9rdnZ2Rc8/bq3VC9bciuWqeUmhX1WvnV1O8lngy93qcWB9X9fLuzbO037usXcCOwEmJydrampqKUME4P6H97Lj8FJf15bu6K1TYz/nWdPT0wzzM1ttWqsXrLkVy1XzkqZ3kqztW/0p4OyTPfuArUneluQKYCPwVeApYGOSK5JcSO9m776lD1uStBQLXgYn+QIwBVyW5BhwDzCV5Cp60ztHgZ8HqKojSR6hd4P2DHBHVX27O86dwKPABcCuqjoy8mokSec1yNM7t8zR/NB5+t8H3DdH+35g/6JGJ0kaKd+RK0kNMfQlqSGGviQ1xNCXpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JasiCoZ9kV5KTSZ7ra3tXkgNJXuq+X9q1J8lnkswkeTbJ1X37bOv6v5Rk2/KUI0k6n0Gu9D8HbD6n7W7gsaraCDzWrQPcBGzsvrYDD0LvRQK4B3gfcA1wz9kXCknS+CwY+lX1BHDqnOYtwO5ueTfwwb72z1fPQeCSJGuBG4EDVXWqql4HDvD/v5BIkpbZUuf0J6rqRLf8p8BEt7
wOeLWv37Gubb52SdIYrRn2AFVVSWoUgwFIsp3e1BATExNMT08v+VgTF8FdV54Z0cgGN8yYhzU7O7ui5x+31uoFa27FctW81NB/LcnaqjrRTd+c7NqPA+v7+l3etR0Hps5pn57rwFW1E9gJMDk5WVNTU3N1G8j9D+9lx+GhX9cW7eitU2M/51nT09MM8zNbbVqrF6y5FctV81Knd/YBZ5/A2Qbs7Wu/rXuK51rgdDcN9ChwQ5JLuxu4N3RtkqQxWvAyOMkX6F2lX5bkGL2ncD4BPJLkduDrwIe67vuBm4EZ4A3gIwBVdSrJvcBTXb+PV9W5N4clSctswdCvqlvm2XT9HH0LuGOe4+wCdi1qdJKkkfIduZLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1xNCXpIYY+pLUkKFCP8nRJIeTPJPkUNf2riQHkrzUfb+0a0+SzySZSfJskqtHUYAkaXCjuNK/rqquqqrJbv1u4LGq2gg81q0D3ARs7L62Aw+O4NySpEVYjumdLcDubnk38MG+9s9Xz0HgkiRrl+H8kqR5pKqWvnPyCvA6UMCvV9XOJN+sqku67QFer6pLknwZ+ERVfaXb9hjwr6vq0DnH3E7vLwEmJib+/p49e5Y8vpOnTvPa/1ry7kt25bp3jv+kndnZWS6++OIVO/+4tVYvWHMrhqn5uuuue7pv9uW7rBlqVPAPqup4kh8EDiT57/0bq6qSLOpVpap2AjsBJicna2pqasmDu//hvew4PGyJi3f01qmxn/Os6elphvmZrTat1QvW3Irlqnmo6Z2qOt59Pwl8CbgGeO3stE33/WTX/Tiwvm/3y7s2SdKYLDn0k7w9yTvOLgM3AM8B+4BtXbdtwN5ueR9wW/cUz7XA6ao6seSRS5IWbZi5jwngS71pe9YAv1lV/yXJU8AjSW4Hvg58qOu/H7gZmAHeAD4yxLklSUuw5NCvqpeBH52j/RvA9XO0F3DHUs8nSRqe78iVpIYY+pLUEENfkhpi6EtSQwx9SWqIoS9JDTH0Jakhhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqiKEvSQ0x9CWpIYa+JDXE0Jekhhj6ktQQQ1+SGmLoS1JDDH1JaoihL0kNMfQlqSGGviQ1ZOyhn2RzkheTzCS5e9znl6SWjTX0k1wAPADcBGwCbkmyaZxjkKSWjftK/xpgpqperqr/DewBtox5DJLUrDVjPt864NW+9WPA+/o7JNkObO9WZ5O8OMT5LgP+bIj9lySfHPcZv8uK1LyCWqsXrLkVw9T8d+bbMO7QX1BV7QR2juJYSQ5V1eQojrVatFZza/WCNbdiuWoe9/TOcWB93/rlXZskaQzGHfpPARuTXJHkQmArsG/MY5CkZo11eqeqziS5E3gUuADYVVVHlvGUI5kmWmVaq7m1esGaW7EsNaeqluO4kqTvQb4jV5IaYuhLUkNWfegv9LEOSd6W5Ivd9ieTbBj/KEdrgJr/ZZLnkzyb5LEk8z6zu1oM+vEdSf5Jkkqy6h/vG6TmJB/qftdHkvzmuMc4agP82/7bSR5P8rXu3/fNKzHOUUmyK8nJJM/Nsz1JPtP9PJ5NcvXQJ62qVftF72bwHwN/F7gQ+ENg0zl9fhH4tW55K/DFlR73GGq+Dvgb3fIvtFBz1+8dwBPAQWBypcc9ht/zRuBrwKXd+g+u9LjHUPNO4Be65U3A0ZUe95A1/0PgauC5ebbfDPxnIMC1wJPDnnO1X+kP8rEOW4Dd3fJvAdcnyRjHOGoL1lxVj1fVG93qQXrvh1jNBv34jnuBTwJ/Oc7BLZNBav5nwANV9TpAVZ0c8xhHbZCaC/ib3fI7gf8xxvGNXFU9AZw6T5ctwOer5yBwSZK1w5xztYf+XB/rsG6+PlV1Bj
gNfP9YRrc8Bqm53+30rhRWswVr7v7sXV9VvzvOgS2jQX7P7wbeneS/JjmYZPPYRrc8Bqn5Y8DPJDkG7Af++XiGtmIW+9/7gr7nPoZBo5PkZ4BJ4B+t9FiWU5K3AJ8CPrzCQxm3NfSmeKbo/TX3RJIrq+qbKzqq5XUL8Lmq2pHkx4DfSPIjVfV/V3pgq8Vqv9If5GMdvtMnyRp6fxJ+YyyjWx4DfZRFkn8M/BvgA1X15pjGtlwWqvkdwI8A00mO0pv73LfKb+YO8ns+Buyrqv9TVa8Af0TvRWC1GqTm24FHAKrqvwHfR++Dyf66GvlH16z20B/kYx32Adu65Z8Gfr+6OySr1II1J3kv8Ov0An+1z/PCAjVX1emquqyqNlTVBnr3MT5QVYdWZrgjMci/7f9E7yqfJJfRm+55eZyDHLFBav4T4HqAJD9ML/T/51hHOV77gNu6p3iuBU5X1YlhDriqp3dqno91SPJx4FBV7QMeovcn4Ay9GyZbV27Ewxuw5n8PXAz8x+6e9Z9U1QdWbNBDGrDmv1YGrPlR4IYkzwPfBv5VVa3av2IHrPku4LNJ/gW9m7ofXs0XcUm+QO+F+7LuPsU9wFsBqurX6N23uBmYAd4APjL0OVfxz0uStEirfXpHkrQIhr4kNcTQl6SGGPqS1BBDX5IaYuhLUkMMfUlqyP8Dvlr/cjC7XHoAAAAASUVORK5CYII=\n",
            "text/plain": [
              "<Figure size 432x288 with 1 Axes>"
            ]
          },
          "metadata": {
            "tags": [],
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 282
        },
        "id": "-x92m5SxAtyi",
        "outputId": "9917e1eb-5855-4b00-d703-dda07267e1d5"
      },
      "source": [
        "tf_count = pd.DataFrame()\n",
        "for c in vectors.columns:\n",
        "    tf_count=vectors[vectors[c]>0].sum()\n",
        "tf_count.hist()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<matplotlib.axes._subplots.AxesSubplot at 0x7f95085f89e8>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 16
        },
        {
          "output_type": "display_data",
          "data": {
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAR3klEQVR4nO3dcYxd5Xnn8e+vQFJE0oWU7MhrWzVSvbsiRSXRiLBKtZpNFDCkKlTajUBs4qZI7h8gJVpLK9J/aJsipdKSrIJSJHexSna9oahJZCtBpV7KVRRpCeCUYAylTIkjbBGsFkIyiTYrp0//mHfSWzr2zNx750487/cjXd1zn/Oec95nEL97OPfcS6oKSVIffmajJyBJmh5DX5I6YuhLUkcMfUnqiKEvSR05f6MncDaXXnpp7dixY+Ttf/CDH3DRRRdNbkLngN567q1fsOdejNPzkSNH/raq3r7cup/q0N+xYwdPPvnkyNsPBgPm5uYmN6FzQG8999Yv2HMvxuk5ybfPtM7LO5LUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JGf6m/kjuvoydf5jTu+MvXjHv/kB6Z+TElaDc/0Jakjhr4kdcTQl6SOrBj6SX42yeNJvpnkWJLfbfXLknw9yXySP0nyplZ/c3s939bvGNrXx1v9+STXrldTkqTlreZM/0fAe6vql4ErgV1Jrgb+APh0Vf0i8Bpwaxt/K/Baq3+6jSPJ5cBNwDuAXcAfJjlvks1Iks5uxdCvRQvt5QXtUcB7gT9t9fuBG9vyDe01bf37kqTVH6iqH1XVt4B54KqJdCFJWpVV3bLZzsiPAL8IfBb4G+C7VXW6DTkBbG3LW4GXAKrqdJLXgZ9v9ceGdju8zfCx9gB7AGZmZhgMBmvraMjMhbD3itMrD5ywceY8roWFhQ09/rT11i/Ycy/Wq+dVhX5V/Ri4MsnFwJeAfzvxmfzjsfYB+wBmZ2drnP9bzj0HDnL30el/FeH4LXNTP+aS3v4PQ731C/bci/XqeU1371TVd4FHgX8HXJxkKVG3ASfb8klgO0Bb/y+AvxuuL7ONJGkKVnP3ztvbGT5JLgTeDzzHYvj/xzZsN3CwLR9qr2nr/6KqqtVvanf3XAbsBB6fVCOSpJWt5trHFuD+dl3/Z4AHq+rLSZ4FHkjy+8BfAve18fcB/zPJPPAqi3fsUFXHkjwIPAucBm5rl40kSVOyYuhX1dPAO5epv8gyd99U1f8D/tMZ9nUXcNfapylJmgS/kStJHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SerIiqGfZHuSR5M8m+RYko+2+u8kOZnkqfa4fmibjyeZT/J8kmuH6rtabT7JHevTkiTpTM5fxZjTwN6q+kaStwJHkhxu6z5dVf9teHCSy4GbgHcA/wr4P0n+dVv9WeD9wAngiSSHqurZSTQiSVrZiqFfVS8DL7fl7yd5Dth6lk1uAB6oqh8B30oyD1zV1s1X1YsASR5oYw19SZqSNV3TT7IDeCfw9Va6PcnTSfYnuaTVtgIvDW12otXOVJckTclqLu8AkOQtwBeAj1XV95LcC3wCqPZ8N/Cb404oyR5gD8DMzAyDwWDkfc1cCHuvOD3ulNZsnDmPa2FhYUOPP2299Qv23Iv16nlVoZ/kAhYD/0BVfRGgql4ZWv9HwJfby5PA9qHNt7UaZ6n/RFXtA/YBzM7O1tzc3GqmuKx7Dhzk7qOrfl+bmOO3zE39mEsGgwHj/M3ONb31C/bci/XqeTV37wS4D3iuqj41VN8yNOzXgWfa8iHgpiRvTnIZsBN4HHgC2JnksiRvYvHD3kOTaUOStB
qrOQ1+D/Ah4GiSp1rtt4Gbk1zJ4uWd48BvAVTVsSQPsvgB7Wngtqr6MUCS24GHgfOA/VV1bIK9SJJWsJq7d74GZJlVD51lm7uAu5apP3S27SRJ68tv5EpSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR1YM/STbkzya5Nkkx5J8tNXfluRwkhfa8yWtniSfSTKf5Okk7xra1+42/oUku9evLUnSclZzpn8a2FtVlwNXA7cluRy4A3ikqnYCj7TXANcBO9tjD3AvLL5JAHcC7wauAu5ceqOQJE3HiqFfVS9X1Tfa8veB54CtwA3A/W3Y/cCNbfkG4HO16DHg4iRbgGuBw1X1alW9BhwGdk20G0nSWZ2/lsFJdgDvBL4OzFTVy23Vd4CZtrwVeGlosxOtdqb6G4+xh8X/QmBmZobBYLCWKf4TMxfC3itOj7z9qMaZ87gWFhY29PjT1lu/YM+9WK+eVx36Sd4CfAH4WFV9L8lP1lVVJalJTKiq9gH7AGZnZ2tubm7kfd1z4CB3H13T+9pEHL9lburHXDIYDBjnb3au6a1fsOderFfPq7p7J8kFLAb+gar6Yiu/0i7b0J5PtfpJYPvQ5tta7Ux1SdKUrObunQD3Ac9V1aeGVh0Clu7A2Q0cHKp/uN3FczXwersM9DBwTZJL2ge417SaJGlKVnPt4z3Ah4CjSZ5qtd8GPgk8mORW4NvAB9u6h4DrgXngh8BHAKrq1SSfAJ5o436vql6dSBeSpFVZMfSr6mtAzrD6fcuML+C2M+xrP7B/LROUJE2O38iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1ZMXQT7I/yakkzwzVfifJySRPtcf1Q+s+nmQ+yfNJrh2q72q1+SR3TL4VSdJKVnOm/8fArmXqn66qK9vjIYAklwM3Ae9o2/xhkvOSnAd8FrgOuBy4uY2VJE3R+SsNqKqvJtmxyv3dADxQVT8CvpVkHriqrZuvqhcBkjzQxj675hlLkka2Yuifxe1JPgw8CeytqteArcBjQ2NOtBrAS2+ov3u5nSbZA+wBmJmZYTAYjDzBmQth7xWnR95+VOPMeVwLCwsbevxp661fsOderFfPo4b+vcAngGrPdwO/OYkJVdU+YB/A7Oxszc3Njbyvew4c5O6j47yvjeb4LXNTP+aSwWDAOH+zc01v/YI992K9eh4pEavqlaXlJH8EfLm9PAlsHxq6rdU4S12SNCUj3bKZZMvQy18Hlu7sOQTclOTNSS4DdgKPA08AO5NcluRNLH7Ye2j0aUuSRrHimX6SzwNzwKVJTgB3AnNJrmTx8s5x4LcAqupYkgdZ/ID2NHBbVf247ed24GHgPGB/VR2beDeSpLNazd07Ny9Tvu8s4+8C7lqm/hDw0JpmJ0maKL+RK0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6siKoZ9kf5JTSZ4Zqr0tyeEkL7TnS1o9ST6TZD7J00neNbTN7jb+hSS716cdSdLZrOZM/4+BXW+o3QE8UlU7gUfaa4DrgJ3tsQe4FxbfJIA7gXcDVwF3Lr1RSJKmZ8XQr6qvAq++oXwDcH9bvh+4caj+uVr0GHBxki3AtcDhqnq1ql4DDvPP30gkSevs/BG3m6mql9vyd4CZtrwVeGlo3IlWO1P9n0myh8X/Sm
BmZobBYDDiFGHmQth7xemRtx/VOHMe18LCwoYef9p66xfsuRfr1fOoof8TVVVJahKTafvbB+wDmJ2drbm5uZH3dc+Bg9x9dOwW1+z4LXNTP+aSwWDAOH+zc01v/YI992K9eh717p1X2mUb2vOpVj8JbB8at63VzlSXJE3RqKF/CFi6A2c3cHCo/uF2F8/VwOvtMtDDwDVJLmkf4F7TapKkKVrx2keSzwNzwKVJTrB4F84ngQeT3Ap8G/hgG/4QcD0wD/wQ+AhAVb2a5BPAE23c71XVGz8cliStsxVDv6puPsOq9y0ztoDbzrCf/cD+Nc1OkjRRfiNXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkbFCP8nxJEeTPJXkyVZ7W5LDSV5oz5e0epJ8Jsl8kqeTvGsSDUiSVm8SZ/r/oaqurKrZ9voO4JGq2gk80l4DXAfsbI89wL0TOLYkaQ3W4/LODcD9bfl+4Mah+udq0WPAxUm2rMPxJUlnMG7oF/DnSY4k2dNqM1X1clv+DjDTlrcCLw1te6LVJElTcv6Y2/9KVZ1M8i+Bw0n+anhlVVWSWssO25vHHoCZmRkGg8HIk5u5EPZecXrk7Uc1zpzHtbCwsKHHn7be+gV77sV69TxW6FfVyfZ8KsmXgKuAV5JsqaqX2+WbU234SWD70ObbWu2N+9wH7AOYnZ2tubm5ked3z4GD3H103Pe1tTt+y9zUj7lkMBgwzt/sXNNbv2DPvVivnke+vJPkoiRvXVoGrgGeAQ4Bu9uw3cDBtnwI+HC7i+dq4PWhy0CSpCkY5zR4BvhSkqX9/O+q+rMkTwAPJrkV+DbwwTb+IeB6YB74IfCRMY4tSRrByKFfVS8Cv7xM/e+A9y1TL+C2UY8nSRqf38iVpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI5MPfST7EryfJL5JHdM+/iS1LOphn6S84DPAtcBlwM3J7l8mnOQpJ6dP+XjXQXMV9WLAEkeAG4Anp3yPDatoydf5zfu+MrUj3v8kx+Y+jElrd20Q38r8NLQ6xPAu4cHJNkD7GkvF5I8P8bxLgX+doztR5I/mPYR/4neet6QfjeYPfdhnJ5/4Uwrph36K6qqfcC+SewryZNVNTuJfZ0reuu5t37BnnuxXj1P+4Pck8D2odfbWk2SNAXTDv0ngJ1JLkvyJuAm4NCU5yBJ3Zrq5Z2qOp3kduBh4Dxgf1UdW8dDTuQy0Tmmt5576xfsuRfr0nOqaj32K0n6KeQ3ciWpI4a+JHVkU4Z+bz/1kGR/klNJntnouUxLku1JHk3ybJJjST660XNab0l+NsnjSb7Zev7djZ7TNCQ5L8lfJvnyRs9lWpIcT3I0yVNJnpzovjfbNf32Uw9/DbyfxS9/PQHcXFWb9lu/Sf49sAB8rqp+aaPnMw1JtgBbquobSd4KHAFu3OT/nANcVFULSS4AvgZ8tKoe2+Cprask/wWYBX6uqn51o+czDUmOA7NVNfEvpG3GM/2f/NRDVf1/YOmnHjatqvoq8OpGz2OaqurlqvpGW/4+8ByL3/jetGrRQnt5QXtsrrO2N0iyDfgA8D82ei6bxWYM/eV+6mFTh0HvkuwA3gl8fWNnsv7apY6ngFPA4ara7D3/d+C/An+/0ROZsgL+PMmR9tM0E7MZQ18dSfIW4AvAx6rqexs9n/VWVT+uqitZ/Db7VUk27eW8JL8KnKqqIxs9lw3wK1X1LhZ/kfi2dgl3IjZj6PtTD51o17W/AByoqi9u9Hymqaq+Cz
wK7Nrouayj9wC/1q5vPwC8N8n/2tgpTUdVnWzPp4AvsXjZeiI2Y+j7Uw8daB9q3gc8V1Wf2uj5TEOStye5uC1fyOLNCn+1sbNaP1X18araVlU7WPz3+C+q6j9v8LTWXZKL2s0JJLkIuAaY2J15my70q+o0sPRTD88BD67zTz1suCSfB/4v8G+SnEhy60bPaQreA3yIxbO/p9rj+o2e1DrbAjya5GkWT24OV1U3tzF2ZAb4WpJvAo8DX6mqP5vUzjfdLZuSpDPbdGf6kqQzM/QlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR/4B7YmFZUWg8TsAAAAASUVORK5CYII=\n",
            "text/plain": [
              "<Figure size 432x288 with 1 Axes>"
            ]
          },
          "metadata": {
            "tags": [],
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 282
        },
        "id": "fBtA8AL0A7X_",
        "outputId": "9788036c-dafc-45f1-ac52-cc0b9508b03c"
      },
      "source": [
        "# Build tf_weighed one column at a time: each column of log_tf multiplied by data[c].\n",
        "tf_weighed = pd.DataFrame()\n",
        "for c in log_tf.columns:\n",
        "    tf_weighed[c] = log_tf[c]*data[c]\n",
        "\n",
        "# NOTE(review): tf_count is reassigned on every pass, so only the row filter on the\n",
        "# last column of vectors reaches the histogram below -- confirm whether a per-column\n",
        "# aggregate was intended here.\n",
        "# NOTE(review): each pass also filters and sums the whole frame -- presumably costly\n",
        "# when vectors has many columns; verify.\n",
        "for c in vectors.columns:\n",
        "    tf_count=tf_weighed[tf_weighed[c]>0].sum()\n",
        "tf_count.hist()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "<matplotlib.axes._subplots.AxesSubplot at 0x7f9507bca9e8>"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        },
        {
          "output_type": "display_data",
          "data": {
            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD4CAYAAAAAczaOAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAASkklEQVR4nO3df4xl5X3f8ffHrHGRcQw27oQu2y5SNqpwabAzwkRpm3Gw+VXJEDV1QTSsHaSNFJAchUrFyR+kpkikLaYydaxswirriGZDnFi7smnpZsPIslQwkFDWC6VM8FrsFrOKwSRjGqp1v/1jnrWuyezO3Zk7Z3b2eb+kq3vOc557z/c7l/3cM2fOvaSqkCT14S1rXYAkaTiGviR1xNCXpI4Y+pLUEUNfkjqyYa0LOJHzzjuvNm/evNZlnLTvfve7vP3tb1/rMgZlz32w5/XhySef/Iuqes9i207p0N+8eTNPPPHEWpdx0mZnZ5mZmVnrMgZlz32w5/UhyTePt83TO5LUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1JFT+hO50qls/+HX+NjtXx58vwfv/qeD71OnD4/0Jakjhr4kdcTQl6SOGPqS1JElQz/J30rytST/I8mBJP+mjV+Y5LEkc0l+P8mZbfxtbX2ubd888lyfbOPPJblytZqSJC1unCP9N4CfrqofAy4BrkpyGfDrwL1V9SPAq8DNbf7NwKtt/N42jyQXAdcD7wWuAn4jyRmTbEaSdGJLhn4tmG+rb223An4a+EIb3wlc15avbeu07ZcnSRvfVVVvVNU3gDng0ol0IUkay1jn9JOckeQp4AiwF/hz4DtVdbRNOQRsbMsbgRcB2vbXgHePji/yGEnSAMb6cFZVfQ+4JMk5wBeBv79aBSXZBmwDmJqaYnZ2drV2tWrm5+fXZd0r0WPPU2fBbRcfXXrihK3lz7nH1/l06/mkPpFbVd9J8gjwE8A5STa0o/kLgMNt2mFgE3AoyQbgncC3R8aPGX3M6D62A9sBpqena739vylhff4/NVeqx57ve2A39+wf/kPtB2+cGXyfx/T4Op9uPY9z9c572hE+Sc4CPgw8CzwC/GybthXY3Zb3tHXa9j+pqmrj17erey4EtgBfm1QjkqSljXOYcj6ws11p8xbgwar6UpJngF1J/i3wZ8D9bf79wO8mmQNeYeGKHarqQJIHgWeAo8At7bSRJGkgS4Z+VT0NvG+R8RdY5Oqbqvpr4J8f57nuAu46+TIlSZPgJ3IlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdWTL0k2xK8kiSZ5IcSPKJNv5rSQ4neardrhl5zCeTzCV5LsmVI+NXtbG5JLevTkuSpOPZMMaco8BtVfWnSd4BPJlkb9t2b1X9h9HJSS4CrgfeC/wd4I+T/Gjb/Fngw8Ah4PEke6rqmUk0Ikla2pKhX1UvAS+15b9K8iyw8QQPuRbYVVVvAN9IMgdc2rbNVdULAEl2tbmGviQN5KTO6SfZDLwPeKwN3Zrk6SQ7kpzbxjYCL4487FAbO964JGkg45zeASDJ2cAfAr9UVX+Z5HPAnUC1+3uAn19pQUm2AdsApqammJ2dXelTDm5+fn5d1r0SPfY8dRbcdvHRwfe7lj/nHl/n063nsUI/yVtZCPwHquqPAKrq5ZHtvwV8qa0eBjaNPPyCNsYJxr+vqrYD2wGmp6drZmZmnBJPKbOzs6zHuleix57ve2A39+wf+7hpYg7eODP4Po/p8XU+3Xoe5+qdAPcDz1bVp0fGzx+Z9jPA19vyHuD6JG9LciGwBfga8DiwJcmFSc5k4Y+9eybThi
RpHOMcpvwk8HPA/iRPtbFfAW5IcgkLp3cOAr8AUFUHkjzIwh9ojwK3VNX3AJLcCjwMnAHsqKoDE+xFkrSEca7e+SqQRTY9dILH3AXctcj4Qyd6nCRpdfmJXEnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqyJKhn2RTkkeSPJPkQJJPtPF3Jdmb5Pl2f24bT5LPJJlL8nSS948819Y2//kkW1evLUnSYsY50j8K3FZVFwGXAbckuQi4HdhXVVuAfW0d4GpgS7ttAz4HC28SwB3AB4BLgTuOvVFIkoaxZOhX1UtV9adt+a+AZ4GNwLXAzjZtJ3BdW74W+HwteBQ4J8n5wJXA3qp6papeBfYCV020G0nSCW04mclJNgPvAx4DpqrqpbbpW8BUW94IvDjysENt7Hjjb97HNhZ+Q2BqaorZ2dmTKfGUMD8/vy7rXokee546C267+Ojg+13Ln3OPr/Pp1vPYoZ/kbOAPgV+qqr9M8v1tVVVJahIFVdV2YDvA9PR0zczMTOJpBzU7O8t6rHsleuz5vgd2c8/+kzpumoiDN84Mvs9jenydT7eex7p6J8lbWQj8B6rqj9rwy+20De3+SBs/DGwaefgFbex445KkgYxz9U6A+4Fnq+rTI5v2AMeuwNkK7B4Zv6ldxXMZ8Fo7DfQwcEWSc9sfcK9oY5KkgYzzu+lPAj8H7E/yVBv7FeBu4MEkNwPfBD7atj0EXAPMAa8DHweoqleS3Ak83uZ9qqpemUgXkqSxLBn6VfVVIMfZfPki8wu45TjPtQPYcTIFSpImx0/kSlJHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOrJk6CfZkeRIkq+PjP1aksNJnmq3a0a2fTLJXJLnklw5Mn5VG5tLcvvkW5EkLWWcI/3fAa5aZPzeqrqk3R4CSHIRcD3w3vaY30hyRpIzgM8CVwMXATe0uZKkAW1YakJVfSXJ5jGf71pgV1W9AXwjyRxwads2V1UvACTZ1eY+c9IVS5KWbcnQP4Fbk9wEPAHcVlWvAhuBR0fmHGpjAC++afwDiz1pkm3ANoCpqSlmZ2dXUOLamJ+fX5d1r0SPPU+dBbddfHTw/a7lz7nH1/l063m5of854E6g2v09wM9PoqCq2g5sB5ienq6ZmZlJPO2gZmdnWY91r0SPPd/3wG7u2b+S46blOXjjzOD7PKbH1/l063lZ/8VW1cvHlpP8FvCltnoY2DQy9YI2xgnGJUkDWdYlm0nOH1n9GeDYlT17gOuTvC3JhcAW4GvA48CWJBcmOZOFP/buWX7ZkqTlWPJIP8nvATPAeUkOAXcAM0kuYeH0zkHgFwCq6kCSB1n4A+1R4Jaq+l57nluBh4EzgB1VdWDi3UiSTmicq3duWGT4/hPMvwu4a5Hxh4CHTqo6SdJE+YlcSeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSR5YM/SQ7khxJ8vWRsXcl2Zvk+XZ/bhtPks8kmUvydJL3jzxma5v/fJKtq9OOJOlExjnS/x3gqjeN3Q7sq6otwL62DnA1sKXdtgGfg4U3CeAO4APApcAdx94oJEnDWTL0q+orwCtvGr4W2NmWdwLXjYx/vhY8CpyT5HzgSmBvVb1SVa8Ce/mbbySSpFW2YZmPm6qql9ryt4CptrwReHFk3qE2dr
zxvyHJNhZ+S2BqaorZ2dlllrh25ufn12XdK9Fjz1NnwW0XHx18v2v5c+7xdT7del5u6H9fVVWSmkQx7fm2A9sBpqena2ZmZlJPPZjZ2VnWY90r0WPP9z2wm3v2r/if0Ek7eOPM4Ps8psfX+XTreblX77zcTtvQ7o+08cPAppF5F7Sx441Lkga03NDfAxy7AmcrsHtk/KZ2Fc9lwGvtNNDDwBVJzm1/wL2ijUmSBrTk76ZJfg+YAc5LcoiFq3DuBh5McjPwTeCjbfpDwDXAHPA68HGAqnolyZ3A423ep6rqzX8cliStsiVDv6puOM6myxeZW8Atx3meHcCOk6pOkjRRfiJXkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdMfQlqSOGviR1xNCXpI4Y+pLUkRWFfpKDSfYneSrJE23sXUn2Jnm+3Z/bxpPkM0nmkjyd5P2TaECSNL5JHOl/sKouqarptn47sK+qtgD72jrA1cCWdtsGfG4C+5YknYTVOL1zLbCzLe8ErhsZ/3wteBQ4J8n5q7B/SdJxpKqW/+DkG8CrQAG/WVXbk3ynqs5p2wO8WlXnJPkScHdVfbVt2wf866p64k3PuY2F3wSYmpr68V27di27vrUyPz/P2WefvdZlDKrHno+88hov/5/h93vxxncOv9Omx9d5Pfb8wQ9+8MmRsy8/YMMKn/sfVdXhJH8b2Jvkf45urKpKclLvKlW1HdgOMD09XTMzMysscXizs7Osx7pXosee73tgN/fsX+k/oZN38MaZwfd5TI+v8+nW84pO71TV4XZ/BPgicCnw8rHTNu3+SJt+GNg08vAL2pgkaSDLDv0kb0/yjmPLwBXA14E9wNY2bSuwuy3vAW5qV/FcBrxWVS8tu3JJ0klbye+mU8AXF07bswH4z1X1X5M8DjyY5Gbgm8BH2/yHgGuAOeB14OMr2LckaRmWHfpV9QLwY4uMfxu4fJHxAm5Z7v4kSSvnJ3IlqSOGviR1xNCXpI4Y+pLUEUNfkjpi6EtSRwx9SeqIoS9JHTH0Jakjhr4kdcTQl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI4a+JHXE0Jekjhj6ktQRQ1+SOmLoS1JHDH1J6oihL0kdGTz0k1yV5Lkkc0luH3r/ktSzQUM/yRnAZ4GrgYuAG5JcNGQNktSzoY/0LwXmquqFqvq/wC7g2oFrkKRubRh4fxuBF0fWDwEfGJ2QZBuwra3OJ3luoNom6TzgL9a6iIHZ80Dy60Pv8Qf4Oq8Pf+94G4YO/SVV1XZg+1rXsRJJnqiq6bWuY0j23Ad7Xv+GPr1zGNg0sn5BG5MkDWDo0H8c2JLkwiRnAtcDewauQZK6Nejpnao6muRW4GHgDGBHVR0YsoaBrOvTU8tkz32w53UuVbXWNUiSBuInciWpI4a+JHXE0J+AJO9KsjfJ8+3+3BPM/aEkh5L8pyFrnLRxek5ySZL/nuRAkqeT/Iu1qHWllvrqkCRvS/L7bftjSTYPX+XkjNHvLyd5pr2m+5Ic95rw9WLcr4dJ8s+SVJJ1ewmnoT8ZtwP7qmoLsK+tH8+dwFcGqWp1jdPz68BNVfVe4CrgPyY5Z8AaV2zMrw65GXi1qn4EuBdY249PrcCY/f4ZMF1V/xD4AvDvhq1yssb9epgk7wA+ATw2bIWTZehPxrXAzra8E7husUlJfhyYAv7bQHWtpiV7rqr/VVXPt+X/DRwB3jNYhZMxzleHjP4svgBcniQD1jhJS/ZbVY9U1ett9VEWPm+zno379TB3svCG/tdDFjdphv5kTFXVS235WywE+w9I8hbgHuBfDVnYKlqy51FJLgXOBP58tQubsMW+OmTj8eZU1VHgNeDdg1Q3eeP0O+pm4L+sakWrb8mek7wf2FRVXx6ysNVwyn0Nw6
kqyR8DP7zIpl8dXamqSrLYdbC/CDxUVYfWy0HgBHo+9jznA78LbK2q/zfZKrVWkvxLYBr4qbWuZTW1A7ZPAx9b41ImwtAfU1V96Hjbkryc5PyqeqkF3JFFpv0E8I+T/CJwNnBmkvmqOmX/nwIT6JkkPwR8GfjVqnp0lUpdTeN8dcixOYeSbADeCXx7mPImbqyvSknyIRbe/H+qqt4YqLbVslTP7wD+ATDbDth+GNiT5CNV9cRgVU6Ip3cmYw+wtS1vBXa/eUJV3VhVf7eqNrNwiufzp3Lgj2HJnttXbXyRhV6/MGBtkzTOV4eM/ix+FviTWr+felyy3yTvA34T+EhVLfpmv86csOeqeq2qzquqze3f76Ms9L7uAh8M/Um5G/hwkueBD7V1kkwn+e01rWz1jNPzR4F/AnwsyVPtdsnalLs87Rz9sa8OeRZ4sKoOJPlUko+0afcD704yB/wyJ75665Q2Zr//noXfVv+gvabr+vuzxuz5tOHXMEhSRzzSl6SOGPqS1BFDX5I6YuhLUkcMfUnqiKEvSR0x9CWpI/8fdczOZxEND9UAAAAASUVORK5CYII=\n",
            "text/plain": [
              "<Figure size 432x288 with 1 Axes>"
            ]
          },
          "metadata": {
            "tags": [],
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 513
        },
        "id": "OzVHi0peDNFY",
        "outputId": "921fbade-d784-4bd8-e463-0870d24d7dcc"
      },
      "source": [
        "from collections import Counter\n",
        "\n",
        "\n",
        "def most_frequent(List):\n",
        "    \"\"\"Return the element that occurs most often in ``List``.\n",
        "\n",
        "    Ties go to the candidate that appears first in ``List``.\n",
        "\n",
        "    Args:\n",
        "        List: sequence of hashable items (this notebook passes one row's tokens).\n",
        "\n",
        "    Returns:\n",
        "        The most frequent item, or ``None`` when ``List`` is empty.\n",
        "    \"\"\"\n",
        "    # An empty list has no winner; return None instead of raising.\n",
        "    if not List:\n",
        "        return None\n",
        "\n",
        "    # Count once up front rather than calling List.count() for every element.\n",
        "    counts = Counter(List)\n",
        "    # max() keeps the first maximal element, so ties resolve by position in List.\n",
        "    return max(List, key=counts.__getitem__)\n",
        "  \n",
        "# Record each row's most frequent token next to its token list.\n",
        "dominant_token = df['tokens'].apply(most_frequent)\n",
        "df['most_common'] = dominant_token\n",
        "\n",
        "df.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>video_id</th>\n",
              "      <th>trending_date</th>\n",
              "      <th>title</th>\n",
              "      <th>channel_title</th>\n",
              "      <th>category_id</th>\n",
              "      <th>publish_time</th>\n",
              "      <th>tags</th>\n",
              "      <th>views</th>\n",
              "      <th>likes</th>\n",
              "      <th>dislikes</th>\n",
              "      <th>comment_count</th>\n",
              "      <th>thumbnail_link</th>\n",
              "      <th>comments_disabled</th>\n",
              "      <th>ratings_disabled</th>\n",
              "      <th>video_error_or_removed</th>\n",
              "      <th>description</th>\n",
              "      <th>text</th>\n",
              "      <th>tokens</th>\n",
              "      <th>most_common</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2kyS6SvSYSE</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>WE WANT TO TALK ABOUT OUR MARRIAGE</td>\n",
              "      <td>CaseyNeistat</td>\n",
              "      <td>22</td>\n",
              "      <td>2017-11-13T17:13:01.000Z</td>\n",
              "      <td>SHANtell martin</td>\n",
              "      <td>748374</td>\n",
              "      <td>57527</td>\n",
              "      <td>2966</td>\n",
              "      <td>15954</td>\n",
              "      <td>https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>SHANTELL'S CHANNEL - https://www.youtube.com/s...</td>\n",
              "      <td>WE WANT TO TALK ABOUT OUR MARRIAGESHANtell mar...</td>\n",
              "      <td>[want, talk, marriageshantell, martincaseyneis...</td>\n",
              "      <td>want</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1ZAPwfrtAFY</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>The Trump Presidency: Last Week Tonight with J...</td>\n",
              "      <td>LastWeekTonight</td>\n",
              "      <td>24</td>\n",
              "      <td>2017-11-13T07:30:00.000Z</td>\n",
              "      <td>last week tonight trump presidency|\"last week ...</td>\n",
              "      <td>2418783</td>\n",
              "      <td>97185</td>\n",
              "      <td>6146</td>\n",
              "      <td>12703</td>\n",
              "      <td>https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>One year after the presidential election, John...</td>\n",
              "      <td>The Trump Presidency: Last Week Tonight with J...</td>\n",
              "      <td>[trump, presidency, last, week, tonight, john,...</td>\n",
              "      <td>trump</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>5qpjK5DgCt4</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>Racist Superman | Rudy Mancuso, King Bach &amp; Le...</td>\n",
              "      <td>Rudy Mancuso</td>\n",
              "      <td>23</td>\n",
              "      <td>2017-11-12T19:05:24.000Z</td>\n",
              "      <td>racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"...</td>\n",
              "      <td>3191434</td>\n",
              "      <td>146033</td>\n",
              "      <td>5339</td>\n",
              "      <td>8181</td>\n",
              "      <td>https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>WATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► http...</td>\n",
              "      <td>Racist Superman | Rudy Mancuso, King Bach &amp; Le...</td>\n",
              "      <td>[racist, superman, rudy, mancuso, king, bach, ...</td>\n",
              "      <td>rudy</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>puqaWrEC7tY</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>Nickelback Lyrics: Real or Fake?</td>\n",
              "      <td>Good Mythical Morning</td>\n",
              "      <td>24</td>\n",
              "      <td>2017-11-13T11:00:04.000Z</td>\n",
              "      <td>rhett and link|\"gmm\"|\"good mythical morning\"|\"...</td>\n",
              "      <td>343168</td>\n",
              "      <td>10172</td>\n",
              "      <td>666</td>\n",
              "      <td>2146</td>\n",
              "      <td>https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>Today we find out if Link is a Nickelback amat...</td>\n",
              "      <td>Nickelback Lyrics: Real or Fake?rhett and link...</td>\n",
              "      <td>[nickelback, lyric, real, fake, rhett, link, g...</td>\n",
              "      <td>nickelback</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>d380meD0W0M</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>I Dare You: GOING BALD!?</td>\n",
              "      <td>nigahiga</td>\n",
              "      <td>24</td>\n",
              "      <td>2017-11-12T18:01:41.000Z</td>\n",
              "      <td>ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"...</td>\n",
              "      <td>2095731</td>\n",
              "      <td>132235</td>\n",
              "      <td>1989</td>\n",
              "      <td>17518</td>\n",
              "      <td>https://i.ytimg.com/vi/d380meD0W0M/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>I know it's been a while since we did this sho...</td>\n",
              "      <td>I Dare You: GOING BALD!?ryan|\"higa\"|\"higatv\"|\"...</td>\n",
              "      <td>[dare, going, bald, ryan, higa, higatv, nigahi...</td>\n",
              "      <td>dare</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "      video_id  ... most_common\n",
              "0  2kyS6SvSYSE  ...        want\n",
              "1  1ZAPwfrtAFY  ...       trump\n",
              "2  5qpjK5DgCt4  ...        rudy\n",
              "3  puqaWrEC7tY  ...  nickelback\n",
              "4  d380meD0W0M  ...        dare\n",
              "\n",
              "[5 rows x 19 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "liU74o_DGYAR",
        "outputId": "924602e2-575d-4908-d877-6d1bf1d32df9"
      },
      "source": [
        "# List the dtype of every column in df, including the new most_common column.\n",
        "print(df.dtypes)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "video_id                  object\n",
            "trending_date             object\n",
            "title                     object\n",
            "channel_title             object\n",
            "category_id                int64\n",
            "publish_time              object\n",
            "tags                      object\n",
            "views                      int64\n",
            "likes                      int64\n",
            "dislikes                   int64\n",
            "comment_count              int64\n",
            "thumbnail_link            object\n",
            "comments_disabled           bool\n",
            "ratings_disabled            bool\n",
            "video_error_or_removed      bool\n",
            "description               object\n",
            "text                      object\n",
            "tokens                    object\n",
            "most_common               object\n",
            "dtype: object\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JtvJsnOGGo8H",
        "outputId": "b06f679a-6816-42b2-872a-504832244000"
      },
      "source": [
        "from collections import Counter\n",
        "\n",
        "# Tally how many rows have each word as their most common token; keep the top 10.\n",
        "one = df['most_common'].to_list()\n",
        "top_pairs = Counter(one).most_common(10)\n",
        "\n",
        "# rslt is indexed by word; the popular() helper in the next cell reads rslt.index.\n",
        "rslt = pd.DataFrame(top_pairs, columns=['Word', 'Frequency']).set_index('Word')\n",
        "\n",
        "# End on the frame itself so the notebook renders it as a table.\n",
        "rslt"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "         Frequency\n",
            "Word              \n",
            "makeup         732\n",
            "late           342\n",
            "cat            316\n",
            "trailer        257\n",
            "show           219\n",
            "news           206\n",
            "movie          202\n",
            "react          188\n",
            "star           188\n",
            "food           165\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wPL-ckQVHiR1",
        "outputId": "2c746f52-850d-4206-b558-671a8f3fa178"
      },
      "source": [
        "def popular(List_1):\n",
        "    \"\"\"Return True when ``List_1`` is one of the words in the index of ``rslt``.\n",
        "\n",
        "    Despite the parameter name, the caller in this notebook passes a single\n",
        "    value of ``df['most_common']`` rather than a list.\n",
        "\n",
        "    Args:\n",
        "        List_1: the word to look up.\n",
        "\n",
        "    Returns:\n",
        "        bool: whether the word is a label of ``rslt.index``.\n",
        "    \"\"\"\n",
        "    # Test membership directly on the Index instead of building a list per call.\n",
        "    return List_1 in rslt.index\n",
        "\n",
        "# Flag rows whose most common token is one of the top words held in rslt.\n",
        "is_popular = df['most_common'].apply(popular)\n",
        "df['popular_word'] = is_popular\n",
        "\n",
        "df['popular_word'].value_counts()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "False    38134\n",
              "True      2815\n",
              "Name: popular_word, dtype: int64"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 23
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 513
        },
        "id": "m4eqJ3kBH6eW",
        "outputId": "e16bb1f5-180c-48e7-f103-c7b554de2f2e"
      },
      "source": [
        "# Preview the first rows, which now include the popular_word column.\n",
        "df.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>video_id</th>\n",
              "      <th>trending_date</th>\n",
              "      <th>title</th>\n",
              "      <th>channel_title</th>\n",
              "      <th>category_id</th>\n",
              "      <th>publish_time</th>\n",
              "      <th>tags</th>\n",
              "      <th>views</th>\n",
              "      <th>likes</th>\n",
              "      <th>dislikes</th>\n",
              "      <th>comment_count</th>\n",
              "      <th>thumbnail_link</th>\n",
              "      <th>comments_disabled</th>\n",
              "      <th>ratings_disabled</th>\n",
              "      <th>video_error_or_removed</th>\n",
              "      <th>description</th>\n",
              "      <th>text</th>\n",
              "      <th>tokens</th>\n",
              "      <th>most_common</th>\n",
              "      <th>popular_word</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>2kyS6SvSYSE</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>WE WANT TO TALK ABOUT OUR MARRIAGE</td>\n",
              "      <td>CaseyNeistat</td>\n",
              "      <td>22</td>\n",
              "      <td>2017-11-13T17:13:01.000Z</td>\n",
              "      <td>SHANtell martin</td>\n",
              "      <td>748374</td>\n",
              "      <td>57527</td>\n",
              "      <td>2966</td>\n",
              "      <td>15954</td>\n",
              "      <td>https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>SHANTELL'S CHANNEL - https://www.youtube.com/s...</td>\n",
              "      <td>WE WANT TO TALK ABOUT OUR MARRIAGESHANtell mar...</td>\n",
              "      <td>[want, talk, marriageshantell, martincaseyneis...</td>\n",
              "      <td>want</td>\n",
              "      <td>False</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1ZAPwfrtAFY</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>The Trump Presidency: Last Week Tonight with J...</td>\n",
              "      <td>LastWeekTonight</td>\n",
              "      <td>24</td>\n",
              "      <td>2017-11-13T07:30:00.000Z</td>\n",
              "      <td>last week tonight trump presidency|\"last week ...</td>\n",
              "      <td>2418783</td>\n",
              "      <td>97185</td>\n",
              "      <td>6146</td>\n",
              "      <td>12703</td>\n",
              "      <td>https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>One year after the presidential election, John...</td>\n",
              "      <td>The Trump Presidency: Last Week Tonight with J...</td>\n",
              "      <td>[trump, presidency, last, week, tonight, john,...</td>\n",
              "      <td>trump</td>\n",
              "      <td>False</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>5qpjK5DgCt4</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>Racist Superman | Rudy Mancuso, King Bach &amp; Le...</td>\n",
              "      <td>Rudy Mancuso</td>\n",
              "      <td>23</td>\n",
              "      <td>2017-11-12T19:05:24.000Z</td>\n",
              "      <td>racist superman|\"rudy\"|\"mancuso\"|\"king\"|\"bach\"...</td>\n",
              "      <td>3191434</td>\n",
              "      <td>146033</td>\n",
              "      <td>5339</td>\n",
              "      <td>8181</td>\n",
              "      <td>https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>WATCH MY PREVIOUS VIDEO ▶ \\n\\nSUBSCRIBE ► http...</td>\n",
              "      <td>Racist Superman | Rudy Mancuso, King Bach &amp; Le...</td>\n",
              "      <td>[racist, superman, rudy, mancuso, king, bach, ...</td>\n",
              "      <td>rudy</td>\n",
              "      <td>False</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>puqaWrEC7tY</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>Nickelback Lyrics: Real or Fake?</td>\n",
              "      <td>Good Mythical Morning</td>\n",
              "      <td>24</td>\n",
              "      <td>2017-11-13T11:00:04.000Z</td>\n",
              "      <td>rhett and link|\"gmm\"|\"good mythical morning\"|\"...</td>\n",
              "      <td>343168</td>\n",
              "      <td>10172</td>\n",
              "      <td>666</td>\n",
              "      <td>2146</td>\n",
              "      <td>https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>Today we find out if Link is a Nickelback amat...</td>\n",
              "      <td>Nickelback Lyrics: Real or Fake?rhett and link...</td>\n",
              "      <td>[nickelback, lyric, real, fake, rhett, link, g...</td>\n",
              "      <td>nickelback</td>\n",
              "      <td>False</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>d380meD0W0M</td>\n",
              "      <td>17.14.11</td>\n",
              "      <td>I Dare You: GOING BALD!?</td>\n",
              "      <td>nigahiga</td>\n",
              "      <td>24</td>\n",
              "      <td>2017-11-12T18:01:41.000Z</td>\n",
              "      <td>ryan|\"higa\"|\"higatv\"|\"nigahiga\"|\"i dare you\"|\"...</td>\n",
              "      <td>2095731</td>\n",
              "      <td>132235</td>\n",
              "      <td>1989</td>\n",
              "      <td>17518</td>\n",
              "      <td>https://i.ytimg.com/vi/d380meD0W0M/default.jpg</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>False</td>\n",
              "      <td>I know it's been a while since we did this sho...</td>\n",
              "      <td>I Dare You: GOING BALD!?ryan|\"higa\"|\"higatv\"|\"...</td>\n",
              "      <td>[dare, going, bald, ryan, higa, higatv, nigahi...</td>\n",
              "      <td>dare</td>\n",
              "      <td>False</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "      video_id trending_date  ... most_common popular_word\n",
              "0  2kyS6SvSYSE      17.14.11  ...        want        False\n",
              "1  1ZAPwfrtAFY      17.14.11  ...       trump        False\n",
              "2  5qpjK5DgCt4      17.14.11  ...        rudy        False\n",
              "3  puqaWrEC7tY      17.14.11  ...  nickelback        False\n",
              "4  d380meD0W0M      17.14.11  ...        dare        False\n",
              "\n",
              "[5 rows x 20 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 24
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_2jB31r5H-DN"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}