{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# TF-IDF for tabular data featurization and classification\n", "\n", "This notebook describes applying TF-IDF to database columns. It generates some synthetic data, featurizes the data, applies TF-IDF and visualizes the results with t-SNE. There is a corresponding blog post at [LINK HERE]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Load some libraries and set some plotting configurations" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "%matplotlib inline\n", "from matplotlib import pyplot as plt" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "sns.set()\n", "sns.set_style(\"whitegrid\")\n", "sns.set_context(\"paper\", \n", " rc={\"font.size\":8,\"axes.labelsize\":10,\n", " \"xtick.labelsize\":10, \"ytick.labelsize\":10})" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.manifold import TSNE" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from collections import OrderedDict, Counter" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from faker import Faker\n", "\n", "fake = Faker()\n", "# Recent Faker releases seed through the `Faker.seed()` class method and reject\n", "# instance-level seeding; older releases only offer `fake.seed()`.\n", "try:\n", "    Faker.seed(101)\n", "except AttributeError:\n", "    fake.seed(101)\n", "# The data factories and the column simulator also draw from NumPy's global RNG,\n", "# so seed it too; otherwise the synthetic data differs on every run.\n", "np.random.seed(101)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Some useful functions and variables" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "synthetic_data_factories = OrderedDict({\"email\": fake.email, \n", " \"phone_number\": fake.phone_number, \n", " \"float_number\": lambda: str(np.random.uniform(-180, 180)), \n", " \"binary_number\": lambda: str(np.random.randint(0, 2)),\n", " \"UUID\": 
fake.uuid4})" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "placeholders = OrderedDict({\"-\": 1, \"\": 1, \"0\": 1})" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def simulated_column_data(factory, num_rows, placeholders, placeholder_probability=0):\n", "    \"\"\"Simulate one column of `num_rows` cell values.\n", "\n", "    Each cell is a placeholder with probability `placeholder_probability`,\n", "    drawn from the keys of `placeholders` in proportion to their weights;\n", "    otherwise it is whatever `factory()` returns.\n", "    \"\"\"\n", "    placeholder_values = list(placeholders.keys())\n", "    # Normalise the weights once instead of re-summing them for every element.\n", "    total_weight = sum(placeholders.values())\n", "    placeholder_selection_frequency = [\n", "        weight / total_weight for weight in placeholders.values()\n", "    ]\n", "    column = []\n", "    for _ in range(num_rows):\n", "        is_placeholder = np.random.uniform(0, 1) < placeholder_probability\n", "        if is_placeholder:\n", "            # Draw a placeholder only when one is needed, so an empty\n", "            # `placeholders` mapping is fine when the probability is 0.\n", "            cell_value = np.random.choice(\n", "                placeholder_values,\n", "                p=placeholder_selection_frequency)\n", "        else:\n", "            cell_value = factory()\n", "        column.append(cell_value)\n", "    return column" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Generate the synthetic data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "placeholder_prob = 0.9\n", "columns_per_class = 20\n", "rows_per_column = 100\n", "\n", "dataframe_list = []\n", "labels = []\n", "for data_class, synthetic_data_factory in synthetic_data_factories.items():\n", "    for column_idx in range(columns_per_class):\n", "        labels.append(data_class)\n", "        column_df = pd.DataFrame(\n", "            {\"column_name\": \"{}{:03d}\".format(data_class, column_idx+1),\n", "             \"data_class\": data_class,\n", "             \"cell_value\": simulated_column_data(\n", "                 synthetic_data_factory,\n", "                 rows_per_column,\n", "                 placeholders,\n", "                 placeholder_prob\n", "             )\n", "            }\n", "        )\n", "        dataframe_list.append(column_df)\n", "# ignore_index gives every row a unique label instead of repeating\n", "# 0..rows_per_column-1 once per simulated column.\n", "df = pd.concat(dataframe_list, axis=0, ignore_index=True)\n", "labels = np.array(labels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The most common values. 
Note that values are typically very common (placeholders) or unique (authentic data)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('0', 3083),\n", " ('', 3057),\n", " ('-', 2950),\n", " ('1', 102),\n", " ('sanfordsandra@yahoo.com', 1),\n", " ('walkerkeith@hotmail.com', 1),\n", " ('kjones@carrillo.com', 1),\n", " ('richard31@torres-rodriguez.net', 1),\n", " ('nicole66@gmail.com', 1),\n", " ('robert73@reed-johnson.info', 1)]" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Counter(df.cell_value).most_common(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Featurizers" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import re\n", "import uuid\n", "import phonenumbers\n", "\n", "def regex_feature(value, patt):\n", "    \"\"\"Return True if `patt` matches the stripped, lower-cased `value`.\"\"\"\n", "    return re.search(patt, value.strip().lower()) is not None\n", "\n", "def matches_email(value):\n", "    \"\"\"Return True if `value` looks like an email address (`@` or ` at ` forms).\"\"\"\n", "    patt = (\n", "        r\"^([a-z0-9!#$%&'*+\\/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+\\/=?^_`{|}~-]+)*\"\n", "        # Doesn't start with a period\n", "        r\"(@|\\sat\\s)\" # requires `at` clause\n", "        r\"(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\" # domain name doesn't start or end with a hyphen\n", "        r\"(\\.|\\sdot\\s))+\" # one or more second level domains\n", "        r\"[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)$\" # top level domain doesn't start or end in a hyphen\n", "    )\n", "    return regex_feature(value, patt)\n", "\n", "def matches_uuid(value):\n", "    \"\"\"Return True if `uuid.UUID` accepts `value`.\"\"\"\n", "    try:\n", "        uuid.UUID(value)\n", "        return True\n", "    except ValueError:\n", "        return False\n", "\n", "def matches_phone_number(value):\n", "    \"\"\"Return True if `value` parses as a valid phone number, trying the US region first.\"\"\"\n", "    regions = [\"US\", None]\n", "    for region in regions:\n", "        try:\n", "            phone_number = phonenumbers.parse(value, region)\n", "            if phonenumbers.is_valid_number(phone_number):\n", "                return True\n", "        except phonenumbers.NumberParseException:\n", "            # Not parseable for this region; try the next one.\n", "            pass\n", "    return False\n", "\n", "def is_float(value):\n", "    \"\"\"Return True if `float(value)` succeeds.\"\"\"\n", "    try:\n", "        _ = 
float(value)\n", "        return True\n", "    except (TypeError, ValueError):\n", "        # Only conversion failures mean \"not a float\"; a bare except would\n", "        # also swallow KeyboardInterrupt and SystemExit.\n", "        return False\n", "\n", "def is_int(value):\n", "    \"\"\"Return True if `int(value)` succeeds.\"\"\"\n", "    try:\n", "        _ = int(value)\n", "        return True\n", "    except (TypeError, ValueError):\n", "        return False\n", "\n", "def is_len1(value):\n", "    \"\"\"Return True if `value` has exactly one character.\"\"\"\n", "    return len(value) == 1\n" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "featurizers = OrderedDict({\n", "    \"email\": matches_email,\n", "    \"uuid\": matches_uuid,\n", "    \"phone_number\": matches_phone_number,\n", "    \"is_float\": is_float,\n", "    \"is_int\": is_int,\n", "    \"is_len1\": is_len1\n", "})" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [], "source": [ "def featurize(value, featurizers):\n", "    \"\"\"Apply every featurizer to `value`; returns an OrderedDict of feature name -> bool.\"\"\"\n", "    return OrderedDict({feature_name: feature_fn(value) for feature_name, feature_fn in featurizers.items()})\n", "\n", "def stringify_features(value, featurizers):\n", "    \"\"\"Encode the features of `value` as a string of 0/1 characters, one per featurizer.\"\"\"\n", "    return ''.join([str(int(binary_feature)) for binary_feature in featurize(value, featurizers).values()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here are a few examples of featurization" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OrderedDict([('email', False),\n", " ('uuid', False),\n", " ('phone_number', False),\n", " ('is_float', False),\n", " ('is_int', False),\n", " ('is_len1', False)])" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "featurize(\"---\", featurizers)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OrderedDict([('email', False),\n", " ('uuid', False),\n", " ('phone_number', False),\n", " ('is_float', False),\n", " ('is_int', False),\n", " ('is_len1', True)])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "featurize(\"-\", featurizers)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { 
"text/plain": [ "OrderedDict([('email', False),\n", " ('uuid', False),\n", " ('phone_number', True),\n", " ('is_float', True),\n", " ('is_int', True),\n", " ('is_len1', False)])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "featurize(\"18005551212\", featurizers)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "An example of stringifying a feature set" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'001110'" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "stringify_features(\"18005551212\", featurizers)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Apply the featurizers to the data and update the data frame. The Boolean features are turned into strings of 0 and 1 values which then form the **words** of our document set." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "df.loc[:, \"word\"] = df.cell_value.apply(lambda x: stringify_features(x, featurizers))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The most common feature words. Note that the bifurcation into placeholders and unique values has been removed." 
] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('000000', 3186),\n", " ('000111', 3185),\n", " ('000001', 2950),\n", " ('000100', 209),\n", " ('100000', 202),\n", " ('010000', 191),\n", " ('001000', 56),\n", " ('000110', 21)]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Counter(df.word).most_common()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Generating documents\n", "\n", "Here we generate documents composed of feature words " ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def column_documents(frame, value_column, unique=False):\n", "    \"\"\"Return one document (a list of terms) per simulated column.\n", "\n", "    Documents follow the order in which each `column_name` first appears in\n", "    `frame`. With `unique=True` a document keeps only its sorted distinct\n", "    terms, which gives a binary term frequency.\n", "    \"\"\"\n", "    # One grouped pass replaces a string-built `query` scan per column.\n", "    grouped = frame.groupby(\"column_name\", sort=False)[value_column]\n", "    if unique:\n", "        return [list(np.unique(terms)) for _, terms in grouped]\n", "    return [list(terms) for _, terms in grouped]\n", "\n", "# for actual terms\n", "docs_value = column_documents(df, \"cell_value\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# for feature words\n", "docs_words = column_documents(df, \"word\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# for unique values, i.e. binary term frequency\n", "docs_unique = column_documents(df, \"word\", unique=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TF-IDF vectorization" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "# need an identity function to pass to TF-IDF \n", "# since we've already parsed the terms into documents\n", "def identity_fun(x):\n", "    return x" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we define the TF-IDF vectorizer. 
The relevant arguments are:\n", "\n", "* `analyzer`: we analyze at the word level, as opposed to character n-gram level\n", "* `preprocessor`: function to preprocess the values. Because we've already processed them, the identity function is used.\n", "* `tokenizer`: function to tokenize the values. Because we've already tokenized them, the identity function is used.\n", "* `token_pattern`: regular expression to tokenize the inputs. Because we've already tokenized them, it is set to `None`.\n", "* `sublinear_tf`: Boolean to apply sublinear (logarithmic) term-frequency. A binary term-frequency is more appropriate for this problem, so this is left `False` and the binarization is applied in a preprocessing step above (the `docs_unique` documents).\n", "* `smooth_idf`: Boolean to apply smooth inverse-document-frequency. " ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "# Instantiate the TF-IDF vectorizer\n", "tfidf_mod = TfidfVectorizer(\n", " analyzer='word',\n", " preprocessor=identity_fun,\n", " tokenizer=identity_fun,\n", " token_pattern=None,\n", " sublinear_tf=False,\n", " smooth_idf=True\n", ")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# generate the TF-IDF vectors for each of the three variations of document\n", "tfidf_value = tfidf_mod.fit_transform(docs_value)\n", "tfidf_words = tfidf_mod.fit_transform(docs_words)\n", "tfidf_unique = tfidf_mod.fit_transform(docs_unique)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Visualizing results with t-SNE plotting\n", "\n", "Here we apply t-distributed stochastic neighbor embedding (t-SNE) to the TF-IDF vectors to visualize similarities. This algorithm provides a way of visualizing similarities between points in a high dimensional space by projecting them to a lower dimensional space (e.g. 2-dimensions) while approximately maintaining relative distances. 
" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# t-SNE is stochastic; fix random_state so the embeddings are reproducible\n", "tsne_mod = TSNE(metric=\"cosine\", perplexity=20, random_state=101)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The actual-terms documents" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# toarray() yields an ndarray; todense() yields np.matrix, which current scikit-learn rejects\n", "Y = tsne_mod.fit_transform(tfidf_value.toarray())" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "pl_df = pd.DataFrame(Y).rename(columns={0: \"x1\", 1: \"x2\"})\n", "pl_df = pl_df.assign(data_class = labels)\n", "pl_df.to_csv(\"tsne_terms.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "sns.scatterplot(x=\"x1\", y=\"x2\", hue=\"data_class\", style=\"data_class\", data=pl_df)\n", "plt.title(\"t-SNE: document actual terms\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The feature-words documents" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# toarray() yields an ndarray; todense() yields np.matrix, which current scikit-learn rejects\n", "Y = tsne_mod.fit_transform(tfidf_words.toarray())" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "pl_df = pd.DataFrame(Y).rename(columns={0: \"x1\", 1: \"x2\"})\n", "pl_df = pl_df.assign(data_class = labels)\n", "pl_df.to_csv(\"tsne_tfidf.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "a = sns.scatterplot(x=\"x1\", y=\"x2\", hue=\"data_class\", style=\"data_class\", data=pl_df)\n", "plt.title(\"t-SNE: document feature words\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The feature-words documents with sub-linear (binary) term frequencies" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "Y = tsne_mod.fit_transform(tfidf_unique.todense())" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "pl_df = pd.DataFrame(Y).rename(columns={0: \"x1\", 1: \"x2\"})\n", "pl_df = pl_df.assign(data_class = labels)\n", "pl_df.to_csv(\"tsne_tfidf_uniq.csv\", index=False)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEWCAYAAACaBstRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAIABJREFUeJzt3XlAVOX+BvBnhh0RwQUQITVcoJ/inqG5hCEqi7hrSWF6SzMNSc0lc8vMtFK5N8MlTa6lpqJXzRaxzCw3MlFDc0EFFUYUQRiY9f39QUwi20GBM8Lz+YfhrN9558w8c7Z3FEIIASIiIgmUchdARESPD4YGERFJxtAgIiLJGBpERCQZQ4OIiCRjaBARkWQMjQrIzs7G119/Xer477//HqGhoQgJCUFoaCj++OMPAEB4eDjCw8NN0+3YsQPLli0DAERHR6NXr14YOHAgBg4ciLFjx1aopuHDhyM1NfUhnk312b9/P65evVriuD179qB///6IjIys1OWao8p6rU6fPm3afh5sA39/f2g0mjLnDw8Px6VLl4oNnz17drVtS0lJSfj111+rZV2Pso1RcZZyF/A4KQyNYcOGFRun0+nwwQcfYNu2bahfvz7u3r0LrVZrGn/16lUkJibC19e32Lzjx4/HqFGjqrR2Oe3fvx/W1tZo2rRpsXFxcXH45JNP4O3tXanLLY1er4elZdVv9lW5nrZt26Jt27YAHq4NSrNo0aJHXobRaAQAKJVlfx9NSkrC5cuX0a1bN8nLftg2LW0bq65toaZhi1XAypUr8ddff2HgwIEICQnBuHHjTONycnIghICDgwMAwMnJqci8ERERWLNmDaKjox+pBoPBgLlz5yIhIQGtW7eGTqczjfv000/xzTffQKFQYMqUKfD394dOp8PixYtx/PhxKBQKvPXWW/Dy8kJUVBS2bt0KAJgxYwYGDBiAnj17onv37ujfvz8OHTqE9u3bo1+/fvj000+Rn5+PVatWwcPDAyqVCu+++y5UKhXs7e2xePFieHp6Ijw8HL6+vvjtt98ghMCnn36KjIwMHDhwACdOnECdOnWwadMmUxutXr0av//+O6ZMmYJBgwbhhRdewNy5c3HlyhUolUrMn
TsXbdq0wf79+xETEwOdTgdXV1csW7YMV65cKbbc0NBQ7Nu3DzY2NtixYwcuX76MqVOnIjw8HB06dMDx48fx8ssvo1WrVliwYAGys7PRoEEDfPjhh3B2dja1Y0JCAr7++mt88MEHWL9+PXbv3o0dO3bg4MGDOHToEN555x1s374d69evB1DwzX3EiBE4evQo1qxZAzs7O6jVaqxevbrE10qlUmHy5MnIy8uD0WjE8uXL4eXlZVr/qlWr4OrqisGDB2P8+PFo3rw53n77bXz44Ydo37496tWrh82bN+OVV14p1gYAEBMTg/3796Nu3bqIiYkxtff9tmzZgiNHjsDe3h4rVqyAq6srwsPDMW/ePNjY2GDSpEnw8vLCmTNn0LVrV8yfPx8AMGfOHJw9exYajQbDhg1DREQEAKBbt24YOHAgjhw5gr59+0Kr1eLNN98EACxcuBCdO3dG//79TetfuXIltFotDh06hJkzZ+LJJ58sdZu6/7XbtGkT2rZti99++w1KpRLz5s3DsmXLkJqaijlz5qB3795FnueD25hGo8GtW7dw5coVtG3bFhMmTChxm0tOTsZbb70Fo9EIPz8/nDlzBrGxsYiOjkbDhg1NX/K6d++Ow4cPm95np06dgsFgQFRUFHr27Ino6Gikp6fj0qVLuHXrFubNm4dnn322xPflsWPH0Lx5cwwdOhRAwZfJ119/vcQvmrISJFlKSooYNmxYqeOjoqJEjx49xKxZs8TPP/9sGj569Ghx8eJFERoaKi5fviy2b98uli5dKoQQYuXKlaJnz54iNDRUhIaGijlz5gghhEhMTBSzZs0qto69e/eKSZMmCSGESEpKEt7e3iIlJUWcOnVKDBs2TGg0GpGeni6ee+45kZOTI2JjY8WMGTOE0WgURqNRZGVlFXseb7/9tjh48KAQQohWrVqJhIQEYTQaxaBBg8R7770nhBBiw4YN4qOPPhJCCBEZGSn+/PNPIYQQx44dM9UzevRo8e9//1sIIcS6devExx9/XGz5DypsGyGEWLJkidi/f78QQohr166JoUOHCiGEuHv3rjAajUIIIT7//HMRExNT4nKfe+45kZ+fL4QQRdp49OjRYtmyZabpIiIixI0bN4QQQsTFxYkPPvigSE0ajUYEBQUJIYSYNGmSGDx4sMjNzRUff/yx2LNnj0hLSxN9+/YV2dnZ4t69eyIwMFCkpKSII0eOiM6dO4tbt26V+VqtW7dOrFixQgghhFarFXl5eUXWf+TIETF79mwhhBCjRo0SL730kumxSqUSR44cEZGRkaW2QVxcnBBCiAULFogtW7aU2OaFbbNt2zYxc+bMIq9FSkqKaNOmjbhy5YrQ6/UiNDRUJCcnCyGEyMzMNLXRsGHDRFpamhCiYLsp3OazsrJE//79hdFoFDqdTgQEBJhel0L3vz5ClL1N3f/ajR49WqxatUoIIcSiRYvEsGHDRH5+vkhKShIjRowo9lzvf15CFLzfXnjhBaHVaoUQpW9z48aNEwcOHBBCCPHBBx+I0aNHm+b/8ssvTcvu1q2bEEKI//73v+KLL74QQghx584d0/NfuXKlGDNmjNDr9eL06dOmGkt6X168eNG0ntu3b4vg4OASn4/cuKdRiT766CMkJiaavj1NmjQJI0aMMI0fM2YM1q1bh44dOxaZr6TDU/cfgrjfyZMnTd/YvL298eSTTwIAfv/9dwQEBMDa2houLi546qmncPHiRRw5cgTjxo2DQqEAADg6OiI7O7vU51CvXj1TfS1atMAzzzwDAGjVqhW2b98OADh69CguX75smsfOzs70uE+fPgAAHx8f0/RSHTlyBL/88gtWrlwJALh37x4A4ObNm4iMjERGRgby8/OLtZ8UgYGBAAr2CE+ePInx48cDKDiccv+3fACwtraGg4MDMjIykJmZid69e+PUqVM4efIkRowYgdOnT8PPzw9169YFAPTq1QuJiYlo0KABOnXqhIYNGwIo/bVq06YNZs6cC
QsLC/Tr16/Y+n19fbFo0SJcvnwZLVq0wPXr15GTk4OsrCw0atSoSNuX5P7XICUlpcRp+vbta2qXNWvWFBvv5eVlOuTVunVr3LhxA82aNcPevXuxbds26PV6pKenIzk5Ga6urrC3t0ePHj0AFGxjrVq1QkJCArKzs/H000/DxsamzJrL2qYKX7tChXsTrVq1gkKhgI2NDVq1aoUbN26UuY5Cffr0gZWVFYDSt7lz587hueeeAwD0798fZ86cKXOZR44cwcWLF03bvFqtRkZGBoCC7cPCwgJPPfWUqcaS3peOjo7QarW4ceMG4uPjMWDAAEnPp7oxNB7B9OnTcf78ebRp08Z0PNjX1xe+vr7w8vJCXFxckdAIDg7GqlWr4Onp+dDrFA90FXb//4Ub4P3DhRBFhgOAhYWF6dgzgCLnXgrfTEDBcenC/5VKJQwGg+lxXFxcicetS5q+Is9tzZo1cHV1LTL8vffew8SJE+Hn54cDBw5g9+7dJc6vVCpNz+v+5wQU/RBydXXFrl27yqylQ4cO2LVrFzw8PNCpUyccP34cKpUK7u7uOHv2bIlt/eB6Snutnn76aWzatAkHDhzAxIkTMXfuXPj5+RVZhrW1NQ4cOICOHTvC2dkZmzdvho+PT5k1F7K2tja1R2mvwf31P7h93L+MwuXo9XqkpKRg8+bN+Oqrr+Dg4IBJkyaZ2tne3r7I/IMGDcL//vc/ZGVlYfTo0eXWXNY2dX+bAkW3sZK2t5Lel6Utr7Rtrqw6S9rGhBBYtGhRiV9oSno9SnpfAkBoaCh2796N/fv345NPPpFUU3Xj1VMVYG9vj9zcXNP/H374IXbt2oVFixYhNzcXJ06cMI27ePEi3NzcisxvaWmJF154AV9++eVD19CxY0fs27cPQMG3oeTkZNPwH374ATqdDrdu3UJSUhJatGgBPz8/bNmyBUIICCFMx/HT0tKgVqtx7949JCQkVKiGTp06Ydu2bQAKvqlfuHChzOkfbLfS+Pn5FWmbc+fOASjYO3B1dYXRaCzyYf/gct3d3ZGUlASj0YiDBw+WuA4HBwc4Ojril19+AVDwxi/pm3unTp3wxRdfoGPHjmjXrh3i4uLQunVrAAVfDH799Vfk5OQgNzcXhw4dKvG4c2mv1fXr19GoUSO88MIL6N+/P86fP19s3g4dOmDjxo3o2LEjOnbsiI0bN6JDhw7FppPatg/67rvvTH9LWm5JcnNzUadOHdSpUwdpaWn47bffSp322WefxYkTJ/DXX3+hc+fO5dZd0W2qLPe/L8tT2jbn7e2NH3/8EQBMryEANGnSxDRN4fjC5Xz11VemQCmcpqz1Pvi+BAq+WG7ZsgU2Njbw8PCQ8nSrHUOjAurXr48WLVogJCQEa9euLTJOCIFVq1ahX79+CAkJwalTp/DGG28UW8awYcOQn59fZNhnn31muuS28Mqs06dPY/bs2cXmDwwMRN26dRESEoIvvvjC9O3T19cXvXr1wqBBg/DKK6/gnXfeQZ06dTBixAhYW1sjJCQEAwcOxMmTJ2FtbY2XX34ZgwYNwvTp000fhlLNmTMHP/30E0JDQxEcHFzupZP9+/fHypUrMXDgQOTk5JQ63cSJE5Geno6QkBAMGDDAtEfxxhtv4NVXX8Xo0aPRuHHjUpc7YcIEzJgxAy+//DIaNWpU6nqWLl2KNWvWIDQ0FIMHDy7xDd6hQwekp6ejY8eOqFOnDpycnEwfrq6urvjXv/6FkSNHYsSIEYiIiCjxDV7aa3Xs2DGEhoYiLCwMJ0+eRGhoaLF5O3bsCIPBgCeeeAIdOnTArVu3SvwWK7VtH6RQKDBs2DB8/fXXphPW5fH29kbTpk0RHByMefPmoVOnTqVOa2FhAT8/P/Tt27fEb9RPP/00Tp06ZTp5XtFtqrKUts3NmjUL0dHRGD58eJG9roCAAFy5cgVDhgwpEmwjR45EgwYNMHDgQAQFBRX7fHhQSe9LoODwcKtWrTBw4MAqeLaVQyEe3
IcmIqoEI0aMwJIlS9CsWTO5S3kkly5dwrx58xAbG1vl69JqtQgNDcXXX39tOmdmbrinQUSV6vbt2wgICECbNm0e+8CoTklJSejXrx+GDh1qtoEBcE+DiIgqgHsaREQkGUODiIgkkzU0Zs6cCT8/PwQHB5uGRUdHo0ePHqariUq7dJKIiKqfrOc0jh8/Dnt7e7z99tvYs2cPgILQsLe3r3BvrwAqfL8BERGhzMunHyTrHeFdunSp9K6YK/LkzUFSUpLkO31rK7ZR2dg+5WMbla6iX7bN8pzGpk2bEBISgpkzZyIrK0vucoiI6G+yX3KbmpqK8ePHmw5PZWRkwNnZGQqFAitWrIBKpcLixYslLSshIaFYHzjmLj8/H7a2tnKXYdbYRmVj+5SPbVQ6tVr9+ByeKklhD6FAQZcbhb2RSvW47YJyt7l8bKOysX3KxzYq3WN/eEqlUpke79+/Hy1btpSxGiIiup+sexpRUVE4duwYMjMz0bNnT0yaNAnHjh0zdSDXpEkTLFiwQM4SiYjoPrKGxscff1xsWEm/v01EREXl5ulgFEbYWlnAyqr6PsrN7pxGdbunyYXOoIPG8M8PqiiggLWFFZQKBZzs6slYHRFRURqNDndzdfj8f2eg0Rkwqq837Gwt4FTXBo72Zf9CYmUwu3Ma1eluXhZytLnQGLQwCiOMwggFFJj8zbu4maNCvl6Le5qK/8ANEVFVyVbrsGzTCfx6+iYSzqmwYN0RZGZroNEYkJunLX8Bj6hW72nk6TV485u5RYatGDAfH/d7F3ZWtlBAgTx9Pura1JGpQiKif2TcVeNq2j2o7uSZhmXnamGhVOCeWgelUoE6dmUsoBLU6j2NktzJu4uobxcgX68pcsiKiEhuRgH8mngDQ/1bmIb17foEbmTkwkKpgN5grPIaavWeRlmEEEDxX6kkIpKNhVKBy9ez0KtDE/xn2nNQKBQwGI1Q5+mh1Rtga1P1H+m1OjTsLG2wYsB8AAV7GACg0RfsXWRp7qGBnbNstRERPai+oy1mv/I0rqty8On2RPx1LRN2Npa4p9bi31Ofg41V1R88qtWh4WRXD3n3VLiTdxfzf/ykxGnsLNn1ABGZB4VCAUulEkqlAmND2+CjTQm4m6PBpGHt4WBvBae6Vf95VatDAyjY22js4GLa4wCKXnLLk+BEZE6cHW1ha22BPK0eC17zAwDYWlnA0aHqL7cFGBq8D4OIHjt2tlaws7WSZd28eoqIiCRjaBARkWQMDSIikoyhQUREkjE0iIhIMoYGERFJxtAgIiLJGBpERCQZQ4OIiCRjaBARkWQMDSIikoyhQUREkjE0iIhIMoYGERFJJmtozJw5E35+fggODjYNu3v3LsaMGYO+fftizJgxyMrKkrFCIiK6n6yhMXjwYKxdu7bIsNWrV8PPzw/ff/89/Pz8sHr1apmqIyKiB8kaGl26dEG9ekV/BCk+Ph5hYWEAgLCwMOzfv1+O0oiIqARmd07j9u3bcHFxAQC4uLjgzp07MldERESFatzPvSYlJcldQoXk5+c/djVXN7ZR2dg+5WMbVR6zC40GDRpApVLBxcUFKpUK9evXr9D8Pj4+VVRZ1UhKSnrsaq5ubKOysX3KxzYqXUJCQoWmN7vDU/7+/ti5cycAYOfOnejTp4/MFRERUSFZQyMqKgojR45EcnIyevbsia+//hqvvvoqDh8+jL59++Lw4cN49dVX5SyRiIjuI+vhqY8//rjE4V988UU1V0JERFKY3eEpIiIyXwwNIiKSjKFBRESSMTSIiEgyhgYREUnG0CAiIskYGkREJBlDg4iIJGNoEBGRZAwNIiKSjKFBRESSMTSIiEgyhgYREUnG0CAiIskYGkREJBlDg4iIJGNoEBGRZAwNIiKSjKFBRESSMTSIiEgyhgYREUnG0CAiIskYGkREJJml3AWUxt/fH3Xq1IFSqYSFhQV27Nghd0lERLWe2YYGAHzxxReoX7++3GUQEdHfeHiKiIgkM
+vQGDt2LAYPHowtW7bIXQoREQFQCCGE3EWUJD09Ha6urrh9+zbGjBmDOXPmoEuXLmXOk5CQAHt7+2qqsHLk5+fD1tZW7jLMGtuobGyf8rGNSqdWq9GpUyfJ05vtOQ1XV1cAQIMGDRAQEIDExMRyQwMAfHx8qrq0SpWUlPTY1Vzd2EZlY/uUj21UuoSEhApNb5aHp9RqNXJyckyPDx8+jJYtW8pcFRERmeWexu3btzFx4kQAgMFgQHBwMHr27ClzVUREZJah4enpif/9739yl0FERA8wy8NTRERknhgaREQkGUODiIgkY2gQEZFkDA0iIpKMoUFERJIxNIiISDKGBhERScbQICIiyRgaREQkGUODiIgkY2gQEZFkDA0iIpKMoUFERJIxNIiISDKGBhERScbQICIiyRgaREQkGUODiIgkY2gQEZFkDA0iIpKMoUFERJKZbWj8/PPPCAwMREBAAFavXi13OUREBDMNDYPBgAULFmDt2rXYu3cv9uzZg4sXL8pdFhFRrWeWoZGYmIimTZvC09MT1tbWCAoKQnx8vNxlERHVemYZGunp6XBzczP97+rqivT0dBkrIiIiALCUu4CSCCGKDVMoFJLmTUpKquxyqlR+fv5jV3N1YxuVje1TPrZR5THL0HBzc0NaWprp//T0dLi4uEia18fHp6rKqhJJSUmPXc3VjW1UNrZP+dhGpUtISKjQ9GZ5eKpt27a4cuUKUlJSoNVqsXfvXvj7+8tdFhFRrWeWexqWlpZ49913MW7cOBgMBgwZMgQtW7aUuywiolrPLEMDAHr16oVevXrJXQYREd3HLA9PERGReWJoEBGRZGWGRk5ODq5du1Zs+Llz56qsICIiMl+lhsY333yDfv36YdKkSQgKCkJiYqJp3MyZM6ulOCIiMi+lngiPiYnBjh074OLigsTEREyfPh1RUVHo27dviTffEZH8tJmZpsfCKKC0toJV3boyVkQ1TamhYTQaTTfU+fr6YuPGjRg/fjzS0tIk351NRNVHm5kJQ14+oFBAl5kJGxcXGHV6aDMzYe3sLHd5VEOUeniqTp06Rc5nuLi4YOPGjYiPj8eFCxeqpTgikqYwMPJu3ITCQglLx7rIT09HfloaDHn50GXfk7tEqiFKDY158+bBaDQW6ZLcwcEBa9euxXvvvVctxRGRdMJoBAAYtTrkp6kAoxEGdR5+n/AG9Lm5MldHNUWpoeHt7Y1mzZohMjISq1evhhAC+fn5+OCDD/DVV19VZ41EVA5DXj5OTpwMe88mUCiVsKxjjzPvzIWFnS06r1sNpa2N3CVSDVHufRpbt25FWloaRo4ciaFDh8LFxYWhQWRmlHa2aLP4PRi1Ovw+4Q2cnjEbAHBm1hwYtVooLc228wd6zJQbGpaWlrCxsUF+fj40Gg08PDygVPKeQCJzogBg29gNFvZ26PjZf9B2ccEh5LZL3oeFnS2voKJKU+6n/9ChQ2Fra4tt27bhyy+/xJ49ezB58uTqqI2IJLJ2doaNszOMGi1+Hz8RVk5OAAArR0deOUWVqtx91kWLFqFt27YAgEaNGmHVqlXYuXNnlRdGRBVnYW+Hjqv+DQs7W9NfospUbmgUBsb9wsLCqqQYIno01k5OwN97GeAOBlUBnpwgIiLJGBpERCQZQ4OIiCRjaBARkWQMDSIikoyhQUREkjE0iIhIMoYGERFJxtAgIiLJzK7ry+joaGzduhX169cHAERFRaFXr14yV0VERIAZhgYAREREYOzYsXKXQURED+DhKSIikswsQ2PTpk0ICQnBzJkzkZWVJXc5RET0N4UQQlT3SiMiIpCRkVFseGRkJNq3bw9nZ2coFAqsWLECKpUKixcvlrTchIQE2Nvbm/4XQsBoNEKhUEChUFRa/ZXJaDTW6B+1EkJACAGlUvnQr0F+fj5sbdnFd2nYPuVjG5VOrVajU6dOkqeX5ZzGhg0bJE03bNgwjB8/vkLL9vHxMT1OTU2Fo6MjHB0dK7SM6pSXl
wc7Ozu5y6hS2dnZyM7OhoeHx0PNn5SUVOR1paLYPuVjG5UuISGhQtOb3VdclUplerx//360bNnyoZel0+nMOjBqC0dHR+h0OrnLoEpg1GoAAHq9Bvq/H1PtYnZXTy1duhTnzp0DADRp0gQLFiyQuSIiAgD9vTvQ3LgAGw9vGNXZBcPqucDS2kbmyqg6mWVoEJF5Ul/6HXeP7II+KwOug9+ChV4HMDRqFbM7PFXTBAcHlzg8Ozsbu3fvrtJ1EFUq2zqo93QINKnnYeHgBKv6jWFh7yB3VVTNGBoyyc7Oxp49e+Qug0gyhV6HW3s+ReMX58HGrTlyzx2BQZMnd1lUzczu8NTjzmAwYPr06VCpVOjYsSMAYM+ePdi2bRvu3buHZ599FlOmTMHGjRvxxx9/YOzYsZg0aRLOnz+PgwcPIjMzE8OHD8cLL7xQ4vJ3796N2NhYWFtbY8CAAUWmK2k9KSkpmDZtGqysrGBra4s1a9Zgy5Yt2LZtG2xtbREYGIjRo0dXS9vQ483CzgFuo96BMAo4N2hSMMymZl/5R8UxNCpZfHw8nJyc8NFHH+HYsWOIj4+Hv78/goODIYTAqFGjoFKp8NJLL+Hq1atYvnw57Ozs0K5dO0RERECr1WLgwIEYOXJksfs37ty5g88//xybN2+GjY0NDAZDkfElrefo0aMIDAzEmDFjYDQaARSES0xMDOrXr28aRiSFhU3hfVB1ZK2D5MPQqGRXrlxBu3btAMD098iRI9iwYQOMRiNSUlKgUqng5ORUZL5vvvkGcXFxUCgUuHXrFrKysuDs7FxkmtTUVPj4+MDGpuDEo4WFRZHxJa1nwIABWLVqFaZNm4aWLVvi1VdfxYwZM7Bs2TLo9XoMHz4cnTt3rqrmIKIahqFRyZo1a4Zjx44hNDQUiYmJAIAVK1YgNjYWDg4OGDlyJIQQsLKygl6vN823evVq7N27F0ajEf369UNJN+p7enri3Llz0Gg0sLGxKXY3eUnrUSqVeOuttwAAY8aMQUBAALy8vPD+++8jPT0db775JjZv3lzFrUK1gf7enYIHSksoLS2htLEvewZ6LDE0KlmfPn3w3XffITw83LSnERQUhBdffBEtWrQwdXPSqFEjAMDUqVPxyiuvoHv37hg5ciRatmyJevXqlbhsZ2dnREREYPTo0bC1tUX//v2LnNMoaT0HDhzAl19+aVqnp6cn3nnnHVy/fh0ajabUcydEFWHQqJF7/ig0aZdRp/UzuPvrdrgNnwULO15dVdPI0vdUVUlISCjSh0pycjKaN28uY0Xlqw3diACP9lqwC4iymUv7GNTZuPnVAmjTkuH5+n+gqNsQlpbm8b3UXNrIHD34uVke83hFqZiLFy9i/vz5RYaNHz8e3bt3l6kiotLpcu9Cn5EKozoHtk88hczD21G/1yigbn0Y8nK4x1GDMDTMVIsWLRAbGyt3GUSSKGxsYWFfD24vvAulrT20ackQRgMMeTlQ7fwELmFTGBw1BEODiB6ZpaUtdE4NAVjC0soK8HwKGTs/gfpiAhr0fQWKGtz9f23DV5KIKoWVlR2srKwK/tGqobubDitnN2jSLkM8cE8RPb4YGmZs0aJFuHfvHo4ePcrefumxInQaOHUfjCZjlsDK2Q0QvIm0puDhKTM2e/ZsuUsgeiiWTq6oU8cJSmtb1OsSBKU1fzWvpmBo/C3tdi5uZOTCvWEduDV4tC4SlixZgjNnzkCv12PmzJmYOXMm/Pz8cOrUKfj5+UGpVOLIkSNwd3fHokWLcOfOHUydOtX0Q0XLly9HgwYNEB4ejhUrVlTG0yOqVgqFAoq/g4KBUbPw8BSAHT9eQNTyg1iy8Tiilh/Ejh8vPPSyDh48CKPRiNjYWPznP//B0qVLkZeXh5EjR2Lr1q349ttv0blzZ2zevBm3b9/G9evXUbduXcTExCA2NhZBQUHYvn17JT47IqLKU+v3NNJu52LbgQu4p/7n50i3Hbhnv+4TAAAXZUlEQVSAb
r7uD7XHceHCBfzyyy8IDw8HAOTm5sLOzg4tWrQAALi6uuKpp54CADRu3BhZWVnIysrC/PnzkZmZiXv37rEvKCIyW7U+NG5k5MJgLHpTvNEocDMj96FCo0WLFvD39zf196TVajF48OBSpxdCYPfu3ejSpQteeuklbNq0CZcuXarweomIqkOtDw33hnVgoVQUGaZUKtC44cOd1+jduzdOnDhh2tPw9fUtdx4/Pz9MmzYNhw8fhqurq9l0vUBE9CD2PYWCcxrbDlyA0SigVCowrE9LDOrdsipLNWHfU+Vjv0FlY/uUj21UOvY99RAGP9cS3XzdcTMjF40r4eopIqKaiqHxN7cGDAsiovLwklsiIpJMltDYt28fgoKC4O3tjdOnTxcZFxMTg4CAAAQGBuLQoUNylEdERKWQJTRatWqF6OhodOnSpcjwixcvYu/evdi7dy/Wrl2L+fPnw8COzoiIzIYsoeHl5YUnn3yy2PD4+HgEBQXB2toanp6eaNq0qel3tomISH5mdU4jPT0dbm5upv9dXV2Rnp4uY0WVY8aMGcUOwz0OduzYgXXr1sldBhGZkSq7eioiIgIZGRnFhkdGRuL5558vcZ6SbhlRKBQlTFm6pKSkIsvLy8srdx5hNECfmQarBk2gu30dls5uUCgtKrTesuj1emg0mhJrkVqjHLRaLXQ6XYXrMxgMsLAo2n55eXlFXpuKyM/Pf+h5awO2T/nYRpWnykJjw4YNFZ7Hzc0NaWlppv/T09Ph4uJSoWXcfwNPcnJyuTfOCaMBaVveR37qeTh2CkR2wnew9WgNtxGzHio4jh49inXr1sHKygqpqamYPHkyLC0t8e233yImJgbZ2dmIiYmBk5MTNm3ahB07dkCpVGL8+PHo06cPZsyYAQcHB1y9erXItHv37sWmTZtgNBoxZMgQDBs2rMR1r1+/HjY2Nrh06ZIpoGfMmIEXX3wRbdu2xY4dO5CZmYmxY8ciICAAnTp1QlJSEoYOHYqLFy/izJkzeOaZZzBt2jRYW1vj/PnziIyMRHp6OubNm4f27dvj3LlzWLx4MYxGI5o3b4758+cjLi4OP//8M/Lz8/Hcc89hxIgRRWqzs7PjzX1VhO1TPrZR6RISEio0vVkdnvL398fevXuh1WqRkpKCK1euSOqG41Ho7txEfup5CG0eso9/A6HNQ37qeeju3HzoZapUKqxYsQKxsbH45JNPABScx1mzZg169OiB+Ph43L59G3FxcdiwYQPWr1+Pjz76CEajscRp7969i6+++gqxsbH48ssvERcXh5ycnBLXfffuXXzyySdYvXp1ucGdkZGB6dOnY+vWrfj4448RERGBbdu2IT4+3tRNe15eHmJiYhAdHY2lS5cCKPhxqGXLliE2Nhb29vb45ZdfABTsUX322WfFAoOIag5Zbu774YcfsHDhQty5cwevvfYafHx8sG7dOrRs2RL9+/fHgAEDYGFhgXfffbfYYY7KZt3Qo2AP4/g3EHotFJbWcOwUCOuGHg+9zDZt2sDS0hKOjo6wt7eHWq3G//3f/wEA3N3dcffuXaSmpsLb2xuWlpaws7ODi4sLMjMzAaDYtNeuXcPVq1cREREBAMjOzkZ6ejocHByKrdvHxwdKpRLu7u7IysoCUPQQ3/2HAN3d3VG/fn3T48I9gUaNGplCqV27dgAAT09PZGdnAyjoyTcqKgpAQS++LVu2hEKhqPKAJyL5yRIaAQEBCAgIKHHchAkTMGHChGqrRZuRiuyE70yBIfRaZCd8h7q+zz10cJw9exYGgwFqtRpqtRr29vbFPrg9PDxw7tw56PV65OTkID09Hc7OzgCKf8h7enrCy8sLn3/+OZRKJXQ63T+/xfyAkgLC0dERaWlpaNu2LU6fPg1PT89i0z547qhw3sKr11JSUuDo6Aig4JLp5cuXmwJHp9Nh9+7dVR7wRCS/Wt+NiFX9xrD1aF3snIZV/cYPvcyGDRti4sSJuHnzJiIjI7F///5i0zRo0
ABhYWGIiIiAQqFAVFQUlMqSjxY6Oztj+PDhCA8Ph1KphK2tLWJiYkqd/kFDhw7F1KlTsX37dlMwSWVtbY1//etfyMjIwNy5cwEAs2bNwltvvQW9Xg+lUol33nmnQsskoscXe7lFwclw3Z2bsG7oAW1GKqzqN37oq6eOHj2K7777Du+++66k6dnLbfl4ErNsbJ/ysY1Kx15uH4JCaWE6FPUo5zKq0+7du7F169Yiw1asWGE6ZEREVBUYGpWsa9eu6Nq1a5WvJyQkBCEhIVW+HiKi+5nVJbdERGTeGBpERCQZQ4OIiCRjaBARkWQMjUo2d+5cDB8+HAMGDKjwvDt27Ki2zgtTU1Px2muvVcu6iKjm4NVTAKZ9+x6yNf/05eRo44Cl/R7uhrXff/8du3fvRnBwcIXnjYuLQ+/evc36vo2SerAlotqDoQEgW5ODzPysR17OokWLkJKSgvDwcKjVagDApUuXMHfuXAgh0Lx5cyxcuBCZmZmYOnUqdDodDAYDoqOjcenSJSQlJeH1119Hx44dMX369GLL79u3L7p164azZ8/imWeewVtvvVWk19rU1FQsXLgQMTExCA8PR+vWrXH+/Hm4urrC19cXP/zwA5RKJdauXQsAyMrKwpQpU3D16lUMGzYMo0aNQlZWFt555x1kZWXBxsYGS5YsgVqtxtSpU+Hu7g5nZ2fMmTPnkduKiB5PPDxViWbPng0PDw9T768AsGzZMsyYMQObNm2CpaUl4uPjUbduXcTExCA2Nhb9+/fH9u3b8fTTT8PHxweffvppiYEBALdu3cLkyZOxdetWfP/996aeaEvTo0cPxMbG4tatW7C1tUVsbCyaNGli6go5NTUVixYtwubNm7F582bk5ORg9erVCAsLw8aNGzFq1CisX78eAHD9+nW8//77DAyiWo57GlUsNTUVbdq0AQB06NABycnJaN++PebPn4/MzExkZWXh6aeflrSs+3uldXFxQU5OTqk92AJA27ZtAQCNGzeGt7e36XFWVhY8PDzQsmVLU7g9+eSTuHnzJi5cuICEhARs2LABer0eLVu2BAC0bt0atra2j9ASRFQTMDRQcA6jrP8fRZMmTXDmzBm0adMGJ0+exLPPPovdu3ejS5cueOmll7BhwwZcu3YNAGBlZQWDwVDqskrqidbR0RFnz54FgDJ/UrakcLlw4QLy8vJgaWmJ5ORkNG7cGF5eXujWrRt69OgBoODX+1QqFc9jEBEAhgYAPPRJbymmTp1q6rywadOm8Pf3x19//YVp06bh8OHDaNCggekbfJ8+fTBt2jR07dpVcvfw3bt3x4YNGzB27Fi0bt26QrV5eHhgxowZSElJwYgRI+Dg4IDx48fj3Xffxdq1ayGEwPDhw9G+ffuKPWkiqrHYy63M2Mtt+dhDadnYPuVjG5WOvdzWABcvXsT8+fOLDBs/fjy6d+8uU0VERAUYGmaoRYsWiI2NlbsMIqJieMktERFJxtAgIiLJGBqV7OjRo1iwYEGRYTt27MCxY8dkqujRPEx3KERUc/GcRjUYPHjwI83/uPX39LjVS0TSyRIa+/btw7///W9cunQJX3/9tenO5dTUVAwYMMB0aWa7du2KfWt/HKSmpmLixIlITU3F5MmT8eeff6Jly5Zo06YNpk6diieeeALnz5/H8OHDMXjwYBw5cgSfffYZ8vPz0bx5cyxevBipqalF+ntKSkrCZ599BkdHR+zbtw+XLl3CG2+8UWzdQUFB6NOnDxISEtC4cWMsW7YMR48exXfffWe6XyQ4OBh79uzBjBkzYGFhgbS0NBgMBoSFhWHXrl3IycnBmjVr4OTkBK1Wi9mzZ+PChQvw8/PDlClToNVqMXfuXFy/fh0AsHDhQjRt2hQBAQHo2rUrcnJysHz58mptcyKqHrIcnmrVqhWio6PRpUuXYuOeeOIJ7Nq1C7t27arWwDj5ZhSORYzDyTejHnlZKpUKK1asQGxsLD755JNi49577z1s3rwZmzZtA
gD4+vpiw4YN2Lx5MzQaDf744w8ARft7CgkJwZ49ewAAu3btwqBBg0pcd15eHgYOHIhNmzZBpVIhJSWlzFqfeuoprFu3Du7u7khOTsb69evRu3dvxMfHAwBu3ryJyZMnY8uWLfjjjz+QnJyMbdu2wcfHBxs3bsTs2bOxYsUKAEB6ejoiIyMZGEQ1mCx7Gl5eXnKstlQn34yCRnULBrUaRo0GJ9+MQocVHz/08tq0aQNLS0s4OjrC3t7e1OMtUHA5rbW1NQBAqSzI7PPnzyM6OhparRY3btxAYGAgGjZsWKS/p+DgYEyYMAHPP/88dDodmjRpUuK67ezsTO3r7u6OrKwsyf1TtWjRwvT47t27pseurq6maa9evYoLFy7g1KlT+OGHH0zrBAq6TGnYsOHDNhsRPQbM7pxGamoqwsLC4ODggMjISHTu3LnK16nLyobh7w92g1oNXVb2Iy3v7NmzMBgMUKvVUKvVpk4BgeL9RwFATEwMpk+fDm9vb7z55pumD/b7zwvUrVsX7u7uWLZsGcLCwkpdd2n9U928eRNA8f6p7p++pHC5efMmVCoVGjVqhLNnz2LIkCFo0aIFWrdujZEjRwIo6J/qwXqJqGaqstCIiIhARkZGseGRkZF4/vnnS5zHxcUFP/74I5ydnXHmzBlMnDgRe/fuhYOD9A4Ek5KSTI+FEJJ+Cc+yrgMMmnwY1XlQ2tvBsq7DQ/+CnkajgbOzMyZMmICbN2/i9ddfx7lz56DVapGfnw+DwWBattFohBACffr0wZQpU9CsWTMAKHFaAAgJCUFkZCRmzpxZan1Go9E0Tq/XQ6PRwMvLCwqFAi+++CI6depkmqZwfF5eHnQ6HbRaLfLy8qDVaqHT6ZCXlwdXV1csXboUycnJ6Nq1K9zc3BASEoL333/fdLisZ8+eCA8PL7LuB+Xl5RV5bSoiPz//oeetDdg+5WMbVSIho9GjR4vExMSHHv+gEydOFPn/8uXLkuf9ffIUcfTlseL3yVMkz1MZ1Gq15GmPHz8u5s6dW3XFVKGKvBYP+vPPPyuxkpqH7VM+tlHpHvzcLI9ZHZ66c+cO6tWrBwsLC6SkpODKlSvw9PSslnU/yjmM6rBz507897//NZ1Yv3v3LiZNmlRkmiFDhpR56IqI6FHJEho//PADFi5ciDt37uC1116Dj48P1q1bh+PHj2PlypWwsLCAhYUF5s+fDycnJzlKNDthYWFFAsHJyYn9UxFRtZMlNAICAhAQEFBseGBgIAIDA2WoiIiIpGA3IkREJBlDg4iIJGNoEBGRZAyN+9xKu/fIy3iwl9vTp09jxowZxXqLLfx/165dWLduHQAgPDwcI0eOREREBMaMGYNffvnlkeshosqRc08Do7HG/Dr2QzOrS27l9NO35/HrT5fQrbcXevdrLVsdn376KerXr4+MjAy8+uqraNy4sdl1u0JUmxh0BuTmavHtzrMICPFBHQdrWNtYyV2WbLingYLAOHooGXqdEUcPJeOnb8/LXRIaNmyIUaNG4bvvvpO7FKJaqzAwNn9+HOdOpyFu00nk5miRl6uVuzTZ1PrQKAwMTb4eAKDJ15tNcLi6ukKlUsldBlGtZNAZkKsuCIy069lo5OqA/kPa4k6GGhqNHurcfLlLlEWtDo1baffw60+XTIFRSJOvx68/XXqocxy2trbQaDT/LEujgY2NDRQKRbEeZsuTlpZm6mGWiKqPXqdHdnY+9FojHOraoJGrA0JGtMPW9Sfwx7FrsLBQIPFEKvLUtW+Po1aHRiO3uujW2ws2tkVP7djYWqJbby80cqtb4WU2a9YMf/75p6nn199++w3e3t7w8vJCQkICAODq1atwdHQsczkZGRn48ssv0bdv3wrXQESPxqgXyFDlYO3KQxgwpC0GvdgB22N/R1ZmHs7+cRPf/+9P+LR1h6iFJ8Zr/YnwwpPehYeobGwt0bVH84c+GV6vXj289NJLCA8Ph5WVFdzc3PDaa6+he/fumDt3LvR6P
YxGI2bNmlXi/K+//jpsbGwAAFFRUTwJTiQDazsrNPaoh/5hbZCn1mFf3BlkZf7Tg/PZP27Co1l9tO3gLmOV8qj1oQH8Exy//nTpkQKj0KBBg4r9st4TTzyB9evXF5t24MCBph8xYl9SRObDoa4tnvBqgLTULAwY3AZfrT2Oe9kF5zFat3HDU76NYe9gI3OV1Y+h8bfe/Vrj/9q7P9QhKSKqmeo5FXyhy72nwYhXOmPL5yfg/oQTBgxug7r1bGWuTh4MjfswMIjoQfWc7AAhoLRQ4pXJ3aFUKmptYAAMDSKictVzti9/olqiRl89ZWVlhezsR/u9b3p02dnZsLKqvXfQEtUkNXpPw93dHTdu3MDt27flLqVUeXl5phPhNZWVlRXc3WvfVSZENVGNDg2lUgkPDw+5yyhTUlISmjdvLncZRESS1OjDU0REVLkYGkREJBlDg4iIJFOIivaiZ8YK+3YiIiLpOnXqJHnaGhUaRERUtXh4ioiIJGNoEBGRZAwNIiKSjKFBRESSMTSIiEgyhgYREUnG0JDJkiVL0K9fP4SEhGDixIlFeuONiYlBQEAAAgMDcejQIRmrlM++ffsQFBQEb29vnD59usg4ts8/fv75ZwQGBiIgIACrV6+WuxzZzZw5E35+fggODjYNu3v3LsaMGYO+fftizJgxyMrKkrHCGkCQLA4dOiR0Op0QQogPP/xQfPjhh0IIIS5cuCBCQkKERqMR165dE3369BF6vV7OUmVx8eJFcenSJTF69GiRmJhoGs72+Yderxd9+vQR165dExqNRoSEhIgLFy7IXZasjh07Js6cOSOCgoJMw5YsWSJiYmKEEELExMSY3mv0cLinIZNnn30WlpYFnQy3b98eaWlpAID4+HgEBQXB2toanp6eaNq0KRITE+UsVRZeXl548skniw1n+/wjMTERTZs2haenJ6ytrREUFIT4+Hi5y5JVly5dUK9evSLD4uPjERYWBgAICwvD/v375SitxmBomIHt27ejZ8+eAID09HS4ubmZxrm6uiI9PV2u0swO2+cfbAtpbt++DRcXFwCAi4sL7ty5I3NFj7ca/XsacouIiEBGRkax4ZGRkXj++ecBAKtWrYKFhQVCQ0MBAKKEXl0UCkXVFioTKe3zoNrUPuVhW5AcGBpVaMOGDWWOj4uLw08//YQNGzaY3uxubm6mQ1VAwbfJwm9JNU157VOS2tQ+5WFbSNOgQQOoVCq4uLhApVKhfv36cpf0WOPhKZn8/PPPWLNmDVatWlXk5179/f2xd+9eaLVapKSk4MqVK/D19ZWxUvPC9vlH27ZtceXKFaSkpECr1WLv3r3w9/eXuyyz4+/vj507dwIAdu7ciT59+shc0eONvdzKJCAgAFqtFk5OTgCAdu3aYcGCBQAKDllt374dFhYWmDVrFnr16iVnqbL44YcfsHDhQty5cweOjo7w8fHBunXrALB97nfw4EG8//77MBgMGDJkCCZMmCB3SbKKiorCsWPHkJmZiQYNGmDSpEl4/vnnERkZiZs3b6Jx48ZYsWKF6X1HFcfQICIiyXh4ioiIJGNoEBGRZAwNIiKSjKFBRESSMTSIiEgyhgZRNRg7diw6d+6M1157Te5SiB4J7wgnqgbjxo1DXl4etmzZIncpRI+EexpElSgxMREhISHQaDRQq9UICgrCX3/9BT8/P9SpU0fu8ogeGfc0iCqRr68v/P39sXz5cuTn5yM0NBStWrWSuyyiSsM9DaJKNnHiRBw+fBhnzpzBuHHj5C6HqFIxNIgqWVZWFtRqNXJzc6HRaOQuh6hSMTSIKtmcOXPw5ptvIiQkBMuWLZO7HKJKxXMaRJVo586dsLS0REhICAwGA0aOHInffvsN0dHRuHz5MtRqNXr27IlFixahR48ecpdLVGHs5ZaIiCTj4SkiIpKMoUFERJIxNIiISDKGBhERScbQICIiyRgaREQkGUODiIgkY2gQEZFk/w8azTaSWfufOQAAAABJRU5Er
kJggg==\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "a = sns.scatterplot(x=\"x1\", y=\"x2\", hue=\"data_class\", style=\"data_class\", data=pl_df)\n", "plt.title(\"t-SNE: document feature words with binary term-frequency\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The inverse-document-frequency (IDF) values for the feature words, and for the actual terms" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1. , 1. , 2.57059808, 2.90707032, 1. ,\n", " 2.67068154, 2.57059808, 2.57059808])" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_mod.fit_transform(docs_words)\n", "tfidf_mod.idf_" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([1. , 4.92197334, 4.92197334, 4.92197334, 4.92197334])" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_mod.fit_transform(docs_value)\n", "tfidf_mod.idf_[0:5]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }