{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Labeling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This pipeline labels each cluster."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports, grouped: standard library, third-party, project-local.\n",
    "import pickle\n",
    "import random\n",
    "import time\n",
    "from pathlib import Path\n",
    "\n",
    "import geopandas\n",
    "import numpy as np\n",
    "from pandas import read_pickle, DataFrame\n",
    "\n",
    "from wikiparse import config\n",
    "from wikiparse import pipeline_utils as utils"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input file name and scratch directory are taken from the project config module.\n",
    "xml_filename = config.xml\n",
    "scratch_folder = Path(config.folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Record when this notebook run started (seconds since the epoch).\n",
    "pipeline_start = time.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.\n",
    "# Presumably gdf_clusters.pkl is written by an earlier pipeline stage - TODO confirm.\n",
    "gdf = read_pickle(scratch_folder/'gdf_clusters.pkl')\n",
    "# Make sure the tf_idf column is stored as float64.\n",
    "gdf['tf_idf'] = gdf.tf_idf.astype('float64')"
   ]
  },
  { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | word | \n", "tf | \n", "article | \n", "df | \n", "tf_idf | \n", "geometry | \n", "lon | \n", "lat | \n", "cluster_193 | \n", "cluster_4000 | \n", "cluster_25000 | \n", "cluster_7 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
12007105 | \n", "geographical | \n", "1 | \n", "Hinksland | \n", "5314 | \n", "0.000188 | \n", "POINT (-28.31667 71.71667) | \n", "-28.316667 | \n", "71.716667 | \n", "71 | \n", "2821 | \n", "246 | \n", "0 | \n", "
15027457 | \n", "building | \n", "2 | \n", "Coughanour Apartment Block | \n", "108310 | \n", "0.000018 | \n", "POINT (-116.93444 44.07583) | \n", "-116.934444 | \n", "44.075833 | \n", "186 | \n", "3347 | \n", "4267 | \n", "0 | \n", "
1918673 | \n", "compliment | \n", "1 | \n", "Calverstown | \n", "70 | \n", "0.014286 | \n", "POINT (-6.79797 53.08252) | \n", "-6.797970 | \n", "53.082520 | \n", "136 | \n", "2035 | \n", "15961 | \n", "5 | \n", "
19769052 | \n", "more | \n", "1 | \n", "Arksey | \n", "99816 | \n", "0.000010 | \n", "POINT (-1.12560 53.55530) | \n", "-1.125600 | \n", "53.555300 | \n", "191 | \n", "389 | \n", "18444 | \n", "5 | \n", "
18340572 | \n", "category | \n", "1 | \n", "Mehdiabad, Semirom | \n", "449186 | \n", "0.000002 | \n", "POINT (51.37361 31.68833) | \n", "51.373611 | \n", "31.688333 | \n", "169 | \n", "157 | \n", "22862 | \n", "2 | \n", "