{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "data and combinators.ipynb", "provenance": [], "collapsed_sections": [], "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "markdown", "metadata": { "id": "K7rYEk8ef3dA", "colab_type": "text" }, "source": [ "This notebook is made to help you explore how combinators and data are connected. I've only spent about an hour on this, so probably things will break..." ] }, { "cell_type": "code", "metadata": { "id": "Rkk9lHBfzMXi", "colab_type": "code", "colab": {} }, "source": [ "#get some initial tools\n", "import pandas as pd\n", "import io\n", "import numpy as np\n", "!pip install geopandas\n", "import geopandas as gpd" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "Yom0LZkugID_", "colab_type": "text" }, "source": [ "I scraped the latest version of the page from the back-end of the dataarc site that previews the combinators, data sources and queries and saved it as a csv and put it on github." 
] }, { "cell_type": "code", "metadata": { "id": "767qeo8S1ZJ1", "colab_type": "code", "colab": {} }, "source": [ "# Load the scraped combinator table (a CSV hosted on GitHub) into a pandas DataFrame\n", "combinators_url = 'https://raw.githubusercontent.com/ropitz/experiments/master/data/dataarc_combinators.csv'\n", "combine = pd.read_csv(combinators_url, encoding='utf-8')\n", "combine\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "BFTNjhwI0wGO", "colab_type": "code", "colab": {} }, "source": [ "# Load the SEAD source data (JSON hosted on GitHub) with geopandas and preview it\n", "sead_url = 'https://raw.githubusercontent.com/ropitz/experiments/master/data/SEAD.json'\n", "\n", "SEAD = gpd.read_file(sead_url)\n", "\n", "SEAD.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8mdaKNbd2mkI", "colab_type": "code", "colab": {} }, "source": [ "# Load the environmental threats source data (JSON hosted on GitHub) with geopandas and preview it\n", "threats_url = \"https://raw.githubusercontent.com/ropitz/experiments/master/data/enviro_threats.json\"\n", "threats = gpd.read_file(threats_url)\n", "threats.head()\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "04BYOLYs2seH", "colab_type": "code", "colab": {} }, "source": [ "\n", "# Load the sagas source data (GeoJSON hosted on GitHub) with geopandas and preview it\n", "sagas_url = \"https://raw.githubusercontent.com/ropitz/experiments/master/data/sagas.geojson\"\n", "sagas = gpd.read_file(sagas_url)\n", "sagas.head()\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "SJtWb6bU9z4J", "colab_type": "code", "colab": {} }, "source": [ "# Replace every NaN with the string 'blank' because missing values make lots of tools grumpy\n", "combine = combine.fillna(value='blank')\n", "combine.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "HwiBJqTqELGB", "colab_type": "code", "colab": {} }, "source": [ "# reshape the combinator dataframe to split lists of concepts in a single column into separate rows\n", "reshaped = \\\n", 
"(combine.set_index(combine.columns.drop('topics').tolist())  # index by every column except 'topics' (Index.drop takes no axis argument)\n", " .topics.str.split(',', expand=True)  # one column per comma-separated concept\n", " .stack()  # one row per concept; the empty padding cells are discarded\n", " .reset_index()\n", " .rename(columns={0:'topics'})  # the stacked values arrive in a column named 0\n", " .loc[:, combine.columns]  # keep only the original columns, in their original order\n", ")\n", "\n", "# view the result\n", "reshaped.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "QKRO9lkQMIf3", "colab_type": "code", "colab": {} }, "source": [ "# strip leading and trailing whitespace because it confuses me (and makes concept labels not match)\n", "reshaped['topics'] = reshaped['topics'].str.strip()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "EZv95PDqZL6_", "colab_type": "code", "colab": {} }, "source": [ "# get the list of datasources actually mapped to combinators\n", "reshaped['data'].unique()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "NrvHkQ0ncCo9", "colab_type": "code", "colab": {} }, "source": [ "# get topics mapped per source\n", "sagatopics = reshaped[reshaped['data']=='sagas']\n", "sagatopics.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "yQylEh_dZraS", "colab_type": "code", "colab": {} }, "source": [ "# get topics mapped per source\n", "seadtopics = reshaped[reshaped['data']=='sead']\n", "seadtopics.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "IfPLIUxTE2hJ", "colab_type": "code", "colab": {} }, "source": [ "# get topics mapped per source\n", "threatstopics = reshaped[reshaped['data']=='enviro_threats_icelandic_sites']\n", "threatstopics.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "adqUAfzSjMQj", "colab_type": "code", "colab": {} }, "source": [ "# libraries to make less-fancy graphs compared with the real tool\n", "\n", "import networkx as nx\n", "import matplotlib.pyplot as plt\n", " \n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", 
"metadata": { "id": "31FSbo42aJ8k", "colab_type": "code", "colab": {} }, "source": [ "# let's compare the topics mapped together by sagas and sead\n", "sagasead = pd.concat([seadtopics,sagatopics])\n", "\n", "# Build a graph linking each data source to the topics mapped to it\n", "G1 = nx.from_pandas_edgelist(sagasead, 'data', 'topics')\n", "\n", "# 1. Open a large figure to draw on\n", "plt.figure(figsize=(10,10))\n", "\n", "# 2. Create a layout for our nodes (nx.draw_spring returns None, so compute the positions\n", "#    explicitly; the fixed seed makes the picture identical on every run)\n", "layout = nx.spring_layout(G1, seed=42)\n", "\n", "# 3. Draw the graph using that layout\n", "nx.draw(G1, pos=layout, with_labels=True)\n", "\n", "# 4. Turn off the axis because I know you don't want it\n", "plt.axis('off')\n", "\n", "# 5. Tell matplotlib to show it\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8VHl060MBWyb", "colab_type": "code", "colab": {} }, "source": [ "# things with two degrees or neighbours are those between the two data sources\n", "# (G1 also contains the data-source nodes themselves, so they appear in this table too)\n", "degree = pd.DataFrame.from_dict(G1.degree())\n", "degree.columns = ['topic', 'nn']\n", "# sort_values returns a new frame, so keep the result instead of discarding it\n", "degree = degree.sort_values('nn')\n", "shared = degree[degree['nn']==2]\n", "shared" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "112Yc7-Lc5-1", "colab_type": "code", "colab": {} }, "source": [ "# get combinators associated with a given shared topic\n", "building = sagasead[sagasead['topics']=='building']\n", "building" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qqjPk4hxcm2H", "colab_type": "code", "colab": {} }, "source": [ "#get the queries attached to those combinators\n", "\n", "# None means 'never truncate'; the old -1 spelling is deprecated in current pandas\n", "pd.set_option('display.max_colwidth', None)\n", "print(building['query'])" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "J3cT8qlJhjXZ", "colab_type": "code", "colab": {} }, "source": [ "# count how many items in the sagas data mention 'wood' as a relevant concept\n", "# na=False treats rows with a missing concept as non-matches instead of breaking the boolean mask\n", "building_sagas = sagas[sagas['concept'].str.contains('wood', regex=True, na=False)]\n", "building_sagas.count()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": 
{ "id": "hIG_DJzihtLH", "colab_type": "code", "colab": {} }, "source": [ "# count how many items in the sead data mention 'general synanthropic' as a relevant concept\n", "# na=False treats rows with a missing indicator as non-matches instead of breaking the boolean mask\n", "building_sead = SEAD[SEAD['indicators'].str.contains('General synanthropic', regex=False, na=False)]\n", "\n", "building_sead.count()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FEZ7FoGMDqRz", "colab_type": "code", "colab": {} }, "source": [ "# now look at things between all three data sources\n", "sagaseadthreats = pd.concat([seadtopics,sagatopics,threatstopics], sort=False)\n", "\n", "# Build a graph linking each data source to the topics mapped to it\n", "G2 = nx.from_pandas_edgelist(sagaseadthreats, 'data', 'topics')\n", "\n", "# 1. Open a large figure to draw on\n", "plt.figure(figsize=(10,10))\n", "\n", "# 2. Create a layout for our nodes (nx.draw_spring returns None, so compute the positions\n", "#    explicitly; the fixed seed makes the picture identical on every run)\n", "layout = nx.spring_layout(G2, seed=42)\n", "\n", "# 3. Draw the graph using that layout\n", "nx.draw(G2, pos=layout, with_labels=True)\n", "\n", "# 4. Turn off the axis because I know you don't want it\n", "plt.axis('off')\n", "\n", "# 5. Tell matplotlib to show it\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "KLn1yEqRHgP3", "colab_type": "code", "colab": {} }, "source": [ "# get topics shared by all three data sources\n", "# (G2 also contains the data-source nodes themselves, so they appear in this table too)\n", "degree2 = pd.DataFrame.from_dict(G2.degree())\n", "degree2.columns = ['topic', 'nn']\n", "# sort_values returns a new frame, so keep the result instead of discarding it\n", "degree2 = degree2.sort_values('nn')\n", "shared2 = degree2[degree2['nn']==3]\n", "shared2" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kkpF3pI7Hu7E", "colab_type": "code", "colab": {} }, "source": [ "# get topics shared by any two of three data sources\n", "# reuses the sorted degree2 table built in the previous cell rather than rebuilding an identical one\n", "shared3 = degree2[degree2['nn']==2]\n", "shared3" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jxDJs-jdM9SE", "colab_type": "code", "colab": {} }, "source": [ "# Count how many mappings of combinators to each concept (topic) is present 
and append that value to each row in the table\n", "mappings_per_topic = reshaped.groupby('topics')['topics'].transform('size')\n", "reshaped['Count'] = mappings_per_topic\n", "reshaped.head()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Df0RGpj7R96l", "colab_type": "code", "colab": {} }, "source": [ "# Keep the frequently mapped concepts (Count above 10; change the '10' to move the cut-off),\n", "# then group by the count and show the first row of each group\n", "many = reshaped.loc[reshaped['Count'] > 10]\n", "top = many.groupby('Count')\n", "top.first()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oQ06J4ZFULfR", "colab_type": "code", "colab": {} }, "source": [ "# Keep the infrequently mapped concepts (Count below 3), grouped the same way\n", "few = reshaped.loc[reshaped['Count'] < 3]\n", "bottom = few.groupby('Count')\n", "bottom.first()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "RKgDwTrgGVMo", "colab_type": "code", "colab": {} }, "source": [ "# Group the whole table by concept / topic and show the first row of each group\n", "bytop = reshaped.groupby('topics')\n", "bytop.first()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jZhGlNhOKMGD", "colab_type": "code", "colab": {} }, "source": [ "# Example: look up one group by its concept name\n", "bytop.get_group('Dung')" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "3Kqu8-evLXvd", "colab_type": "code", "colab": {} }, "source": [ "# Example: look up one group by its concept name and keep it in a variable\n", "dung = bytop.get_group('Dung')\n", "dung" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "MSqPFCgJMSuc", "colab_type": "code", "colab": {} }, "source": [ "# Example: the same lookup for the 'animal' concept\n", "animals = bytop.get_group('animal')\n", "animals" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "mCF4gQXPWbup", "colab_type": 
"code", "colab": {} }, "source": [ "\n" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "x4xoPMwULqUT", "colab_type": "code", "colab": {} }, "source": [ "# Build your graph from the data sources and combinators connected under a single concept\n", "G3 = nx.from_pandas_edgelist(animals, 'data', 'comb')\n", "\n", "# Plot it\n", "nx.draw_circular(G3, with_labels=True)\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "EmGmlh3NNB9f", "colab_type": "code", "colab": {} }, "source": [ "#combine two topics you think should connect\n", "ani_dung = pd.concat([animals,dung])\n", "ani_dung" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "o9AUz7TKMigw", "colab_type": "code", "colab": {} }, "source": [ "# Build your graph from the data sources and combinators connected under two concepts you think should connect\n", "G4 = nx.from_pandas_edgelist(ani_dung, 'data', 'comb')\n", "\n", "# Plot it\n", "nx.draw_circular(G4, with_labels=True)\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qpZgpRXNW2yG", "colab_type": "code", "colab": {} }, "source": [ "# Build your graph from the most frequently mapped topics table\n", "G5 = nx.from_pandas_edgelist(many, 'data', 'topics')\n", "\n", "# Plot it\n", "nx.draw_circular(G5, with_labels=True)\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "B8893iEpZTAF", "colab_type": "code", "colab": {} }, "source": [ "#select specific topics\n", "concepts = ['animal','humans']\n", "selected = reshaped[reshaped.topics.isin(concepts)]\n", "\n", "# Build your graph of the relationships of your selected topics\n", "G6 = nx.from_pandas_edgelist(selected, 'data', 'topics')\n", "\n", "# Plot it: the figure has to exist before drawing, otherwise the graph lands on a\n", "# default-sized figure and a second, empty 15x15 figure is opened\n", "plt.figure(figsize=(15, 15))\n", "nx.draw_circular(G6, with_labels=True)\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, 
{ "cell_type": "code", "metadata": { "id": "ftkx8cCSd1OE", "colab_type": "code", "colab": {} }, "source": [ "#select specific datasets\n", "datasets = ['sead','enviro_threats_icelandic_sites']\n", "selected2 = reshaped[reshaped.data.isin(datasets)]\n", "\n", "# Build a graph linking each selected data source to the topics mapped to it\n", "G7 = nx.from_pandas_edgelist(selected2, 'data', 'topics')\n", "\n", "# 1. Open a large figure to draw on\n", "plt.figure(figsize=(10,10))\n", "\n", "# 2. Create a layout for our nodes (nx.draw_spring returns None, so compute the positions\n", "#    explicitly; the fixed seed makes the picture identical on every run)\n", "layout = nx.spring_layout(G7, seed=42)\n", "\n", "# 3. Draw the graph using that layout\n", "nx.draw(G7, pos=layout, with_labels=True)\n", "\n", "# 4. Turn off the axis because I know you don't want it\n", "plt.axis('off')\n", "\n", "# 5. Tell matplotlib to show it\n", "plt.show()" ], "execution_count": 0, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "pbpzPgIEhi6J", "colab_type": "text" }, "source": [ "It seems enviro threats and sead only share three concepts... this is probably less than ideal" ] }, { "cell_type": "code", "metadata": { "id": "gFffcWAE1H6U", "colab_type": "code", "colab": {} }, "source": [ "!pip install pyvis\n", "from pyvis.network import Network" ], "execution_count": 0, "outputs": [] } ] }