{ "cells": [ { "cell_type": "markdown", "id": "banned-position", "metadata": {}, "source": [ "# Building a Harmonized Commonsense Knowledge Graph" ] }, { "cell_type": "code", "execution_count": 1, "id": "dying-negotiation", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "sys.path.insert(0,'..')\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 2, "id": "bright-snapshot", "metadata": {}, "outputs": [], "source": [ "# Parameters\n", "\n", "kgtk_path = \"/Users/filipilievski/mcs/kgtk\"\n", "\n", "tutorial_deployment_path = \"/Users/filipilievski/mcs/kgtk-tutorial-files/datasets\"\n", "project_deployment_path = tutorial_deployment_path + \"/arnold-network-analysis\"\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense\"\n", "output_path = \"/Users/filipilievski/mcs/kgtk-projects\"\n", "project_name = \"building-commonsense-knowledge-graph\"" ] }, { "cell_type": "code", "execution_count": 34, "id": "perfect-eagle", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /Users/filipilievski\n", "Current dir: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph\n", "KGTK dir: /Users/filipilievski/mcs/kgtk\n", "Use-cases dir: /Users/filipilievski/mcs/kgtk/use-cases\n" ] } ], "source": [ "files = [\n", " \"conceptnet\", \n", " \"vg_graphs\", \n", " \"vg_synsets\", \n", " \"atomic\",\n", " \"mapping_lex\",\n", " \"mapping_cnfn\"\n", "]\n", "\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", "execution_count": 35, "id": "suffering-borough", "metadata": {}, "outputs": [], "source": [ 
"# Publish each input file as an environment variable so that shell cells can use $alias.\n", "graph_dir = os.environ[\"GRAPH\"]\n", "alias_to_relpath = {\n", "    \"conceptnet\": \"conceptnet-assertions-5.7.0.csv\",\n", "    \"vg_graphs\": \"visualgenome/scene_graphs.json\",\n", "    \"vg_synsets\": \"visualgenome/attribute_synsets.json\",\n", "    \"atomic\": \"v4_atomic_all_agg.csv\",\n", "    \"mapping_lex\": \"mappings/lexical_mappings.tsv\",\n", "    \"mapping_cnfn\": \"mappings/mapping_fn_cn.tsv\",\n", "}\n", "for alias, relpath in alias_to_relpath.items():\n", "    os.environ[alias] = \"%s/%s\" % (graph_dir, relpath)\n", "\n", "# KGTK settings: install location, graph cache (same file as STORE), debug switch.\n", "os.environ['kgtk_path'] = kgtk_path\n", "os.environ['KGTK_GRAPH_CACHE'] = os.environ['STORE']\n", "os.environ['KGTK_OPTION_DEBUG'] = \"false\"" ] }, { "cell_type": "code", "execution_count": 36, "id": "frequent-multiple", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "USE_CASES_DIR: /Users/filipilievski/mcs/kgtk/use-cases\n", "GRAPH: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense\n", "TEMP: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph\n", "kgtk: kgtk\n", "EXAMPLES_DIR: /Users/filipilievski/mcs/kgtk/examples\n", "OUT: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph\n", "kypher: kgtk query --graph-cache /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph/wikidata.sqlite3.db\n", "STORE: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph/wikidata.sqlite3.db\n", "conceptnet: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/conceptnet-assertions-5.7.0.csv\n", "vg_graphs: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/visualgenome/scene_graphs.json\n", "vg_synsets: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/visualgenome/attribute_synsets.json\n", "atomic: 
/Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/v4_atomic_all_agg.csv\n", "mapping_lex: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/mappings/lexical_mappings.tsv\n", "mapping_cnfn: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/mappings/mapping_fn_cn.tsv\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "markdown", "id": "seeing-separation", "metadata": {}, "source": [ "Loading all files into the Kypher cache would define every graph alias up front. The call in the next cell is commented out, so this step is skipped in this notebook." ] }, { "cell_type": "code", "execution_count": 37, "id": "previous-chosen", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\n", "Wall time: 6.2 µs\n" ] } ], "source": [ "%%time\n", "#ck.load_files_into_cache()" ] }, { "cell_type": "code", "execution_count": 38, "id": "approximate-floating", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph\n" ] } ], "source": [ "%cd {os.environ['OUT']}" ] }, { "cell_type": "markdown", "id": "ancient-cedar", "metadata": {}, "source": [ "## 1. Import individual graphs in KGTK\n", "\n", "We will first import the individual resources in KGTK format:\n", "1. ConceptNet\n", "2. FrameNet\n", "3. Visual Genome\n", "4. 
ATOMIC" ] }, { "cell_type": "code", "execution_count": 8, "id": "thick-albany", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.92 s, sys: 572 ms, total: 2.49 s\n", "Wall time: 3min 33s\n" ] } ], "source": [ "%%time\n", "# Import ConceptNet\n", "!kgtk import_conceptnet --english_only -i $conceptnet -o $TEMP/kgtk_conceptnet.tsv" ] }, { "cell_type": "code", "execution_count": 10, "id": "purple-adelaide", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package framenet_v17 to\n", "[nltk_data] /Users/filipilievski/nltk_data...\n", "[nltk_data] Package framenet_v17 is already up-to-date!\n", "CPU times: user 290 ms, sys: 94.8 ms, total: 384 ms\n", "Wall time: 27.1 s\n" ] } ], "source": [ "%%time\n", "# Import FrameNet\n", "!kgtk import-framenet -o $TEMP/kgtk_framenet.tsv" ] }, { "cell_type": "code", "execution_count": 13, "id": "partial-proof", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 442 ms, sys: 141 ms, total: 583 ms\n", "Wall time: 40.7 s\n" ] } ], "source": [ "%%time\n", "# Import Visual Genome\n", "!kgtk import-visualgenome -i $vg_graphs --attr-synsets $vg_synsets \\\n", " -o $TEMP/kgtk_visualgenome.tsv" ] }, { "cell_type": "code", "execution_count": 15, "id": "subsequent-profile", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 149 ms, sys: 54.7 ms, total: 204 ms\n", "Wall time: 14.1 s\n" ] } ], "source": [ "%%time\n", "# Import ATOMIC\n", "!kgtk import_atomic -i $atomic -o $TEMP/kgtk_atomic.tsv " ] }, { "cell_type": "markdown", "id": "hairy-orleans", "metadata": {}, "source": [ "## 2. 
Combine sources" ] }, { "cell_type": "markdown", "id": "addressed-sentence", "metadata": {}, "source": [ "We will first concatenate the sources to create `cskg_base.tsv`:" ] }, { "cell_type": "code", "execution_count": 20, "id": "wrong-powder", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 896 ms, sys: 280 ms, total: 1.18 s\n", "Wall time: 1min 18s\n" ] } ], "source": [ "%%time\n", "!kgtk cat -i $TEMP/kgtk_atomic.tsv $TEMP/kgtk_conceptnet.tsv $TEMP/kgtk_framenet.tsv $TEMP/kgtk_visualgenome.tsv \\\n", "/ sort -c 'node1,relation,node2' \\\n", "/ add_id --id-style node1-label-node2-num \\\n", "/ reorder_columns --columns id ... -o $TEMP/cskg_base.tsv" ] }, { "cell_type": "markdown", "id": "satisfactory-gender", "metadata": {}, "source": [ "Let's see what we get from simple concatenation:" ] }, { "cell_type": "code", "execution_count": 24, "id": "executive-hygiene", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/filipilievski/opt/anaconda3/envs/scenegen/lib/python3.8/site-packages/IPython/core/magic.py:187: DtypeWarning: Columns (7,9) have mixed types.Specify dtype option on import or set low_memory=False.\n", " call = lambda f, *a, **k: f(*a, **k)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 12s, sys: 21.9 s, total: 1min 34s\n", "Wall time: 1min 25s\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnode1relationnode2node1;labelnode2;labelrelation;labelrelation;dimensionsourcesentence
0/c/en/0-/r/DefinedAs-/c/en/empty_set-0000/c/en/0/r/DefinedAs/c/en/empty_set0empty setdefined asNaNCN[[0]] is the [[empty set]].
1/c/en/0-/r/DefinedAs-/c/en/first_limit_ordinal.../c/en/0/r/DefinedAs/c/en/first_limit_ordinal0first limit ordinaldefined asNaNCN[[0]] is the [[first limit ordinal]].
2/c/en/0-/r/DefinedAs-/c/en/number_zero-0000/c/en/0/r/DefinedAs/c/en/number_zero0number zerodefined asNaNCN[[0]] is the [[number zero]].
3/c/en/0-/r/HasContext-/c/en/internet_slang-0000/c/en/0/r/HasContext/c/en/internet_slang0internet slanghas contextNaNCNNaN
4/c/en/0-/r/HasProperty-/c/en/pronounced_zero-0000/c/en/0/r/HasProperty/c/en/pronounced_zero0pronounced zerohas propertyNaNCN[[\\0\\\"]] is [[pronounced zero]]\"
.................................
6773218wn:zucchini.n.01-mw:MayHaveProperty-wn:steamed...wn:zucchini.n.01mw:MayHavePropertywn:steamed.s.01zucchinisteamedmay have propertyNaNVGNaN
6773219wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow....wn:zucchini.n.01mw:MayHavePropertywn:yellow.s.01zucchini squashyellowmay have propertyNaNVGNaN
6773220wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow....wn:zucchini.n.01mw:MayHavePropertywn:yellow.s.01zucchiniyellowmay have propertyNaNVGNaN
6773221wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow....wn:zucchini.n.01mw:MayHavePropertywn:yellow.s.01zucchiniyellowmay have propertyNaNVGNaN
6773222wn:zwieback.n.01-/r/LocatedNear-wn:elephant.n....wn:zwieback.n.01/r/LocatedNearwn:elephant.n.01ruskelephantofNaNVGNaN
\n", "

6773223 rows × 10 columns

\n", "
" ], "text/plain": [ " id node1 \\\n", "0 /c/en/0-/r/DefinedAs-/c/en/empty_set-0000 /c/en/0 \n", "1 /c/en/0-/r/DefinedAs-/c/en/first_limit_ordinal... /c/en/0 \n", "2 /c/en/0-/r/DefinedAs-/c/en/number_zero-0000 /c/en/0 \n", "3 /c/en/0-/r/HasContext-/c/en/internet_slang-0000 /c/en/0 \n", "4 /c/en/0-/r/HasProperty-/c/en/pronounced_zero-0000 /c/en/0 \n", "... ... ... \n", "6773218 wn:zucchini.n.01-mw:MayHaveProperty-wn:steamed... wn:zucchini.n.01 \n", "6773219 wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... wn:zucchini.n.01 \n", "6773220 wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... wn:zucchini.n.01 \n", "6773221 wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... wn:zucchini.n.01 \n", "6773222 wn:zwieback.n.01-/r/LocatedNear-wn:elephant.n.... wn:zwieback.n.01 \n", "\n", " relation node2 node1;label \\\n", "0 /r/DefinedAs /c/en/empty_set 0 \n", "1 /r/DefinedAs /c/en/first_limit_ordinal 0 \n", "2 /r/DefinedAs /c/en/number_zero 0 \n", "3 /r/HasContext /c/en/internet_slang 0 \n", "4 /r/HasProperty /c/en/pronounced_zero 0 \n", "... ... ... ... \n", "6773218 mw:MayHaveProperty wn:steamed.s.01 zucchini \n", "6773219 mw:MayHaveProperty wn:yellow.s.01 zucchini squash \n", "6773220 mw:MayHaveProperty wn:yellow.s.01 zucchini \n", "6773221 mw:MayHaveProperty wn:yellow.s.01 zucchini \n", "6773222 /r/LocatedNear wn:elephant.n.01 rusk \n", "\n", " node2;label relation;label relation;dimension source \\\n", "0 empty set defined as NaN CN \n", "1 first limit ordinal defined as NaN CN \n", "2 number zero defined as NaN CN \n", "3 internet slang has context NaN CN \n", "4 pronounced zero has property NaN CN \n", "... ... ... ... ... \n", "6773218 steamed may have property NaN VG \n", "6773219 yellow may have property NaN VG \n", "6773220 yellow may have property NaN VG \n", "6773221 yellow may have property NaN VG \n", "6773222 elephant of NaN VG \n", "\n", " sentence \n", "0 [[0]] is the [[empty set]]. \n", "1 [[0]] is the [[first limit ordinal]]. 
\n", "2 [[0]] is the [[number zero]]. \n", "3 NaN \n", "4 [[\\0\\\"]] is [[pronounced zero]]\" \n", "... ... \n", "6773218 NaN \n", "6773219 NaN \n", "6773220 NaN \n", "6773221 NaN \n", "6773222 NaN \n", "\n", "[6773223 rows x 10 columns]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%%time\n", "kgtk(\"\"\"\n", " cat -i $TEMP/cskg_base.tsv\n", "\"\"\")" ] }, { "cell_type": "markdown", "id": "competent-record", "metadata": {}, "source": [ "As a second attempt, we add mappings between nodes and we use them to deduplicate" ] }, { "cell_type": "code", "execution_count": 40, "id": "latest-worship", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.57 s, sys: 506 ms, total: 2.08 s\n", "Wall time: 2min 26s\n" ] } ], "source": [ "%%time\n", "## Concatenate mappings\n", "!kgtk cat -i $TEMP/kgtk_atomic.tsv $TEMP/kgtk_conceptnet.tsv $TEMP/kgtk_framenet.tsv $TEMP/kgtk_visualgenome.tsv \\\n", "$mapping_lex $mapping_cnfn \\\n", "/ sort -c 'node1,relation,node2' \\\n", "/ compact --columns node1 relation node2 -o $TEMP/kgtk_compact_quoted.tsv\n" ] }, { "cell_type": "code", "execution_count": 48, "id": "peripheral-teach", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\trelation\tnode2\tnode1;label\tnode2;label\trelation;label\trelation;dimension\tsource\tsentence\tid\n", "/c/en/0\t/r/DefinedAs\t/c/en/empty_set\t\"0\"\t\"empty set\"\t\"defined as\"\t\t\"CN\"\t\"[[0]] is the [[empty set]].\"\t\n", "/c/en/0\t/r/DefinedAs\t/c/en/first_limit_ordinal\t\"0\"\t\"first limit ordinal\"\t\"defined as\"\t\t\"CN\"\t\"[[0]] is the [[first limit ordinal]].\"\t\n", "/c/en/0\t/r/DefinedAs\t/c/en/number_zero\t\"0\"\t\"number zero\"\t\"defined as\"\t\t\"CN\"\t\"[[0]] is the [[number zero]].\"\t\n", "/c/en/0\t/r/HasContext\t/c/en/internet_slang\t\"0\"\t\"internet slang\"\t\"has context\"\t\t\"CN\"\t\t\n", 
"/c/en/0\t/r/HasProperty\t/c/en/pronounced_zero\t\"0\"\t\"pronounced zero\"\t\"has property\"\t\t\"CN\"\t\"[[\\\"0\\\"]] is [[pronounced zero]]\"\t\n", "/c/en/0\t/r/IsA\t/c/en/set_containing_one_element\t\"0\"\t\"set containing one element\"\t\"is a\"\t\t\"CN\"\t\"[[{0}]] is a type of [[set containing one element]].\"\t\n", "/c/en/0\t/r/RelatedTo\t/c/en/1\t\"0\"\t\"1\"\t\"related to\"\t\t\"CN\"\t\t\n", "/c/en/0\t/r/RelatedTo\t/c/en/2\t\"0\"\t\"2\"\t\"related to\"\t\t\"CN\"\t\t\n", "/c/en/0.22_inch_calibre\t/r/IsA\t/c/en/5.6_millimetres\t\"0.22 inch calibre\"\t\"5.6 millimetres\"\t\"is a\"\t\t\"CN\"\t\"[[0.22 inch calibre]] is [[5.6 millimetres]]\"\t\n" ] } ], "source": [ "!head $TEMP/kgtk_compact_quoted.tsv" ] }, { "cell_type": "code", "execution_count": 49, "id": "palestinian-scanner", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.82 s, sys: 590 ms, total: 2.41 s\n", "Wall time: 2min 30s\n" ] } ], "source": [ "%%time\n", "# Deduplicate\n", "!kgtk connected_components -i $TEMP/kgtk_compact_quoted.tsv \\\n", "--properties mw:SameAs --cluster-name-method lowest \\\n", "/ lift --columns-to-lift node1 node2 --lift-suffix= \\\n", "--input-file $TEMP/kgtk_compact_quoted.tsv --label-file - \\\n", "--label-select-value connected_component \\\n", "/ filter --invert -p ';mw:SameAs;' \\\n", "/ compact --columns node1 relation node2 --presorted False \\\n", "/ add_id --id-style node1-label-node2-num / \\\n", "reorder_columns --columns id ... -o $OUT/cskg.tsv.gz" ] }, { "cell_type": "markdown", "id": "super-shannon", "metadata": {}, "source": [ "## 3. 
Analyzing the Commonsense Knowledge Graph" ] }, { "cell_type": "markdown", "id": "prerequisite-passenger", "metadata": {}, "source": [ "We first compute statistics of the graph:" ] }, { "cell_type": "code", "execution_count": 52, "id": "numeric-soldier", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "objc[24619]: Class GNotificationCenterDelegate is implemented in both /Users/filipilievski/opt/anaconda3/envs/scenegen/lib/libgio-2.0.0.dylib (0x12bdd09b0) and /usr/local/opt/glib/lib/libgio-2.0.0.dylib (0x14f2f22f8). One of the two will be used. Which one is undefined.\n", "CPU times: user 1.41 s, sys: 478 ms, total: 1.89 s\n", "Wall time: 2min 1s\n" ] } ], "source": [ "%%time\n", "!kgtk graph_statistics -i $OUT/cskg.tsv.gz \\\n", "--degrees --hits --pagerank --statistics-only \\\n", "--log $TEMP/summary.txt -o $TEMP/statistics.tsv" ] }, { "cell_type": "markdown", "id": "526712ed-4290-4e73-a0d5-a7a20e7fa362", "metadata": {}, "source": [ "Let's see the summarized information about the graph nodes, relations, and centrality metrics." ] }, { "cell_type": "code", "execution_count": 54, "id": "surgical-longitude", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "graph loaded! 
It has 2102795 nodes and 4481705 edges\n", "\n", "###Top relations:\n", "/r/RelatedTo\t1703951\n", "/r/FormOf\t378859\n", "/r/DerivedFrom\t325374\n", "/r/HasContext\t232935\n", "/r/IsA\t231424\n", "/r/Synonym\t222155\n", "/r/LocatedNear\t157204\n", "at:xAttr\t133281\n", "at:xWant\t129171\n", "at:xEffect\t100307\n", "\n", "###Degrees:\n", "in degree stats: mean=2.131309, std=0.020009, max=1\n", "out degree stats: mean=2.131309, std=0.007436, max=1\n", "total degree stats: mean=4.262617, std=0.022866, max=1\n", "\n", "###PageRank\n", "Max pageranks\n", "14095\t/c/en/entity/n/wn\t0.004315\n", "13917\t/c/en/abstraction/n/wn\t0.003083\n", "319950\t/c/en/physical_entity/n/wn\t0.001992\n", "210\t/c/en/organic_compound\t0.001859\n", "126760\t/c/en/whole/n/wn\t0.001584\n", "\n", "###HITS\n", "HITS hubs\n", "2091640\twn:white.a.01\t0.134129\n", "2091604\twn:black.a.01\t0.126603\n", "2091632\twn:red.s.01\t0.119914\n", "2091546\twn:small.a.01\t0.114502\n", "2091608\twn:brown.s.01\t0.114064\n", "HITS auth\n", "2091583\twn:man.n.01\t0.114008\n", "2091600\twn:woman.n.01\t0.098821\n", "2091726\twn:person.n.01\t0.098466\n", "2091532\twn:sign.n.02\t0.095097\n", "2091854\twn:tree.n.01\t0.086341\n" ] } ], "source": [ "!cat $TEMP/summary.txt" ] }, { "cell_type": "code", "execution_count": 55, "id": "secret-synthesis", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "/c/en/0\tvertex_in_degree\t21\t/c/en/0-vertex_in_degree-0\n", "/c/en/0\tvertex_out_degree\t8\t/c/en/0-vertex_out_degree-1\n", "/c/en/0\tvertex_pagerank\t2.1814036156011614e-06\t/c/en/0-vertex_pagerank-2\n", "/c/en/0\tvertex_hubs\t7.957265660837697e-14\t/c/en/0-vertex_hubs-3\n", "/c/en/0\tvertex_auth\t3.18954781666107e-15\t/c/en/0-vertex_auth-4\n", "/c/en/empty_set\tvertex_in_degree\t21\t/c/en/empty_set-vertex_in_degree-5\n", "/c/en/empty_set\tvertex_out_degree\t5\t/c/en/empty_set-vertex_out_degree-6\n", 
"/c/en/empty_set\tvertex_pagerank\t1.743039400606911e-06\t/c/en/empty_set-vertex_pagerank-7\n", "/c/en/empty_set\tvertex_hubs\t3.6071192796944516e-15\t/c/en/empty_set-vertex_hubs-8\n" ] } ], "source": [ "!head $TEMP/statistics.tsv" ] }, { "cell_type": "markdown", "id": "varied-geography", "metadata": {}, "source": [ "Let's find paths between some node pairs:" ] }, { "cell_type": "code", "execution_count": 56, "id": "periodic-female", "metadata": {}, "outputs": [], "source": [ "%%bash\n", "# Write the path-query file: a header plus one tab-separated row per (node1, node2) pair, labelled 'path'.\n", "# The here-document (<<EOF ... EOF) supplies the rows on cat's stdin; '>' redirects them into the file.\n", "cat <<EOF >$TEMP/path-query.tsv\n", "node1\tnode2\tlabel\n", "/c/en/politician\t/c/en/lie\tpath\n", "/c/en/politician\t/c/en/actor\tpath\n", "/c/en/politician\t/c/en/film\tpath\n", "EOF" ] }, { "cell_type": "code", "execution_count": 60, "id": "confident-england", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "objc[24659]: Class GNotificationCenterDelegate is implemented in both /Users/filipilievski/opt/anaconda3/envs/scenegen/lib/libgio-2.0.0.dylib (0x126b339b0) and /usr/local/opt/glib/lib/libgio-2.0.0.dylib (0x14a0792f8). One of the two will be used. 
Which one is undefined.\n", "\n", "CPU times: user 19.7 ms, sys: 31.4 ms, total: 51 ms\n", "Wall time: 51 s\n" ] } ], "source": [ "%%time\n", "kgtk(\"\"\"\n", " paths -i $OUT/cskg.tsv.gz\n", " --verbose False\n", " --max_hops 2\n", " --statistics-only True\n", " --path-file $TEMP/path-query.tsv\n", " -o $TEMP/path-results.tsv\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 62, "id": "excited-danger", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\tid\n", "p0\t0\t/c/en/politician-/r/Antonym-/c/en/honest-0000\tp0-0-0\n", "p0\t1\t/c/en/honest-/r/Antonym-/c/en/lie-0000\tp0-1-1\n", "p1\t0\t/c/en/politician-/r/Antonym-/c/en/honest-0000\tp1-0-2\n", "p1\t1\t/c/en/honest-/r/DistinctFrom-/c/en/lie-0000\tp1-1-3\n", "p2\t0\t/c/en/politician-/r/CapableOf-/c/en/lie-0000\tp2-0-4\n", "p3\t0\t/c/en/politician-/r/IsA-/c/en/human-0000\tp3-0-5\n", "p3\t1\t/c/en/human-/r/AtLocation-/c/en/lie-0000\tp3-1-6\n", "p4\t0\t/c/en/politician-/r/RelatedTo-/c/en/liar-0000\tp4-0-7\n", "p4\t1\t/c/en/liar-/r/EtymologicallyRelatedTo-/c/en/lie-0000\tp4-1-8\n" ] } ], "source": [ "!head $TEMP/path-results.tsv" ] }, { "cell_type": "markdown", "id": "partial-demand", "metadata": {}, "source": [ "The next command uses a query to retrieve the node1, label, node2 for each of the edges in the paths. We show the results through add-labels to help us interpret the results." ] }, { "cell_type": "code", "execution_count": 65, "id": "mental-publicity", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2pathsegment
0politicianantonymhonestp0p0-0-0
1honestantonymliep0p0-1-1
2politicianantonymhonestp1p1-0-2
3honestdistinct fromliep1p1-1-3
4politiciancapable ofliep2p2-0-4
5politicianis ahumanp3p3-0-5
6humanat locationliep3p3-1-6
7politicianrelated toliarp4p4-0-7
8liaretymologically related toliep4p4-1-8
9politicianrelated tolyingp5p5-0-9
10lyingetymologically related toliep5p5-1-10
\n", "
" ], "text/plain": [ " node1 label node2 path segment\n", "0 politician antonym honest p0 p0-0-0\n", "1 honest antonym lie p0 p0-1-1\n", "2 politician antonym honest p1 p1-0-2\n", "3 honest distinct from lie p1 p1-1-3\n", "4 politician capable of lie p2 p2-0-4\n", "5 politician is a human p3 p3-0-5\n", "6 human at location lie p3 p3-1-6\n", "7 politician related to liar p4 p4-0-7\n", "8 liar etymologically related to lie p4 p4-1-8\n", "9 politician related to lying p5 p5-0-9\n", "10 lying etymologically related to lie p5 p5-1-10" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kgtk(\"\"\"\n", " query -i $OUT/cskg.tsv.gz -i $TEMP/path-results.tsv\n", " --match '\n", " path: (path)-[segment]->(edge),\n", " cskg: (n1)-[edge {relation: property}]->(n2)'\n", " --return 'n1.label as node1, property.label as label, n2.label as node2, path as path, segment as segment'\n", " --order-by 'path, segment'\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "id": "surprised-simpson", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }