{ "cells": [ { "cell_type": "markdown", "id": "banned-position", "metadata": {}, "source": [ "# Building a Harmonized Commonsense Knowledge Graph" ] }, { "cell_type": "code", "execution_count": 1, "id": "dying-negotiation", "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "sys.path.insert(0,'..')\n", "from kgtk.configure_kgtk_notebooks import ConfigureKGTK\n", "\n", "from kgtk.functions import kgtk, kypher" ] }, { "cell_type": "code", "execution_count": 2, "id": "bright-snapshot", "metadata": {}, "outputs": [], "source": [ "# Parameters\n", "\n", "kgtk_path = \"/Users/filipilievski/mcs/kgtk\"\n", "\n", "tutorial_deployment_path = \"/Users/filipilievski/mcs/kgtk-tutorial-files/datasets\"\n", "project_deployment_path = tutorial_deployment_path + \"/arnold-network-analysis\"\n", "\n", "# Folder on local machine where to create the output and temporary folders\n", "input_path = \"/Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense\"\n", "output_path = \"/Users/filipilievski/mcs/kgtk-projects\"\n", "project_name = \"building-commonsense-knowledge-graph\"" ] }, { "cell_type": "code", "execution_count": 34, "id": "perfect-eagle", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "User home: /Users/filipilievski\n", "Current dir: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph\n", "KGTK dir: /Users/filipilievski/mcs/kgtk\n", "Use-cases dir: /Users/filipilievski/mcs/kgtk/use-cases\n" ] } ], "source": [ "files = [\n", " \"conceptnet\", \n", " \"vg_graphs\", \n", " \"vg_synsets\", \n", " \"atomic\",\n", " \"mapping_lex\",\n", " \"mapping_cnfn\"\n", "]\n", "\n", "ck = ConfigureKGTK(files, kgtk_path=kgtk_path)\n", "ck.configure_kgtk(input_graph_path=input_path,\n", " output_path=output_path,\n", " project_name=project_name)" ] }, { "cell_type": "code", "execution_count": 35, "id": "suffering-borough", "metadata": {}, "outputs": [], "source": [ 
"os.environ[\"conceptnet\"]=\"%s/conceptnet-assertions-5.7.0.csv\" % os.environ[\"GRAPH\"]\n", "os.environ[\"vg_graphs\"]=\"%s/visualgenome/scene_graphs.json\" % os.environ[\"GRAPH\"]\n", "os.environ[\"vg_synsets\"]=\"%s/visualgenome/attribute_synsets.json\" % os.environ[\"GRAPH\"]\n", "os.environ[\"atomic\"]=\"%s/v4_atomic_all_agg.csv\" % os.environ[\"GRAPH\"]\n", "os.environ[\"mapping_lex\"]=\"%s/mappings/lexical_mappings.tsv\" % os.environ[\"GRAPH\"]\n", "os.environ[\"mapping_cnfn\"]=\"%s/mappings/mapping_fn_cn.tsv\" % os.environ[\"GRAPH\"]\n", "\n", "os.environ['kgtk_path'] = kgtk_path\n", "os.environ['KGTK_GRAPH_CACHE'] = os.environ['STORE']\n", "os.environ['KGTK_OPTION_DEBUG'] = \"false\"" ] }, { "cell_type": "code", "execution_count": 36, "id": "frequent-multiple", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "USE_CASES_DIR: /Users/filipilievski/mcs/kgtk/use-cases\n", "GRAPH: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense\n", "TEMP: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph\n", "kgtk: kgtk\n", "EXAMPLES_DIR: /Users/filipilievski/mcs/kgtk/examples\n", "OUT: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph\n", "kypher: kgtk query --graph-cache /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph/wikidata.sqlite3.db\n", "STORE: /Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph/temp.building-commonsense-knowledge-graph/wikidata.sqlite3.db\n", "conceptnet: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/conceptnet-assertions-5.7.0.csv\n", "vg_graphs: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/visualgenome/scene_graphs.json\n", "vg_synsets: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/visualgenome/attribute_synsets.json\n", "atomic: 
/Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/v4_atomic_all_agg.csv\n", "mapping_lex: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/mappings/lexical_mappings.tsv\n", "mapping_cnfn: /Users/filipilievski/mcs/kgtk-tutorial-files/datasets/common-sense/mappings/mapping_fn_cn.tsv\n" ] } ], "source": [ "ck.print_env_variables()" ] }, { "cell_type": "markdown", "id": "seeing-separation", "metadata": {}, "source": [ "Optionally, load all input files into the kypher cache so that all graph aliases are defined. The call in the next cell is commented out, so this step is skipped unless you uncomment it." ] }, { "cell_type": "code", "execution_count": 37, "id": "previous-chosen", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 3 µs, sys: 0 ns, total: 3 µs\n", "Wall time: 6.2 µs\n" ] } ], "source": [ "%%time\n", "#ck.load_files_into_cache()" ] }, { "cell_type": "code", "execution_count": 38, "id": "approximate-floating", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/filipilievski/mcs/kgtk-projects/building-commonsense-knowledge-graph\n" ] } ], "source": [ "%cd {os.environ['OUT']}" ] }, { "cell_type": "markdown", "id": "ancient-cedar", "metadata": {}, "source": [ "## 1. Import individual graphs in KGTK\n", "\n", "We will first import the individual resources in KGTK format:\n", "1. ConceptNet\n", "2. FrameNet\n", "3. Visual Genome\n", "4. 
ATOMIC" ] }, { "cell_type": "code", "execution_count": 8, "id": "thick-albany", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.92 s, sys: 572 ms, total: 2.49 s\n", "Wall time: 3min 33s\n" ] } ], "source": [ "%%time\n", "# Import ConceptNet (restricted via --english_only) from the raw $conceptnet dump\n", "!kgtk import-conceptnet --english_only -i $conceptnet -o $TEMP/kgtk_conceptnet.tsv" ] }, { "cell_type": "code", "execution_count": 10, "id": "purple-adelaide", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[nltk_data] Downloading package framenet_v17 to\n", "[nltk_data] /Users/filipilievski/nltk_data...\n", "[nltk_data] Package framenet_v17 is already up-to-date!\n", "CPU times: user 290 ms, sys: 94.8 ms, total: 384 ms\n", "Wall time: 27.1 s\n" ] } ], "source": [ "%%time\n", "# Import FrameNet (no input file is passed; the run fetches the NLTK framenet_v17 package)\n", "!kgtk import-framenet -o $TEMP/kgtk_framenet.tsv" ] }, { "cell_type": "code", "execution_count": 13, "id": "partial-proof", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 442 ms, sys: 141 ms, total: 583 ms\n", "Wall time: 40.7 s\n" ] } ], "source": [ "%%time\n", "# Import Visual Genome from the scene graphs and the attribute synsets files\n", "!kgtk import-visualgenome -i $vg_graphs --attr-synsets $vg_synsets \\\n", " -o $TEMP/kgtk_visualgenome.tsv" ] }, { "cell_type": "code", "execution_count": 15, "id": "subsequent-profile", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 149 ms, sys: 54.7 ms, total: 204 ms\n", "Wall time: 14.1 s\n" ] } ], "source": [ "%%time\n", "# Import ATOMIC from the raw $atomic CSV file\n", "!kgtk import-atomic -i $atomic -o $TEMP/kgtk_atomic.tsv" ] }, { "cell_type": "markdown", "id": "hairy-orleans", "metadata": {}, "source": [ "## 2. 
Combine sources" ] }, { "cell_type": "markdown", "id": "addressed-sentence", "metadata": {}, "source": [ "We will first concatenate the sources to create `cskg_base.tsv`:" ] }, { "cell_type": "code", "execution_count": 20, "id": "wrong-powder", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 896 ms, sys: 280 ms, total: 1.18 s\n", "Wall time: 1min 18s\n" ] } ], "source": [ "%%time\n", "!kgtk cat -i $TEMP/kgtk_atomic.tsv $TEMP/kgtk_conceptnet.tsv $TEMP/kgtk_framenet.tsv $TEMP/kgtk_visualgenome.tsv \\\n", "/ sort -c 'node1,relation,node2' \\\n", "/ add_id --id-style node1-label-node2-num \\\n", "/ reorder_columns --columns id ... -o $TEMP/cskg_base.tsv" ] }, { "cell_type": "markdown", "id": "satisfactory-gender", "metadata": {}, "source": [ "Let's see what we get from simple concatenation:" ] }, { "cell_type": "code", "execution_count": 24, "id": "executive-hygiene", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/filipilievski/opt/anaconda3/envs/scenegen/lib/python3.8/site-packages/IPython/core/magic.py:187: DtypeWarning: Columns (7,9) have mixed types.Specify dtype option on import or set low_memory=False.\n", " call = lambda f, *a, **k: f(*a, **k)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1min 12s, sys: 21.9 s, total: 1min 34s\n", "Wall time: 1min 25s\n" ] }, { "data": { "text/html": [ "
\n", " | id | \n", "node1 | \n", "relation | \n", "node2 | \n", "node1;label | \n", "node2;label | \n", "relation;label | \n", "relation;dimension | \n", "source | \n", "sentence | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "/c/en/0-/r/DefinedAs-/c/en/empty_set-0000 | \n", "/c/en/0 | \n", "/r/DefinedAs | \n", "/c/en/empty_set | \n", "0 | \n", "empty set | \n", "defined as | \n", "NaN | \n", "CN | \n", "[[0]] is the [[empty set]]. | \n", "
1 | \n", "/c/en/0-/r/DefinedAs-/c/en/first_limit_ordinal... | \n", "/c/en/0 | \n", "/r/DefinedAs | \n", "/c/en/first_limit_ordinal | \n", "0 | \n", "first limit ordinal | \n", "defined as | \n", "NaN | \n", "CN | \n", "[[0]] is the [[first limit ordinal]]. | \n", "
2 | \n", "/c/en/0-/r/DefinedAs-/c/en/number_zero-0000 | \n", "/c/en/0 | \n", "/r/DefinedAs | \n", "/c/en/number_zero | \n", "0 | \n", "number zero | \n", "defined as | \n", "NaN | \n", "CN | \n", "[[0]] is the [[number zero]]. | \n", "
3 | \n", "/c/en/0-/r/HasContext-/c/en/internet_slang-0000 | \n", "/c/en/0 | \n", "/r/HasContext | \n", "/c/en/internet_slang | \n", "0 | \n", "internet slang | \n", "has context | \n", "NaN | \n", "CN | \n", "NaN | \n", "
4 | \n", "/c/en/0-/r/HasProperty-/c/en/pronounced_zero-0000 | \n", "/c/en/0 | \n", "/r/HasProperty | \n", "/c/en/pronounced_zero | \n", "0 | \n", "pronounced zero | \n", "has property | \n", "NaN | \n", "CN | \n", "[[\\0\\\"]] is [[pronounced zero]]\" | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
6773218 | \n", "wn:zucchini.n.01-mw:MayHaveProperty-wn:steamed... | \n", "wn:zucchini.n.01 | \n", "mw:MayHaveProperty | \n", "wn:steamed.s.01 | \n", "zucchini | \n", "steamed | \n", "may have property | \n", "NaN | \n", "VG | \n", "NaN | \n", "
6773219 | \n", "wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... | \n", "wn:zucchini.n.01 | \n", "mw:MayHaveProperty | \n", "wn:yellow.s.01 | \n", "zucchini squash | \n", "yellow | \n", "may have property | \n", "NaN | \n", "VG | \n", "NaN | \n", "
6773220 | \n", "wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... | \n", "wn:zucchini.n.01 | \n", "mw:MayHaveProperty | \n", "wn:yellow.s.01 | \n", "zucchini | \n", "yellow | \n", "may have property | \n", "NaN | \n", "VG | \n", "NaN | \n", "
6773221 | \n", "wn:zucchini.n.01-mw:MayHaveProperty-wn:yellow.... | \n", "wn:zucchini.n.01 | \n", "mw:MayHaveProperty | \n", "wn:yellow.s.01 | \n", "zucchini | \n", "yellow | \n", "may have property | \n", "NaN | \n", "VG | \n", "NaN | \n", "
6773222 | \n", "wn:zwieback.n.01-/r/LocatedNear-wn:elephant.n.... | \n", "wn:zwieback.n.01 | \n", "/r/LocatedNear | \n", "wn:elephant.n.01 | \n", "rusk | \n", "elephant | \n", "of | \n", "NaN | \n", "VG | \n", "NaN | \n", "
6773223 rows × 10 columns
\n", "\n", " | node1 | \n", "label | \n", "node2 | \n", "path | \n", "segment | \n", "
---|---|---|---|---|---|
0 | \n", "politician | \n", "antonym | \n", "honest | \n", "p0 | \n", "p0-0-0 | \n", "
1 | \n", "honest | \n", "antonym | \n", "lie | \n", "p0 | \n", "p0-1-1 | \n", "
2 | \n", "politician | \n", "antonym | \n", "honest | \n", "p1 | \n", "p1-0-2 | \n", "
3 | \n", "honest | \n", "distinct from | \n", "lie | \n", "p1 | \n", "p1-1-3 | \n", "
4 | \n", "politician | \n", "capable of | \n", "lie | \n", "p2 | \n", "p2-0-4 | \n", "
5 | \n", "politician | \n", "is a | \n", "human | \n", "p3 | \n", "p3-0-5 | \n", "
6 | \n", "human | \n", "at location | \n", "lie | \n", "p3 | \n", "p3-1-6 | \n", "
7 | \n", "politician | \n", "related to | \n", "liar | \n", "p4 | \n", "p4-0-7 | \n", "
8 | \n", "liar | \n", "etymologically related to | \n", "lie | \n", "p4 | \n", "p4-1-8 | \n", "
9 | \n", "politician | \n", "related to | \n", "lying | \n", "p5 | \n", "p5-0-9 | \n", "
10 | \n", "lying | \n", "etymologically related to | \n", "lie | \n", "p5 | \n", "p5-1-10 | \n", "