{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Embeddings" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/aliases.en.tsv.gz\"\n", "ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/all.tsv.gz\"\n", "CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.tsv.gz\"\n", "DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/descriptions.en.tsv.gz\"\n", "EXAMPLES_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/examples\"\n", "GE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding\"\n", "ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.isa.tsv.gz\"\n", "ITEM: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/claims.wikibase-item.tsv.gz\"\n", "KGTK_PATH: \"/Users/pedroszekely/Documents/GitHub/kgtk\"\n", "LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/labels.en.tsv.gz\"\n", "OUT: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output\"\n", "P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279.tsv.gz\"\n", "P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/derived.P279star.tsv.gz\"\n", "PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/metadata.property.datatypes.tsv.gz\"\n", "Q154ALIAS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/aliases.en.tsv.gz\"\n", "Q154ALL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/all.tsv.gz\"\n", "Q154CLAIMS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.tsv.gz\"\n", "Q154DESCRIPTION: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/descriptions.en.tsv.gz\"\n", "Q154ISA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.isa.tsv.gz\"\n", "Q154ITEM: 
\"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/claims.wikibase-item.tsv.gz\"\n", "Q154LABEL: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/labels.en.tsv.gz\"\n", "Q154P279: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279.tsv.gz\"\n", "Q154P279STAR: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/derived.P279star.tsv.gz\"\n", "Q154PROPERTY_DATATYPES: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/metadata.property.datatypes.tsv.gz\"\n", "Q154QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.tsv.gz\"\n", "Q154QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/qualifiers.time.tsv.gz\"\n", "Q154SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/output/parts/sitelinks.tsv.gz\"\n", "QUALIFIERS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.tsv.gz\"\n", "QUALIFIERS_TIME: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/qualifiers.time.tsv.gz\"\n", "SITELINKS: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/sitelinks.tsv.gz\"\n", "STORE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n", "TE: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding\"\n", "TEMP: \"/Users/pedroszekely/Downloads/kgtk-tutorial/temp\"\n", "USECASE_DIR: \"/Users/pedroszekely/Documents/GitHub/kgtk/use-cases\"\n", "WIKIDATA: \"/Users/pedroszekely/Downloads/kgtk-tutorial/miniwikidata/\"\n", "kgtk: \"kgtk --debug\"\n", "kypher: \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kgtk-tutorial/wikidata.sqlite3.miniwikidata.db\"\n" ] } ], "source": [ "import sys \n", "sys.path.insert(0, 'tutorial')\n", "from tutorial_setup import *" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/Users/pedroszekely/Downloads/kgtk-tutorial\n" ] } ], "source": [ "%cd {output_path}" ] }, { "cell_type": "code", 
"execution_count": 20, "metadata": {}, "outputs": [], "source": [ "!cp $text_embedding_path $TE" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Graph Embeddings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normally, we would use `Q154ITEM`, but the partioning failed so we will compute it using kypher\n", "\n", "Amandeep, Jan 14, 2021: Partition succeeded, change this?" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "os.environ[\"Q154GRAPH\"] = os.environ[\"TEMP\"] + \"/Q154.edges.4.tsv.gz\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\tnode2;wikidatatype\n", "P10-P1855-Q7378-555592a4-0\tP10\tP1855\tQ7378\twikibase-item\n", "P1001-P1855-Q181574-7f428c9b-0\tP1001\tP1855\tQ181574\twikibase-item\n", "P1001-P1855-Q29868931-76b67d84-0\tP1001\tP1855\tQ29868931\twikibase-item\n", "P1001-P1855-Q3917299-6a038117-0\tP1001\tP1855\tQ3917299\twikibase-item\n", "P1001-P1855-Q7889769-df478289-0\tP1001\tP1855\tQ7889769\twikibase-item\n", "P1001-P1855-Q8901-15be5b36-0\tP1001\tP1855\tQ8901\twikibase-item\n", "P1004-P1855-Q209651-5b585f66-0\tP1004\tP1855\tQ209651\twikibase-item\n", "P1004-P1855-Q5471-0ca2dad9-0\tP1004\tP1855\tQ5471\twikibase-item\n", "P1005-P17-Q45-fa9c1f36-0\tP1005\tP17\tQ45\twikibase-item\n", "zcat: error writing to output: Broken pipe\n" ] } ], "source": [ "!zcat < \"$Q154ITEM\" | head" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 185238 764122 10113585\n" ] } ], "source": [ "!zcat < \"$Q154GRAPH\" | wc" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "!$kypher -i \"$Q154GRAPH\" -i \"$TEMP\"/Q154.metadata.property.datatype.tsv.gz -i \"$Q154LABEL\" \\\n", "--match 'edges: (n1)-[l {label: property}]->(n2), datatype: 
(property)-[]->(dt:`wikibase-item`), label: (n1)-[]->(lab)' \\\n", "--return 'distinct l as id, n1 as node1, l.label as label, n2 as node2' \\\n", "-o \"$GE\"/geinput.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We have almost 60,000 lines:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 59331 237324 2927769 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/geinput.tsv\n" ] } ], "source": [ "!wc \"$GE\"/geinput.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Compute the graph embeddings using the default settings. Our output file `embeddings.txt` will be in word2vec format so we can use it directly in gensim." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "In Processing, Please go to /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/ge.log to check details\n", "Opening the input file: /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/geinput.tsv\n", "KgtkReader: File_path.suffix: .tsv\n", "KgtkReader: reading file /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/geinput.tsv\n", "header: id\tnode1\tlabel\tnode2\n", "node1 column found, this is a KGTK edge file\n", "KgtkReader: Special columns: node1=1 label=2 node2=3 id=0\n", "KgtkReader: Reading an edge file.\n", "Opening the output file: /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/tmp_geinput.tsv\n", "File_path.suffix: .tsv\n", "KgtkWriter: writing file /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/tmp_geinput.tsv\n", "header: id\tnode1\tlabel\tnode2\n", "Processing the input records.\n", "Processed 59330 records.\n", "Processed Finished.\n" ] } ], "source": [ "!$kgtk graph-embeddings --verbose -i \"$GE\"/geinput.tsv \\\n", "-o \"$GE\"/embeddings.txt \\\n", "--retain_temporary_data True \\\n", "--operator 
translation \\\n", "--workers 5 \\\n", "--log \"$GE\"/ge.log \\\n", "-T \"$GE\" \\\n", "-ot w2v \\\n", "-e 600" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the output directory" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 103280\n", "-rw-r--r-- 1 pedroszekely staff 46M Jan 24 13:34 embeddings.txt\n", "-rw-r--r-- 1 pedroszekely staff 949K Jan 24 13:34 ge.log\n", "-rw-r--r-- 1 pedroszekely staff 2.8M Jan 24 13:29 geinput.tsv\n", "drwxr-xr-x 10 pedroszekely staff 320B Jan 24 13:33 \u001b[34moutput\u001b[m\u001b[m\n", "-rw-r--r-- 1 pedroszekely staff 1.1M Jan 24 13:29 tmp_geinput.tsv\n" ] } ], "source": [ "!ls -hl \"$GE\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's peek at the file: we have about 38K vectors of dimension 100" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "37950 100\n", "Q16963 -0.164185032 -0.385678679 -0.420757115 -0.225723416 -0.198370636 -0.639363647 -1.255813837 0.270225614 -0.272446662 0.066466041 -0.684449971 -0.646225393 0.655371726 0.400233626 -0.804983258 -0.407879442 -0.515173197 -1.094875097 -0.136672303 0.168045580 0.860949099 0.058952466 -0.393605471 0.034964170 -0.314580619 -0.375688851 0.132073671 -0.325051397 0.428311199 -0.724577427 -0.407612741 -0.259547085 0.170578405 0.244398162 0.578736067 -0.493021011 0.049207758 -0.011272416 0.574268937 -0.153083041 0.162458181 0.404403090 0.492393345 0.456173748 0.121200345 -0.969949603 0.057622679 0.437244385 -0.086911105 0.005650789 0.389287740 -0.682103992 -0.035925489 -0.142533764 0.495400161 -0.074574098 0.899632275 -0.513101518 0.876668990 -0.269911796 -0.116927855 -0.441026568 0.226159930 -0.396051615 0.553563178 0.396098971 -0.880450726 0.766135514 -0.105848886 -0.249817505 0.453038782 0.779053926 -0.202684402 -0.932523191 0.248839334 0.450294167 
-0.427730024 0.355414480 0.755905151 -0.428116947 0.185254753 0.116567001 -0.184296042 -0.384050071 0.131287560 -0.611034155 -0.086580344 -0.167054191 0.608548403 -0.247129604 0.471467584 0.127500564 0.313402355 0.121378131 0.322909534 0.602852583 -1.269117355 -0.036641046 -0.184879452 0.339786947\n" ] } ], "source": [ "!head -2 \"$GE\"/embeddings.txt" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the vectors in gensim" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "path = os.environ['GE'] + \"/embeddings.txt\"\n", "ge_vectors = KeyedVectors.load_word2vec_format(path, binary=False)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([-0.13319838, -0.31249306, 0.5444786 , 0.2620247 , -0.21586373,\n", " 0.4774543 , 0.35513186, 0.25343668, 0.62720376, -0.64378893,\n", " -0.1334848 , 0.37392113, -0.05861791, -0.49614078, 0.00418998,\n", " -0.38867894, -0.11065244, -0.26280305, 0.03402166, 0.6086037 ,\n", " 0.15529938, -0.47691786, 0.36543345, -1.0218014 , 0.47811472,\n", " -0.46090993, 0.69470024, 1.2210124 , 0.9414259 , -0.14485389,\n", " -0.58419716, -0.33650464, -0.00363764, 0.40011287, -0.6029851 ,\n", " -0.53706914, -0.38429427, -0.7802921 , -0.12297965, -0.33499527,\n", " -0.33098102, 0.60647637, 0.42458817, 0.26405805, -0.21734366,\n", " 0.4993593 , 0.2920877 , -0.5102409 , -0.07359102, 0.2263889 ,\n", " -0.31796598, 0.11023172, -0.24173269, 0.7149039 , 0.51946294,\n", " 0.79202014, -0.36261937, -0.925887 , 1.0097575 , 0.35304487,\n", " 0.468682 , -0.1106649 , 0.06289238, 0.30957982, 0.57368666,\n", " -0.3236156 , -0.02455906, -0.18257451, -0.01503056, -0.60159665,\n", " -0.29146332, -0.75840944, -0.11806496, -0.05531871, 0.47039658,\n", " 0.43223456, 0.30771655, 0.25710636, -0.37061483, 0.3680209 ,\n", " -0.44109678, -0.10163894, -0.08232214, 0.189794 , -0.3547559 ,\n", " 0.56447357, 1.0308425 , -0.408733 , -0.3570608 , 1.2331557 
,\n", " -0.6581403 , 0.5254171 , -0.02074503, 0.10353643, 0.16922595,\n", " 0.5411031 , -0.30705008, 0.27219167, -0.29930815, 0.08597058],\n", " dtype=float32)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q502268 is Johnnie Walker\n", "ge_vectors['Q502268']" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find the most similar qnodes to `Q15874936`, the qnode for Michelob." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('P1562', 0.8587111234664917),\n", " ('Q7302072', 0.8569362163543701),\n", " ('P2174', 0.8532211184501648),\n", " ('Q2567026', 0.8531550168991089),\n", " ('Q5647008', 0.8489059209823608)]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ge_vectors.most_similar(positive=['Q15874936'], topn=5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This is hard to use because the reuslt are qnodes and we have no idea what they are. Let's define a function to fetch the labels and descriptions so that we can interpret the results more easily" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "`kgtk_most_similar` is a wrapper to gensim's `most_similar` function, and it is designed to output the results in KGTK format. The `kgtk_path` is required if we want to output the labels and descriptios as this path is where the `labels.en.tsv.gz` and `descriptions.en.tsv.gz` files care stored. You can optionally provide a `output_path` to tell it to sotre the results in a file; otherwise the results will be returned as a dataframe." 
def kgtk_most_similar(
    vectors,
    positive,
    relation_label="similarity_score",
    kg_path=None,
    add_label_description=True,
    output_path=None,
    topn=25,
):
    """
    Report the entries most similar to ``positive`` as KGTK-style rows.

    This wraps ``vectors.most_similar(positive=positive, topn=topn)`` and turns
    the ``(key, similarity)`` pairs it returns into tab-separated rows.

    Two output flavours exist:

    * Plain (``kg_path`` is empty or ``add_label_description`` is false):
      columns ``node1``, ``label``, ``node2`` where ``label`` is
      ``relation_label`` and ``node2`` is the similarity.
    * Enriched (``kg_path`` set and ``add_label_description`` true): the
      similarity edges are written to a temporary file and joined with
      ``<kg_path>/labels.en.tsv.gz`` and ``<kg_path>/descriptions.en.tsv.gz``
      by a ``$kypher_raw`` query. The rows are whatever that command prints;
      the query asks for ``node1``, ``node2``, ``label``, ``node1;label`` and
      ``node1;description``, ordered by descending similarity. NOTE: the query
      hard-codes the label ``similarity``, so ``relation_label`` does not
      appear in the enriched output. NOTE(review): the match pattern requires
      both a label and a description edge, so presumably entries lacking
      either are dropped -- confirm against kypher semantics.

    Side effects of the enriched flavour: the environment variables
    ``_label_graph``, ``_description_graph`` and ``_temp_file`` are set in
    ``os.environ`` (and left set), and the function must run under IPython
    because it uses ``get_ipython()``.

    Args:
        vectors: object with a ``most_similar(positive=..., topn=...)`` method
            returning ``(key, similarity)`` pairs.
        positive: passed through unchanged to ``most_similar``.
        relation_label: ``label`` column value for the plain output.
        kg_path: directory holding the label and description files.
        add_label_description: set to false to force the plain output.
        output_path: when given, rows are written to this file (one per line)
            and nothing is returned.
        topn: passed through unchanged to ``most_similar``.

    Returns:
        A ``pandas.DataFrame`` of strings (first row used as column names)
        when ``output_path`` is not given, otherwise ``None``. An empty
        DataFrame is returned if the enriched command printed nothing.
    """
    # Query the model first: if this raises (e.g. unknown key) no temporary
    # file has been created yet, so nothing is leaked.
    neighbours = vectors.most_similar(positive=positive, topn=topn)

    if add_label_description and kg_path:
        # kypher reads its inputs from files, so stage the similarity edges on disk.
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=".tsv", delete=False, encoding="utf-8"
        ) as fp:
            fp.write("node1\tlabel\tnode2\n")
            for qnode, similarity in neighbours:
                fp.write("{}\t{}\t{}\n".format(qnode, relation_label, similarity))
            filename = fp.name

        try:
            # The command below refers to these through "$_..." shell variables.
            os.environ["_label_graph"] = kg_path + "/labels.en.tsv.gz"
            os.environ["_description_graph"] = kg_path + "/descriptions.en.tsv.gz"
            os.environ["_temp_file"] = filename

            command = " ".join(
                [
                    '$kypher_raw -i "$_label_graph" -i "$_description_graph" -i "$_temp_file" --as sim',
                    "--match 'sim: (n1)-[]->(similarity), label: (n1)-[]->(lab), description: (n1)-[]->(des)'",
                    "--return 'distinct n1 as node1, similarity as node2, \"similarity\" as label, "
                    "lab as `node1;label`, des as `node1;description`'",
                    "--order-by 'cast(similarity, float) desc'",
                ]
            )
            # `x = !cmd` is IPython shorthand for this call; spelling it out keeps
            # the function valid Python while the "$name" references are still
            # expanded by IPython / the shell as before.
            rows = list(get_ipython().getoutput(command))
        finally:
            # Always clean up the staged file, even if the query step raised.
            os.remove(filename)
    else:
        # Rows carry no trailing newline so that they split into clean cells
        # and are written single-spaced.
        rows = ["node1\tlabel\tnode2"]
        rows.extend(
            "{}\t{}\t{}".format(qnode, relation_label, similarity)
            for qnode, similarity in neighbours
        )

    if output_path:
        with open(output_path, "w", encoding="utf-8") as handle:
            for row in rows:
                handle.write(row + "\n")
        return None

    if not rows:
        # The external command produced no output at all (not even a header).
        return pd.DataFrame()

    columns = rows[0].split("\t")
    data = [row.split("\t") for row in rows[1:]]
    return pd.DataFrame(data, columns=columns)
"metadata": {}, "source": [ "Let's give it a try:" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0P15620.8587111234664917similarity'AllMovie title ID'@en'identifier for a work on the website AllMovie...
1P21740.8532211184501648similarity'Museum of Modern Art artist ID'@en'identifier assigned to an artist by the Museu...
2P49530.8437308073043823similarity'Library of Congress Genre/Form Terms ID'@en'ID in the Library of Congress controlled voca...
3Q23979920.8411049842834473similarity'malt liquor'@en'beer style'@en
4Q6945360.8376587629318237similarity'American whiskey'@en'Whiskey produced in the United States'@en
5Q49121820.8343247771263123similarity'Billy Beer'@en'beer produced in the United States'@en
6Q51493890.8277002573013306similarity'Colt 45'@en'malt liquor'@en
\n", "
" ], "text/plain": [ " node1 node2 label \\\n", "0 P1562 0.8587111234664917 similarity \n", "1 P2174 0.8532211184501648 similarity \n", "2 P4953 0.8437308073043823 similarity \n", "3 Q2397992 0.8411049842834473 similarity \n", "4 Q694536 0.8376587629318237 similarity \n", "5 Q4912182 0.8343247771263123 similarity \n", "6 Q5149389 0.8277002573013306 similarity \n", "\n", " node1;label \\\n", "0 'AllMovie title ID'@en \n", "1 'Museum of Modern Art artist ID'@en \n", "2 'Library of Congress Genre/Form Terms ID'@en \n", "3 'malt liquor'@en \n", "4 'American whiskey'@en \n", "5 'Billy Beer'@en \n", "6 'Colt 45'@en \n", "\n", " node1;description \n", "0 'identifier for a work on the website AllMovie... \n", "1 'identifier assigned to an artist by the Museu... \n", "2 'ID in the Library of Congress controlled voca... \n", "3 'beer style'@en \n", "4 'Whiskey produced in the United States'@en \n", "5 'beer produced in the United States'@en \n", "6 'malt liquor'@en " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q15874936 is Michelob\n", "kgtk_most_similar(ge_vectors, positive=['Q15874936'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Text embeddings" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\n", "P10-P1628-32b85d-7927ece6-0\tP10\tP1628\t\"http://www.w3.org/2006/vcard/ns#Video\"\n", "P10-P1628-acf60d-b8950832-0\tP10\tP1628\t\"https://schema.org/video\"\n", "P10-P1629-Q34508-bcc39400-0\tP10\tP1629\tQ34508\n", "P10-P1659-P1651-c4068028-0\tP10\tP1659\tP1651\n", "P10-P1659-P18-5e4b9c4f-0\tP10\tP1659\tP18\n", "P10-P1659-P4238-d21d1ac0-0\tP10\tP1659\tP4238\n", "P10-P1659-P51-86aca4c5-0\tP10\tP1659\tP51\n", "P10-P1855-Q7378-555592a4-0\tP10\tP1855\tQ7378\n", "P10-P2302-Q21502404-d012aef4-0\tP10\tP2302\tQ21502404\n", "zcat: error writing to output: 
Broken pipe\n" ] } ], "source": [ "!zcat < $OUT/all.tsv.gz | head" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `kgtk text-embedding` command computes sentence vectors for each Qnode in the knowledge graph. The input to this command is a sorted KGTK edge file.\n", "\n", "This is a two-step process:\n", "\n", "**Create a sentence for a Qnode using user specified properties**\n", "\n", "In the command below, we have specified the following options,\n", "\n", "- `--label-properties label` specifies that the property `label` has the label for the Qnode.\n", "- `--isa-properties P31 P279 P452 P106` specifies that `instance of` for the Qnode is defined by the properties `P31 P279 P452 P106`\n", "- `--description-properties description` specifies that the property `description` has the description for the Qnode.\n", "- `--property-value P186 P17 P127 P176 P169` tells the command to use property-label and values from the properties `P186 P17 P127 P176 P169` to add additional context to sentence for the Qnode\n", "\n", "Example sentence here\n", "\n", "**Compute sentence vector using the sentence created in the previous step**\n", "\n", "The command then computes a vector for the sentence using one of the models, specified as,\n", "- `--model bert-large-nli-cls-token`\n", "\n", "For more information on this command, please [click here](https://kgtk.readthedocs.io/en/latest/analysis/text_embedding/)\n" ] }, { "cell_type": "raw", "metadata": {}, "source": [ "!$kgtk text-embedding -i $OUT/all.tsv.gz \\\n", "--embedding-projector-metadata-path none \\\n", "--label-properties label \\\n", "--isa-properties P31 P279 P452 P106 \\\n", "--description-properties description \\\n", "--property-value P186 P17 P127 P176 P169 \\\n", "--has-properties \"\" \\\n", "-f kgtk_format \\\n", "--output-data-format kgtk_format \\\n", "--save-embedding-sentence \\\n", "--model bert-large-nli-cls-token \\\n", "-o \"$TE\" \\\n", "> \"$TE\"/text-embedding.tsv" ] }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "Duration --parallel 1\n", "15300.45 real 14970.96 user 261.58 sys" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The text embeddings are output in KGTK format and we need them in word2vec format (need to enhance the command to produce w2v format). For now, define a function to convert the KGTK embeddings to w2v format." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def convert_kgtk_to_w2v(input_path, output_path, text_embedding_label=\"text_embedding\"):\n", " \"\"\"\n", " Convert a KGTK file (node1/label/node2) that contains embeddings to the w2v format\n", " \"\"\"\n", " vector_count = 0\n", " vector_length = 0\n", " \n", " # Read the file once to count the lines as we need to put them at the top of the w2v file\n", " with open(input_path, \"r\") as kgtk_file:\n", " next(kgtk_file)\n", " for line in kgtk_file:\n", " items = line.split(\"\\t\")\n", " qnode = items[0]\n", " label = items[1]\n", " if label == text_embedding_label:\n", " if vector_count == 0:\n", " vector_length = len(items[2].split(\",\"))\n", " vector_count += 1\n", " kgtk_file.close()\n", "\n", " with open(output_path, \"w\") as w2v_file:\n", " w2v_file.write(\"{} {}\\n\".format(vector_count, vector_length))\n", " with open(input_path, \"r\") as kgtk_file:\n", " next(kgtk_file)\n", " for line in kgtk_file:\n", " items = line.split(\"\\t\")\n", " qnode = items[0]\n", " label = items[1]\n", " if label == text_embedding_label:\n", " vector = items[2].replace(\",\", \" \")\n", " w2v_file.write(qnode + \" \" + vector)\n", " kgtk_file.close()\n", " w2v_file.close()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "convert_kgtk_to_w2v(os.environ['TE'] + \"/text-embedding.tsv\", os.environ['TE'] + \"/embeddings.txt\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the output file, the embeddings have 1024 dimensions" ] }, { 
"cell_type": "code", "execution_count": 22, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Q99970346 -0.35467714 0.11079551 -0.011766396 -0.64368856 0.7587074 -0.029240295 -0.34339845 -0.06344555 -1.6708547 0.388923 0.016877629 -0.016170679 0.17811266 1.0552806 -0.10560113 0.5062175 -0.37100965 -0.43509555 -0.7369594 0.9275887 0.6351612 -0.026170328 -0.6812031 -0.49427545 0.15076277 0.497177 -0.5669475 0.33832487 0.38121685 -0.34155178 -0.03627377 0.019129895 0.32135636 -1.3127131 0.2910208 -0.6110071 -0.21233878 -0.26547825 -0.48265418 0.19074659 -0.221765 -0.6583791 0.26793227 0.106484234 -0.51117957 -0.9209578 -0.53469723 -0.8773248 -0.50579745 0.28408417 -0.33325395 0.9733218 -0.20266499 1.2573 -0.67561316 -0.42509234 0.93198144 0.104132675 -0.72978777 0.61797714 0.5810334 0.11720219 -0.5360808 -1.2015952 0.31788793 -1.6091578 0.29825193 0.25895777 0.34890306 -0.64605564 -0.8923556 -0.6606609 0.27037627 0.13712278 0.047953844 0.9390667 -1.0347372 -1.0345485 0.6995126 0.8249064 0.35724065 0.27384388 -0.73517066 -0.35368446 -1.0148574 1.8248662 -0.07850242 1.4354324 0.5072465 -0.55955255 0.017425217 -0.14720036 -0.6255747 0.373923 0.43977597 0.51224214 0.8616229 0.3625622 0.11529137 -0.088883646 0.10349084 -0.042071007 -0.43433812 0.7515323 -0.21659058 0.8457974 0.5962904 0.3937 -0.0016398979 0.92470247 0.49142352 0.23588753 0.46617544 -0.090497404 0.42285004 0.2591153 -0.5708972 -0.06680305 -0.5744467 0.25489685 0.16582312 -0.08952185 -0.7156431 0.101445876 0.24376185 0.31326348 -0.38523293 0.51284426 -0.22892652 -0.50336355 -0.6174894 0.05824384 1.2613654 -0.93477124 0.6635647 0.26004636 -1.5386794 0.9265457 -0.25920206 0.03260179 -0.32579717 1.3067154 0.2654486 0.23924337 0.4018793 -0.36037788 -0.9476973 0.43523684 0.32782614 0.260751 -0.6636006 -0.10977579 0.58945143 -0.19258516 -0.115052864 -1.1422795 0.07531869 0.03589527 0.05722124 0.46583077 -1.0261999 0.95946 0.20920916 0.43639985 -1.2304038 
0.825801 -0.832671 0.47357726 -1.4520742 0.8357687 0.28490296 -0.12491703 -1.3499368 -0.50329566 -0.11631545 -0.064047225 0.026493171 -0.5870671 0.9382678 0.67313313 -0.5828295 0.045074083 -0.09418253 0.6689779 -0.28241482 0.03886442 -0.0032574758 -0.25793675 -1.1627809 -0.14480855 0.18630067 0.6570043 0.16589165 0.78798306 0.90285397 0.050549317 -1.1613951 -0.4524023 -0.012111276 0.26893023 -0.09318724 -0.121896245 -0.70760417 0.014655143 -0.39249417 -0.18653299 -0.057589743 0.73021245 -0.056917287 0.061529838 -0.18999407 0.591215 0.37673393 -0.7288817 0.14709492 0.61839867 0.5513162 0.113904774 -0.33177403 0.019947015 0.4897275 -0.06977508 0.202893 0.45342565 0.21537605 -0.8204539 -1.1549423 0.7641429 -0.16736117 0.8525713 0.40918076 -0.21118641 -0.67439914 0.7393295 0.4042455 -0.27943966 0.4198727 0.57225925 -0.61915797 0.40726414 0.69386053 0.65220094 0.5105912 0.01765381 -0.053054214 0.074750856 1.0228095 0.6851847 0.22093888 1.0652184 0.5630041 1.2869685 0.5461491 1.4234964 -0.52450067 0.12566774 0.5065756 -0.75014585 -0.29888022 -0.9139215 -0.2644539 0.27851665 -0.1606616 -0.6418727 0.293762 1.340446 0.035954785 0.5977444 -0.56160736 -0.15174448 -0.32766417 -0.22468422 -0.7755798 -0.36355358 -0.99928457 -0.024928482 -0.0446065 -0.85236347 0.19005437 -0.2907799 0.25987476 0.9319708 -0.5595533 -0.31099772 -0.25691032 -0.6897068 0.49057207 0.14066562 0.39569032 -0.14378089 -0.5827391 -0.71003616 0.19975951 -0.09906154 -0.003160905 0.6877365 -0.18418106 -1.0390971 -0.5494717 -0.91432923 -0.12136756 0.5002397 0.08005056 -0.5283152 0.15948093 0.19238752 1.7344847 0.29803997 -0.22868207 1.1772594 0.6111734 -0.30045295 -0.45428497 0.91519463 -0.1811594 1.4227668 0.040085625 0.5106884 -0.64434844 -0.39575723 -0.07791009 -0.2930559 0.3870799 -0.28537685 0.46197322 0.02736687 1.4320319 -0.7305252 0.09027016 -0.84786594 0.43239793 -0.4064755 -0.26532996 -0.43119928 1.0265474 -0.66931665 -0.119039685 -0.32935402 0.06634794 -0.24515457 0.114274465 -0.40223724 0.2579076 
-0.7495797 0.23822233 -0.5646571 0.26864943 -0.0711706 -0.35083658 -0.64066315 0.8411836 -0.36869362 0.55718195 0.17373814 -0.12910448 0.46044844 -0.33677512 0.26716956 -0.48549932 -1.0007339 0.24382466 0.55290604 -0.79105616 -0.029821448 -0.5040755 -0.40044996 -0.5883269 0.8033437 -0.3494342 0.4421048 1.0331795 -1.0313576 0.2429274 1.2627083 0.5492977 0.1957146 0.47162542 0.47001508 0.45922494 -0.8857333 0.07482535 -1.1077776 0.7690897 0.1655323 0.43297362 1.1143758 -0.31681186 0.42818367 -0.43547997 -0.14353555 -0.20837003 -0.067752525 -0.11651066 -0.37613198 -0.7305545 -1.2049621 0.07177568 0.29901513 0.028892772 -0.17387113 0.3515589 -0.2865952 0.26598048 0.6771694 0.246601 0.32268977 -0.2874596 0.8416857 -0.025081936 -0.51662207 -0.63686085 0.5857954 -0.25940505 0.12459289 -0.18832876 0.19575728 -0.9697475 -0.46878934 0.36626998 -0.35241282 -0.00017579645 0.07506092 0.005144097 0.93964106 1.0299689 0.49887708 -0.056967147 -0.03968183 0.7087462 0.39698732 0.44939226 -0.8811666 0.11488059 -0.13584064 -0.8804485 0.113515675 0.3421558 -0.115478024 0.044879816 -1.3470753 0.62091535 -0.05516497 1.2806522 0.060555544 -0.122608416 -1.5639493 0.10986894 -0.03325996 -0.61645377 -0.16821454 0.5332024 -0.6843214 0.1628791 -1.7004448 0.3838255 -0.7029188 0.38359058 -0.30008504 0.5079135 -1.1355817 -0.6294814 0.23850769 0.06211726 -0.22339958 0.34131396 0.29199523 -0.7964859 0.12275832 1.8118118 0.3901814 0.879881 0.9749075 0.81535727 0.23458743 -0.22046795 -0.1524021 0.1943603 -0.5939263 0.11492263 -0.23360129 0.7694225 0.3069532 -0.77843344 -0.36025596 -0.6601966 -0.61338955 -0.23280501 -0.5462686 -0.368946 -0.16677427 -0.52118045 -0.13719644 0.5187063 -0.39023733 0.6228797 0.6897675 -0.48699498 -0.95369184 -0.16185372 0.77282125 1.6517043 0.27241987 -0.19327924 -0.43740112 -0.28737742 0.09998034 0.011649346 -1.0341774 -0.38491687 -0.013805954 -1.2744175 -0.24279886 0.53125 0.6749148 0.5203017 -0.02930976 0.68856525 -0.4490209 -0.43842795 0.23460348 0.73722 0.20317402 
0.023845531 -0.19268028 0.2968325 1.0458333 0.53261423 -0.32985854 -0.3038416 -0.72842646 0.52130556 -0.3161872 0.21191175 0.37296724 0.22269088 1.0166149 0.9571635 0.20030852 -0.85604113 -0.9525352 -0.4896618 0.27501035 0.30493072 -0.6004825 -0.13031858 -1.1572572 0.7571605 0.9155406 0.47556284 -0.16233926 0.06995824 0.028804637 1.0849707 -0.24620937 -0.08281832 -1.0116034 -0.5904093 0.72540987 0.10476458 -0.042286478 -1.009316 1.0367454 0.09546378 0.50889 -1.114861 -0.43371207 -0.94080406 0.3578365 0.48977828 -0.85720396 -0.90727335 -0.025597716 0.047332242 1.0459499 -0.1829941 -0.708357 0.3625568 -1.1661052 -1.1722999 0.8224497 0.83961487 0.04627128 -0.12110262 -0.063202746 1.1014042 -1.4770373 0.5173971 0.062148094 0.25656658 0.5633553 -0.29288998 0.72984785 0.34382886 -0.85856694 -0.029891498 0.40977788 -0.35668033 0.41098726 0.13697013 -0.17555092 -0.70149165 0.17294446 0.24821775 0.12235102 0.35104817 -1.3928343 -0.07675816 -0.3378167 -0.13961035 0.18993661 0.07654028 1.077572 0.19448721 0.60723126 -0.06350592 0.20158859 -0.06847325 -0.027891435 0.26736915 -0.7713527 -0.73258245 0.12772211 0.92170155 -0.41381544 -0.49357504 -0.39128733 -0.4044065 0.37966305 0.00044934452 0.32874373 0.28709596 -0.18956214 -0.039975997 0.8549699 -0.65253615 0.52355164 -0.96565104 -0.5743395 -0.3722807 -0.03150108 -0.014911145 -0.014916707 -0.004148706 -1.1541384 0.051338993 0.21820074 0.3031392 -0.112071134 -0.5963139 -0.057486735 -0.14307706 -0.21997774 0.4019476 -0.6419141 -0.4621579 -0.291508 -0.13578686 -0.07723715 -0.38081947 0.3856889 0.39555526 -0.31925443 0.033807147 0.10773823 1.0410087 0.31106454 0.12562117 -0.87300754 0.6254048 -0.20732975 0.1500875 -0.03283576 0.6718793 0.6746984 0.52563226 -0.43712768 0.44816944 -0.42418972 -0.382168 0.18149278 0.48498 -0.3632121 -0.19312719 0.005126953 -0.5656753 -0.17610222 0.1667601 0.75107723 -0.33547103 -0.33430102 0.5248947 -0.21227825 0.68880475 -0.6301187 -0.5990129 0.059352748 -0.21026854 -0.018346198 0.76709867 
-1.3209629 0.40175483 0.22768416 1.227927 -0.35408404 -1.1328604 -0.6367485 -0.028042179 -0.17283845 0.1635134 -0.78239954 -0.14784035 -1.4512389 -0.6065351 -0.7301376 -0.0072628697 -0.7920932 0.44586283 -0.5464835 0.21501592 0.78995425 -1.0050224 -0.24485062 -0.50719905 0.57039815 0.26384816 0.46475396 -0.6080839 -0.13338824 -0.28206706 1.1951898 0.08536555 -0.10641494 1.1923056 0.26987743 -0.74653 0.5023109 -0.8631742 -0.7514744 -0.12686428 0.6020727 0.12219554 0.51043755 -0.1249993 -0.21260184 -0.5661095 0.43877488 -0.2720655 -0.93396187 -0.2125046 0.72556496 -0.20156825 -0.76425546 0.82892424 0.28360695 -0.6938852 -0.069996946 0.09017961 -0.14170301 0.68486434 -0.09074256 0.17125079 -0.028517568 -0.22832774 0.116087824 0.38604257 -0.14893727 0.2926042 -0.15221691 1.1912638 0.5653702 0.13911745 -0.35186836 -1.4950596 -0.34486777 0.06479093 0.18639444 -0.41809943 -0.4803713 0.8952167 0.63098603 0.110526174 -0.65218186 0.2683524 -0.04082667 0.0009959638 -0.05641049 -0.26886174 0.6195836 -0.5730368 -0.8562733 0.7221058 0.035127018 -0.7037818 0.42749864 -0.9320105 0.07655903 -0.060782455 0.1355398 1.1674553 0.31580466 0.08312011 -1.1988534 0.29779607 0.18888266 0.4020975 0.33224887 0.39917713 0.18641758 -0.7545361 0.5648389 -0.8855813 -0.122576885 -0.12710793 0.43654162 -0.9841464 -0.1664098 1.026072 -0.6160894 0.49772498 -1.367627 -0.33966035 0.7192099 0.7324114 -0.043000467 0.26478118 0.7066702 -1.2967288 0.33390504 -0.07434683 -0.42501277 0.46652254 0.38408503 -0.038329862 0.47667205 0.082187615 -0.5122328 -0.7771949 -0.41476575 0.85382915 0.091495916 0.5879631 0.25733235 -0.67873985 0.20353866 -0.2705537 1.5873749 0.19311693 0.4450916 0.79790246 -0.29111135 -0.48367912 0.5157689 -0.89099675 0.7663905 -0.3556041 -0.34996372 -0.3997853 0.37713566 -0.073274195 -0.84514517 0.23205864 0.6346053 0.74731314 -0.39925528 -0.19115935 0.27177122 -0.5294343 -0.92222357 -0.017529584 -0.1194042 0.1580939 -0.046560064 -0.036918297 0.25173286 -0.3944298 0.017304864 -1.1468699 
1.087832 0.19128375 0.45719427 -0.14804977 0.03239102 0.27811128 1.0092936 -0.49751043 1.484579 -0.014987655 -1.0717179 -0.7890196 -0.39332515 -0.21557498 -0.21468568 0.8323991 -1.5082201 -0.057340883 -0.16075163 -0.23897143 0.8439349 -0.12952612 -0.3433772 0.63587546 0.9191563 -0.62856174 0.31177416 0.13643597 -0.6017147 -0.21591565 -0.031990267 0.12627059 0.34191674 0.28099748 -0.7526798 0.19894414 -0.8542069 -1.4542131 0.09286879 1.183819 0.84287465 0.08632176 -0.20577973 -0.2386751 -0.32922816 0.009569578 0.14021131 -0.47799504 -0.21536234 0.04925037 -0.13251936 0.5368209 3.7339559 1.5341322 0.3984081 0.077320784 0.66829884 -0.8190872 0.08726112 0.13590202 0.5784014 -0.3144412 -0.5550817 -0.05953791 -0.40570447 0.9007878 -0.4748817 0.984808 0.11988328 -0.29859447 -0.2768045 -0.30864185 -0.47983015 -0.51569194 0.005513899 -0.20185643 0.048523355 0.8853486 1.1283015 -0.8904536 0.0458796 0.39718208 -0.6860528 -0.69547224 0.38343087 0.17979084 -0.21930328 -0.28777593 -1.4000727 -0.6071461 0.35955504 0.6433447 0.0561914 -0.056774594 0.593688 -0.4545405 0.14900804 0.9974072 -1.1881998 0.06686016 -1.4754548 -0.15490429 0.17002696 -0.46552587 1.8406682 -0.1482638 -1.5476801 0.13923317 -0.16741851 0.82582164 0.43188173 -0.59867716 0.7488982 -0.40870872 0.5018854 0.1508119 0.04826939 0.80872476 0.23141569 0.24225771 -0.18301965 0.19217736 0.03458853 -0.13475391 -0.7607878 -0.25117528 0.6397927 -0.6261907 -0.37012914 -0.06436638 0.07190125 0.7561457 -0.14116026 0.66453 -0.42234984 -0.7382341 0.221739 -1.055631 0.68397975 0.380537 0.3184069 0.15672544 -0.2240313 -0.5840155 0.43963403 -0.84613836 0.06643207\n", "Q999589 -0.41698563 -0.2605745 1.261094 -0.25196022 0.6357863 -0.12980977 -1.3131291 -0.27607337 -0.030630654 0.2298155 0.03560184 -0.0151102785 0.13840826 0.19822454 -0.6997466 0.6391146 -0.61767966 0.5633287 -2.4568346 0.13103603 1.001618 0.2861246 -1.6635605 0.7095289 -1.2499253 -0.1600786 0.5834731 0.30843192 0.7016363 -0.31420374 -0.8182219 -0.3138459 
0.12530687 -1.5500764 -0.025585257 -0.37329176 0.37417623 0.07642557 0.87781245 0.007800998 0.48021585 -0.33143744 0.010277603 -1.3848835 -0.42256516 -0.35301822 -0.26358312 0.11599368 -0.39688414 0.31440046 -0.43104148 -0.18601416 1.7847534 -0.6254755 0.15426129 -1.1430418 1.0998188 0.30615732 0.0684679 0.11616209 0.2664656 0.28029025 -0.20587167 -0.271835 1.3752989 -0.19037864 0.5430404 -0.043550592 1.343111 0.3730667 -1.199109 -0.30459532 -1.1949073 -0.18264028 -0.102268554 1.4176567 -0.75255847 0.32684445 1.9687995 0.13361722 0.3265424 0.020139761 0.31197467 0.10232462 -0.52171266 -0.1727716 0.11098407 1.3790847 0.0703453 -0.5760567 -0.60100526 0.1527135 0.17754105 1.1128829 0.09163578 0.7017059 0.007054135 0.04831355 0.41104016 -0.2371509 0.68035513 0.17106953 0.614042 0.1731288 0.2263185 0.03237082 0.011527993 0.22800937 0.22195967 0.24928889 0.03219877 0.42709348 0.99474907 -0.13638565 -0.15502754 -0.6848385 0.645317 0.9056839 -0.28106686 -0.0010437481 0.38340276 0.6031405 -0.99764866 0.21984494 0.546585 -0.7416573 -0.6880756 -0.06229668 0.33855593 0.45069554 -0.38447076 -0.387164 -0.27881858 -0.13945657 1.359238 -0.8260776 0.5660258 0.45407772 0.38191643 -0.09912316 0.682855 1.2686133 -0.60528994 1.2190057 -0.042755388 -0.26215506 -0.82104933 0.11482251 -0.010067672 0.3098003 -0.3515178 0.17430891 1.0119802 0.2688646 -1.0365931 -0.095464945 0.14077309 0.5187938 0.40057912 -0.3672199 1.0256016 0.015344951 -0.26640347 0.69609016 0.5526757 1.0468558 0.32396498 0.40881705 -0.26877618 0.4303556 0.75634325 0.743405 -1.2682025 -1.3122654 0.92049915 0.45285892 0.3164993 -1.4096217 0.03324192 -0.2674265 0.10269022 -0.098262906 -0.12280309 0.16387922 0.24468005 -0.15677552 0.7871597 -0.111971036 -0.6698992 -0.9810209 0.6518137 0.74468404 0.016401898 0.4759879 0.013288423 0.2067094 -1.4366583 0.55483717 -0.40957946 -0.42354318 -0.40959924 -0.25751194 -0.9830377 -0.13652878 -0.41190037 0.43262562 0.2739365 0.20523581 0.8496721 -0.89802873 -0.10527132 -0.306174 
-0.6109855 0.07773691 1.4814208 -0.029923089 -0.46526563 0.15345837 -0.093968526 -0.073855706 1.3552531 0.238116 -0.6019919 0.0020439029 -0.45820257 0.21726021 0.08705235 -0.33724084 0.092232406 0.5697221 -0.5265089 0.08156489 0.15006593 -0.2721886 0.18359837 -0.37652194 1.0733501 -0.038011733 0.059356663 -0.89981794 0.54419273 -0.03247507 0.287037 1.4570675 -1.0782447 0.207394 0.13968131 -1.2044117 -0.7400688 -0.4489007 0.548577 0.6866251 0.10242751 0.67609817 -1.3158342 0.42081404 0.20943995 0.1672046 -0.6490648 -0.29139635 0.76003015 0.43652013 -0.15128106 -0.074735746 -0.98622674 0.51391816 -0.4021305 -0.78757966 0.14627434 -0.29684162 0.8969413 -0.0866652 0.33217466 0.13669558 -1.1559589 -0.25258923 0.27923164 -0.9509701 0.8675276 0.23652638 -0.33608523 -1.0413818 -0.51134753 -1.2531937 0.44972977 -0.27298364 0.29805556 -0.9436268 -0.5914968 0.191164 -0.06063785 -0.39874113 1.3315579 -0.17834479 0.52915967 0.770659 -0.25222254 -1.6531135 -0.19618742 -1.0937765 0.31875083 -0.12933807 -1.5778866 -0.047443684 0.34767342 0.6318594 -0.83044606 0.0062083825 0.7510575 0.5293969 1.2013922 -0.3337796 0.27377933 0.9533454 0.22017518 0.80124545 -0.5644125 -0.3618762 0.30592757 -0.1519308 -0.62098455 -0.67388976 0.26451135 -0.24778473 -0.21714815 0.7896843 0.65770847 -1.2898101 -0.87923384 -0.8898595 -0.057043985 -1.2496232 -0.7280505 0.6009634 0.15574685 0.44182482 -0.63706523 0.3360257 -0.13747045 -0.39582503 -0.45877388 -0.7018568 0.7483895 -0.43582135 0.15659747 -0.5828442 -0.08580815 -0.042747393 0.075657584 -0.18708709 0.1184666 -0.36333722 0.69946474 -0.5580938 0.24424697 0.5032342 0.15656532 -0.10865326 -1.0698255 0.09046641 0.45365018 -0.93023837 -1.3125207 0.087202586 0.030075599 -0.6975131 0.6074308 0.5342773 -0.010875996 0.23830655 0.2554622 -0.4284955 -0.5154948 0.4812058 0.24379727 -0.050438605 0.04820054 0.4100963 0.09930288 -1.0233239 -0.7755961 -0.21586809 0.58354414 0.5838743 1.6023781 0.3121333 -0.39950955 0.51121426 0.4400898 -0.1930556 -0.02830303 
-1.0539236 0.10421469 -0.11716435 0.68616235 -0.9526248 1.386567 0.75789803 -0.6561499 -0.06340384 0.57590127 -0.5356226 0.67033577 0.4506205 -0.6942375 0.5449279 0.4903377 -0.24331416 0.2627641 0.26174444 0.033754043 0.22650683 -0.37884778 0.390966 -0.14807609 -0.1667826 0.8005286 -0.23866098 -0.52807444 0.34964865 0.14036651 0.28961158 -0.71599466 -0.67762494 1.4779916 -0.15778647 -0.23769698 -0.83473504 -0.22567499 0.3421764 -0.36925375 -0.35395026 -0.53719825 -0.07815719 0.8370267 -0.09296241 -0.7574033 0.076572746 0.10797594 0.12574697 0.64935195 -0.44840613 1.423811 0.39531618 -0.5382008 -0.9118438 0.4145623 1.0270466 0.22826368 -0.19688715 -0.35836944 -0.2801357 -0.11060271 -1.4619604 0.17182046 0.076499164 -0.26454467 -1.0869915 -0.031303167 -0.6613916 -0.19085547 -0.16120988 0.07090384 -0.17275074 0.7086498 0.7231457 -0.029719211 -0.3417165 0.1906393 -0.14756899 -0.38056695 1.0101329 0.8707222 0.21334837 0.25389972 -0.09995532 0.3894583 -0.029303223 0.09371488 0.97148126 0.9204465 -0.3341841 -0.9417707 -0.8264911 0.061182633 -1.3628155 -0.67585206 0.21165513 -0.41981477 0.4947688 -0.03987349 -0.41563147 0.034878872 -0.28041035 0.15947667 -1.1899844 0.36154824 -0.27621484 0.6036144 -0.31510454 1.9214201 0.28618437 -0.3689279 0.19574535 -0.1236457 0.52991843 -0.072093144 0.17972252 0.07804445 -0.13850422 -1.4454157 -1.1820068 0.5047012 0.56526536 -0.47382775 0.5704861 0.6502218 0.17637135 0.28209513 0.65785563 -0.7302156 -0.012197202 -0.0569197 -0.25109497 0.18622732 0.3026019 0.19212699 0.38661027 0.35030496 -1.0332546 0.5083536 0.03359413 0.3578357 -0.019704364 0.27707183 1.3152499 1.0202601 0.5519666 0.18243107 0.030475384 -1.030333 1.7312037 0.21947306 -0.40062207 -0.10302982 -0.6067782 -0.2145968 1.4325343 -0.29332092 -0.9595728 -0.20523323 -0.17057651 -0.1307008 -0.48342174 0.68958807 -0.98318917 -0.37670705 0.5289213 0.44339448 0.44847965 -0.48331672 0.28702766 -0.27533153 0.96591634 -0.20022447 0.70457 -0.8490064 -1.0372587 0.6060411 -0.05763147 
-0.28221917 -0.15892443 1.1027827 0.7952349 0.07565483 -1.4554397 -0.88224596 -2.297563 -1.3204896 0.02908811 0.93184036 -0.29703236 0.97144514 -0.12628147 0.39630955 -0.6935558 1.0035774 -0.13965763 1.4473517 0.2211015 0.58594626 0.43861282 0.15987775 -1.0140587 0.05677458 0.41746974 -0.23637615 0.68535537 -0.28195035 -0.9748716 -0.8247128 1.2530512 -0.18517554 0.30638143 0.5681894 0.9780666 -0.19789681 1.0050718 0.017874084 -0.14187461 -0.6411836 0.16466254 -0.28816858 -0.1583597 0.79036397 -0.8893025 0.47741193 0.41512346 -0.2858786 -0.46181 -1.3286151 -0.38976568 0.94661814 -0.62320566 -1.3708043 -0.9427113 0.0066237785 -0.12635888 0.782666 0.04062727 -0.09126501 -0.33981606 -1.1346875 -0.05795628 0.2644347 1.0267086 0.015101653 -0.5392392 0.6434076 0.66430223 0.59949857 -0.319173 0.24504638 0.42740622 -0.91049457 -0.7575128 1.0359398 -0.13034567 -1.5267084 -0.120900705 -0.33140308 0.96910644 0.3059939 0.0655815 0.03502798 -0.31847963 -0.33234972 -1.3825029 0.40028015 0.8172537 1.1108284 -0.19258869 -0.29479375 -0.0322267 0.74054444 1.1042593 0.58904564 -0.67041147 -0.35928404 0.21565364 -0.8806634 -1.457075 0.5629418 -0.099092446 -0.62440777 -0.43788928 -0.37303922 -0.43952107 -0.049570248 -0.55202514 1.5406803 -0.106110916 -0.04853502 -0.7996796 -0.81935054 -1.0260385 1.051135 -0.5538655 -0.53924483 -0.7509523 0.009775151 0.60391265 -0.9702021 -0.4909074 -0.07958774 0.36253053 -0.40265453 -0.011047199 0.35715663 0.060167227 0.7244648 1.0168805 0.3325546 -0.61232007 0.6783144 -0.46068156 0.14938346 -0.15392706 0.78952366 0.27257147 -0.6049425 -0.81493837 -0.3794738 -0.8619466 0.04806183 0.5889089 1.179352 -0.046791583 -0.26346204 0.23862879 0.50483745 -0.37881467 -0.66196203 0.523526 0.38778275 -1.0799638 -0.41917711 -0.24591796 -0.07981938 0.52405345 -0.11932725 0.44449833 0.9566707 -0.9535339 0.34995535 0.12946798 0.17802824 -0.4598804 1.511867 -0.11805716 0.49061882 0.113263085 -0.71893364 -0.69046026 -0.69528466 -0.61103463 -0.135854 -0.5982807 0.018323097 
0.720744 -0.3624173 -0.9076615 -0.50287706 0.11864436 -0.6771003 -0.22216398 0.3744266 -0.4446326 0.39284855 0.50097376 1.1283903 -0.81581765 0.4655922 0.15276735 0.80468076 -0.02826795 -0.2566071 -0.3298645 -0.46306998 0.71219593 0.79656136 0.64290327 0.15543349 -0.03186866 0.07763438 0.0829405 -0.8054604 -0.4245565 0.17173868 -0.20268935 -0.15133792 -0.12740883 1.1096537 0.47658968 -0.12754257 -0.71402395 0.7334394 0.8155744 -0.385297 0.22259201 -0.54820484 -0.030618858 1.0036105 -0.09894908 1.1648865 -0.35951397 -0.8365854 0.31833863 0.40383413 0.98814774 -0.17051646 -0.87170863 0.34654683 0.706506 0.11565839 0.70962155 -0.70647377 -0.6992337 -1.0092366 0.8193226 -0.29378715 -0.51187235 0.811944 -0.22580485 -0.28911924 0.491499 1.9577305 -0.87569654 0.1646741 -0.67404634 0.5095451 0.098696835 -0.27252513 -0.21495613 0.33412436 0.115953535 -0.89628863 -0.4935908 0.14461051 -0.84843546 -0.04892442 -0.12355792 -0.33159724 0.64501196 -0.017041892 -0.47555473 -0.17257753 1.1103094 0.061582368 0.5995048 0.34327477 0.68911874 0.033073187 -1.1746008 0.39643693 -0.2502529 -0.41312277 0.47296166 1.6480538 -0.2783256 -0.2552749 -0.38626102 0.18488327 0.40171528 1.0417678 1.2538892 -0.42058924 1.8928488 -0.9166261 -0.6101932 -0.28206688 0.21142864 -0.051227577 -0.27988762 -1.0775927 -0.3473742 0.6672785 -1.0068966 -0.33190855 1.0365989 -0.22819075 0.33808616 -0.21535753 0.6696086 -0.21825136 -0.42367753 -0.349381 0.84401655 0.9997127 0.35772163 0.66924626 -0.1364087 -1.674747 0.51613003 -0.64011896 0.69488144 0.0076886043 -0.6254111 -0.15841739 -0.9020057 -0.13470927 0.095925 0.27492046 -0.024705688 1.5063354 0.15323412 -1.0012165 0.5589497 -0.8342183 -0.54652524 0.66643244 1.9349585 0.51762956 0.33203378 -0.03700129 0.6453397 -1.3080801 -0.016910164 1.2038727 -0.11839678 0.4784217 -0.22823288 1.5318079 -0.45744428 -0.93827844 0.2676463 1.3503238 0.6048509 0.6619368 -0.70087117 -0.27358228 -0.9880295 -0.3525182 -0.8360003 -0.7482328 -1.2298892 -0.15393887 1.1013402 
-1.0741335 2.7568822 0.41888562 0.2725274 -0.17313671 0.16608371 1.325989 0.0008632615 -0.72099566 -0.3346895 -0.8546148 0.09621067 -0.2968499 0.3053338 0.4646508 -0.46063763 0.57342106 0.56686497 -0.5858836 -1.8883766 -0.6319395 -1.2832683 -0.040842086 -0.049737804 0.18868297 -1.090915 0.82123494 0.589438 -0.07830721 0.2104314 -0.23387216 -0.08828615 0.23873965 0.9011001 -0.78433543 -0.21087992 -1.1274152 -1.4349767 -0.60250443 -0.047281183 -0.17517784 -0.11133793 0.090559795 0.34754017 -0.40512496 0.9051864 0.59156626 -1.0329832 -0.4962139 -0.04371301 0.01919134 0.13875732 -0.5760784 -0.047370538 -0.5824544 -2.2012436 0.38905528 0.37420228 0.3194325 0.8687238 0.6809676 0.96366096 -0.21066591 -0.13206379 -0.26229513 -0.4330789 0.024584323 0.3245967 0.047159337 -1.4672464 -0.36889616 0.3164833 0.32929796 0.13774689 -0.7509818 0.64809453 0.7814716 -0.8719781 -0.24444231 0.2373841 0.3296265 0.7604135 0.58796394 0.11584665 -0.1160713 -0.34247366 0.17066994 -1.0890346 0.8064421 -0.33539313 0.25311318 -0.8305794 0.11515608 -0.08919194 -0.25232935 0.041619558\n" ] } ], "source": [ "!head -10 \"$TE\"/embeddings.txt | tail -2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the text embeddings in gensim" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "te_path = os.environ['TE'] + \"/embeddings.txt\"\n", "te_vectors = KeyedVectors.load_word2vec_format(te_path, binary=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compare the graph and text embeddings" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most similar nodes to Johnnie Walker using the **graph embeddings**" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q6287370.8007535338401794similarity'Campbeltown Single Malts'@en'single malt Scotch whiskies distilled in the ...
1Q48653710.7964057326316833similarity'Bartlet for America'@en'episode of The West Wing (S3 E9)'@en
2P45530.7496455907821655similarity'RA Collections ID'@en'identifier for an entry in the collections da...
3Q2091350.7390897274017334similarity'East Ayrshire'@en'council area of Scotland'@en
4P14300.734946608543396similarity'OpenPlaques subject ID'@en'identifier for a person or other subject in t...
5P53610.7194418907165527similarity'BNB person ID'@en'identifier of a person in the British Nationa...
6Q9828910.7178372144699097similarity'The Macallan distillery'@en'single malt whisky distillery'@en
7Q9555890.7153750658035278similarity'Coalville'@en'town in Leicestershire, England'@en
8Q1871550.7090829014778137similarity'Tanqueray'@en'trademark'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q628737 0.8007535338401794 similarity 'Campbeltown Single Malts'@en \n", "1 Q4865371 0.7964057326316833 similarity 'Bartlet for America'@en \n", "2 P4553 0.7496455907821655 similarity 'RA Collections ID'@en \n", "3 Q209135 0.7390897274017334 similarity 'East Ayrshire'@en \n", "4 P1430 0.734946608543396 similarity 'OpenPlaques subject ID'@en \n", "5 P5361 0.7194418907165527 similarity 'BNB person ID'@en \n", "6 Q982891 0.7178372144699097 similarity 'The Macallan distillery'@en \n", "7 Q955589 0.7153750658035278 similarity 'Coalville'@en \n", "8 Q187155 0.7090829014778137 similarity 'Tanqueray'@en \n", "\n", " node1;description \n", "0 'single malt Scotch whiskies distilled in the ... \n", "1 'episode of The West Wing (S3 E9)'@en \n", "2 'identifier for an entry in the collections da... \n", "3 'council area of Scotland'@en \n", "4 'identifier for a person or other subject in t... \n", "5 'identifier of a person in the British Nationa... \n", "6 'single malt whisky distillery'@en \n", "7 'town in Leicestershire, England'@en \n", "8 'trademark'@en " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q502268 is Johnnie Walker\n", "kgtk_most_similar(ge_vectors, positive=['Q502268'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most similar nodes to Johnnie Walker using the **text embeddings**" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q2800.9379171133041382similarity'Lagavulin Distillery'@en'Scotch whisky distillery in Lagavulin, Islay,...
1Q24900310.9346836805343628similarity'William Grant & Sons'@en'Scottish company which distills Scotch whisky...
2Q15436460.9012988805770874similarity'Rob Roy'@en'cocktail based on Scotch whisky'@en
3Q3829470.8983699083328247similarity'Scotch whisky'@en'malt or grain whisky (or a blend of the two),...
4Q21685230.8907997012138367similarity'The Famous Grouse'@en'brand of Scotch whisky'@en
5Q10695020.8856704235076904similarity'Chivas Regal'@en'Blended Scotch Whisky produced by Chivas Brot...
6Q67446420.8838940858840942similarity'malt whisky'@en'Distilled spirit from Scotland (a/k/a \\\\\\\\\"Sc...
7Q48218380.8762272596359253similarity'Aultmore distillery'@en'whisky distillery in Moray, Scotland, UK'@en
8Q17549780.8664095401763916similarity'Rusty Nail'@en'cocktail mixing Drambuie and Scotch whisky'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q280 0.9379171133041382 similarity 'Lagavulin Distillery'@en \n", "1 Q2490031 0.9346836805343628 similarity 'William Grant & Sons'@en \n", "2 Q1543646 0.9012988805770874 similarity 'Rob Roy'@en \n", "3 Q382947 0.8983699083328247 similarity 'Scotch whisky'@en \n", "4 Q2168523 0.8907997012138367 similarity 'The Famous Grouse'@en \n", "5 Q1069502 0.8856704235076904 similarity 'Chivas Regal'@en \n", "6 Q6744642 0.8838940858840942 similarity 'malt whisky'@en \n", "7 Q4821838 0.8762272596359253 similarity 'Aultmore distillery'@en \n", "8 Q1754978 0.8664095401763916 similarity 'Rusty Nail'@en \n", "\n", " node1;description \n", "0 'Scotch whisky distillery in Lagavulin, Islay,... \n", "1 'Scottish company which distills Scotch whisky... \n", "2 'cocktail based on Scotch whisky'@en \n", "3 'malt or grain whisky (or a blend of the two),... \n", "4 'brand of Scotch whisky'@en \n", "5 'Blended Scotch Whisky produced by Chivas Brot... \n", "6 'Distilled spirit from Scotland (a/k/a \\\\\\\\\"Sc... \n", "7 'whisky distillery in Moray, Scotland, UK'@en \n", "8 'cocktail mixing Drambuie and Scotch whisky'@en " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q502268 is Johnnie Walker\n", "kgtk_most_similar(te_vectors, positive=['Q502268'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The graph embeddings produce poor results as the top matches are not related to whiskey. The text embeddings look much better." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most similar nodes to Michelob using the **graph embeddings**" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0P15620.8587111234664917similarity'AllMovie title ID'@en'identifier for a work on the website AllMovie...
1P21740.8532211184501648similarity'Museum of Modern Art artist ID'@en'identifier assigned to an artist by the Museu...
2P49530.8437308073043823similarity'Library of Congress Genre/Form Terms ID'@en'ID in the Library of Congress controlled voca...
3Q23979920.8411049842834473similarity'malt liquor'@en'beer style'@en
4Q6945360.8376587629318237similarity'American whiskey'@en'Whiskey produced in the United States'@en
5Q49121820.8343247771263123similarity'Billy Beer'@en'beer produced in the United States'@en
6Q51493890.8277002573013306similarity'Colt 45'@en'malt liquor'@en
\n", "
" ], "text/plain": [ " node1 node2 label \\\n", "0 P1562 0.8587111234664917 similarity \n", "1 P2174 0.8532211184501648 similarity \n", "2 P4953 0.8437308073043823 similarity \n", "3 Q2397992 0.8411049842834473 similarity \n", "4 Q694536 0.8376587629318237 similarity \n", "5 Q4912182 0.8343247771263123 similarity \n", "6 Q5149389 0.8277002573013306 similarity \n", "\n", " node1;label \\\n", "0 'AllMovie title ID'@en \n", "1 'Museum of Modern Art artist ID'@en \n", "2 'Library of Congress Genre/Form Terms ID'@en \n", "3 'malt liquor'@en \n", "4 'American whiskey'@en \n", "5 'Billy Beer'@en \n", "6 'Colt 45'@en \n", "\n", " node1;description \n", "0 'identifier for a work on the website AllMovie... \n", "1 'identifier assigned to an artist by the Museu... \n", "2 'ID in the Library of Congress controlled voca... \n", "3 'beer style'@en \n", "4 'Whiskey produced in the United States'@en \n", "5 'beer produced in the United States'@en \n", "6 'malt liquor'@en " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q15874936 is Michelob\n", "kgtk_most_similar(ge_vectors, positive=['Q15874936'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most similar nodes to Michelob using the **text embeddings**" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q20114730.9664472341537476similarity'Fantôme'@en'brand of beer'@en
1Q33155750.9586231708526611similarity'Bersalis'@en'beer brand'@en
2Q35185540.9563601016998291similarity'Floris'@en'beer brand'@en
3Q150760690.9531255960464478similarity'Marckloff'@en'beer brand'@en
4Q12773880.951164722442627similarity'Pripps Blå'@en'beer brand'@en
5Q19172550.9475076794624329similarity'St-Idesbald'@en'beer'@en
6Q2639800.9443504810333252similarity'Soproni'@en'beer mark'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q2011473 0.9664472341537476 similarity 'Fantôme'@en \n", "1 Q3315575 0.9586231708526611 similarity 'Bersalis'@en \n", "2 Q3518554 0.9563601016998291 similarity 'Floris'@en \n", "3 Q15076069 0.9531255960464478 similarity 'Marckloff'@en \n", "4 Q1277388 0.951164722442627 similarity 'Pripps Blå'@en \n", "5 Q1917255 0.9475076794624329 similarity 'St-Idesbald'@en \n", "6 Q263980 0.9443504810333252 similarity 'Soproni'@en \n", "\n", " node1;description \n", "0 'brand of beer'@en \n", "1 'beer brand'@en \n", "2 'beer brand'@en \n", "3 'beer brand'@en \n", "4 'beer brand'@en \n", "5 'beer'@en \n", "6 'beer mark'@en " ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q15874936 is Michelob\n", "kgtk_most_similar(te_vectors, positive=['Q15874936'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The graph embeddings contain some bad results, but the top matches are better as they include beers that are more closely related to Michelob. The text embeddings are reasonable as they include only beers." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most similar nodes to vodka using the **graph embeddings**" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q74680320.8597341775894165similarity'Vodka'@en'Detective Conan character'@en
1Q205776880.85808265209198similarity'.vodka'@en'top-level Internet domain'@en
2Q211897250.8241639733314514similarity'Red Eye Louie\\\\\\\\'s Vodquila'@en'blend of vodka and tequila'@en
3Q86928880.8167219161987305similarity'Category:Russian vodkas'@en'Wikimedia category'@en
4Q71518010.8038827180862427similarity'Category:Vodkas'@en'Wikimedia category'@en
5Q113280650.7957900762557983similarity'Balalaika'@en'Japanese short drink, cocktail'@en
6Q22065880.7952877283096313similarity'Caipiroska'@en'cocktail prepared with vodka'@en
7Q51344110.7506065368652344similarity'Clique'@en'Latvian vodka brand'@en
\n", "
" ], "text/plain": [ " node1 node2 label \\\n", "0 Q7468032 0.8597341775894165 similarity \n", "1 Q20577688 0.85808265209198 similarity \n", "2 Q21189725 0.8241639733314514 similarity \n", "3 Q8692888 0.8167219161987305 similarity \n", "4 Q7151801 0.8038827180862427 similarity \n", "5 Q11328065 0.7957900762557983 similarity \n", "6 Q2206588 0.7952877283096313 similarity \n", "7 Q5134411 0.7506065368652344 similarity \n", "\n", " node1;label node1;description \n", "0 'Vodka'@en 'Detective Conan character'@en \n", "1 '.vodka'@en 'top-level Internet domain'@en \n", "2 'Red Eye Louie\\\\\\\\'s Vodquila'@en 'blend of vodka and tequila'@en \n", "3 'Category:Russian vodkas'@en 'Wikimedia category'@en \n", "4 'Category:Vodkas'@en 'Wikimedia category'@en \n", "5 'Balalaika'@en 'Japanese short drink, cocktail'@en \n", "6 'Caipiroska'@en 'cocktail prepared with vodka'@en \n", "7 'Clique'@en 'Latvian vodka brand'@en " ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q374 is vodka\n", "kgtk_most_similar(ge_vectors, positive=['Q374'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Most similar nodes to vodka using the **text embeddings**" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q48692830.9598516821861267similarity'Batini'@en'vodka-based cocktail'@en
1Q35620460.9595369696617126similarity'Vodka Stinger'@en'type of cocktail'@en
2Q22065880.9436805248260498similarity'Caipiroska'@en'cocktail prepared with vodka'@en
3Q222362380.9384632110595703similarity'Mariette'@en'vodka, alcohol'@en
4Q79393170.9203516244888306similarity'Vodka Cruiser'@en'brand of vodka-based alcoholic drink'@en
5Q118025650.9155371785163879similarity'Pan Tadeusz'@en'brand of vodka'@en
6Q2680570.9129105806350708similarity'cosmopolitan'@en'cocktail made with vodka'@en
7Q47826170.9107506275177002similarity'Aqua Velva'@en'vodka and gin based cocktail'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q4869283 0.9598516821861267 similarity 'Batini'@en \n", "1 Q3562046 0.9595369696617126 similarity 'Vodka Stinger'@en \n", "2 Q2206588 0.9436805248260498 similarity 'Caipiroska'@en \n", "3 Q22236238 0.9384632110595703 similarity 'Mariette'@en \n", "4 Q7939317 0.9203516244888306 similarity 'Vodka Cruiser'@en \n", "5 Q11802565 0.9155371785163879 similarity 'Pan Tadeusz'@en \n", "6 Q268057 0.9129105806350708 similarity 'cosmopolitan'@en \n", "7 Q4782617 0.9107506275177002 similarity 'Aqua Velva'@en \n", "\n", " node1;description \n", "0 'vodka-based cocktail'@en \n", "1 'type of cocktail'@en \n", "2 'cocktail prepared with vodka'@en \n", "3 'vodka, alcohol'@en \n", "4 'brand of vodka-based alcoholic drink'@en \n", "5 'brand of vodka'@en \n", "6 'cocktail made with vodka'@en \n", "7 'vodka and gin based cocktail'@en " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q374 is vodka\n", "kgtk_most_similar(te_vectors, positive=['Q374'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The graph embeddings are noisy as the top matches include nodes not related to vodka, the text embeddings look much better." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at countries now as the differences between the two types of embeddings are more striking.\n", "The graph embeddings retrieve nodes that are related to Ireland:" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q1624750.8572713732719421similarity'County Cork'@en'county in Ireland'@en
1Q1862200.8410564661026001similarity'County Longford'@en'county in Ireland'@en
2Q1644210.838392972946167similarity'Connacht'@en'province in Ireland'@en
3Q1782830.837478756904602similarity'County Limerick'@en'county in Ireland'@en
4Q1314380.8331077098846436similarity'Munster'@en'province in Ireland'@en
5Q1844690.8307558298110962similarity'County Kerry'@en'county in Ireland'@en
6Q1845940.8298261761665344similarity'County Waterford'@en'county in Ireland'@en
7Q1786260.8277775049209595similarity'County Mayo'@en'county in Ireland'@en
8Q1847600.8252919316291809similarity'County Monaghan'@en'county in Ireland'@en
9Q63343350.8023019433021545similarity'Category:Deaths in Ireland'@en'Wikimedia category'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q162475 0.8572713732719421 similarity 'County Cork'@en \n", "1 Q186220 0.8410564661026001 similarity 'County Longford'@en \n", "2 Q164421 0.838392972946167 similarity 'Connacht'@en \n", "3 Q178283 0.837478756904602 similarity 'County Limerick'@en \n", "4 Q131438 0.8331077098846436 similarity 'Munster'@en \n", "5 Q184469 0.8307558298110962 similarity 'County Kerry'@en \n", "6 Q184594 0.8298261761665344 similarity 'County Waterford'@en \n", "7 Q178626 0.8277775049209595 similarity 'County Mayo'@en \n", "8 Q184760 0.8252919316291809 similarity 'County Monaghan'@en \n", "9 Q6334335 0.8023019433021545 similarity 'Category:Deaths in Ireland'@en \n", "\n", " node1;description \n", "0 'county in Ireland'@en \n", "1 'county in Ireland'@en \n", "2 'province in Ireland'@en \n", "3 'county in Ireland'@en \n", "4 'province in Ireland'@en \n", "5 'county in Ireland'@en \n", "6 'county in Ireland'@en \n", "7 'county in Ireland'@en \n", "8 'county in Ireland'@en \n", "9 'Wikimedia category'@en " ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q27 Ireland\n", "kgtk_most_similar(ge_vectors, positive=['Q27'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The text embeddings retrieve other countries:" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q1910.7966251969337463similarity'Estonia'@en'sovereign state in northeastern Europe'@en
1Q370.7891267538070679similarity'Lithuania'@en'sovereign state in northeastern Europe'@en
2Q200.7881592512130737similarity'Norway'@en'sovereign state in northern Europe'@en
3Q340.7823097109794617similarity'Sweden'@en'sovereign state in northern Europe'@en
4Q350.7809572815895081similarity'Denmark'@en'sovereign state in northern Europe that is pa...
5Q330.7614077925682068similarity'Finland'@en'sovereign state in northern Europe'@en
6Q15265380.7550898194313049similarity'Reykjavík North'@en'one of the six constituencies (kjördæmi) of I...
7Q169650190.7516392469406128similarity'North borough of Brescia'@en'one of 5 boroughs of Brescia'@en
8Q1890.7509456276893616similarity'Iceland'@en'sovereign state in Northern Europe, situated ...
9Q220.7428288459777832similarity'Scotland'@en'country in Northwest Europe, part of the Unit...
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q191 0.7966251969337463 similarity 'Estonia'@en \n", "1 Q37 0.7891267538070679 similarity 'Lithuania'@en \n", "2 Q20 0.7881592512130737 similarity 'Norway'@en \n", "3 Q34 0.7823097109794617 similarity 'Sweden'@en \n", "4 Q35 0.7809572815895081 similarity 'Denmark'@en \n", "5 Q33 0.7614077925682068 similarity 'Finland'@en \n", "6 Q1526538 0.7550898194313049 similarity 'Reykjavík North'@en \n", "7 Q16965019 0.7516392469406128 similarity 'North borough of Brescia'@en \n", "8 Q189 0.7509456276893616 similarity 'Iceland'@en \n", "9 Q22 0.7428288459777832 similarity 'Scotland'@en \n", "\n", " node1;description \n", "0 'sovereign state in northeastern Europe'@en \n", "1 'sovereign state in northeastern Europe'@en \n", "2 'sovereign state in northern Europe'@en \n", "3 'sovereign state in northern Europe'@en \n", "4 'sovereign state in northern Europe that is pa... \n", "5 'sovereign state in northern Europe'@en \n", "6 'one of the six constituencies (kjördæmi) of I... \n", "7 'one of 5 boroughs of Brescia'@en \n", "8 'sovereign state in Northern Europe, situated ... \n", "9 'country in Northwest Europe, part of the Unit... 
" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Q27 Ireland\n", "kgtk_most_similar(te_vectors, positive=['Q27'], kg_path=os.environ['OUT'] + \"/parts\", topn=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Using the embeddings in queries to the KG" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# Q281 whiskey\n", "# Q282 wine\n", "# Q3246609 mixed drink\n", "# Q374 vodka\n", "# Q332378 is absolut" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get the most similar nodes to **absolut**, the swedish vodka using the text embeddings and put it in a file" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# Q332378 is absolut\n", "kgtk_most_similar(te_vectors, positive=['Q332378'], kg_path=os.environ['OUT'] + \"/parts\", topn=1000, output_path=os.environ['TE'] + \"/Q332378.sim.tsv\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q73125600.9494207501411438similarity'Renat'@en'Swedish vodka'@en
1Q4061570.9068877696990967similarity'bäsk'@en'Swedish style spiced liquor'@en
2Q10340350.8990318775177002similarity'Finlandia Vodka'@en'Finnish brand of vodka'@en
3Q3740.8908253908157349similarity'vodka'@en'distilled alcoholic beverage'@en
4Q25535690.8900324106216431similarity'vodka martini'@en'cocktail made with vodka and vermouth'@en
5Q22065880.8866581916809082similarity'Caipiroska'@en'cocktail prepared with vodka'@en
6Q2680570.8860777616500854similarity'cosmopolitan'@en'cocktail made with vodka'@en
7Q40217060.8785414695739746similarity'Xan'@en'Vodka from Goygol'@en
8Q48692830.8784171938896179similarity'Batini'@en'vodka-based cocktail'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q7312560 0.9494207501411438 similarity 'Renat'@en \n", "1 Q406157 0.9068877696990967 similarity 'bäsk'@en \n", "2 Q1034035 0.8990318775177002 similarity 'Finlandia Vodka'@en \n", "3 Q374 0.8908253908157349 similarity 'vodka'@en \n", "4 Q2553569 0.8900324106216431 similarity 'vodka martini'@en \n", "5 Q2206588 0.8866581916809082 similarity 'Caipiroska'@en \n", "6 Q268057 0.8860777616500854 similarity 'cosmopolitan'@en \n", "7 Q4021706 0.8785414695739746 similarity 'Xan'@en \n", "8 Q4869283 0.8784171938896179 similarity 'Batini'@en \n", "\n", " node1;description \n", "0 'Swedish vodka'@en \n", "1 'Swedish style spiced liquor'@en \n", "2 'Finnish brand of vodka'@en \n", "3 'distilled alcoholic beverage'@en \n", "4 'cocktail made with vodka and vermouth'@en \n", "5 'cocktail prepared with vodka'@en \n", "6 'cocktail made with vodka'@en \n", "7 'Vodka from Goygol'@en \n", "8 'vodka-based cocktail'@en " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = !head \"$TE\"/Q332378.sim.tsv\n", "kgtk_to_dataframe(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Suppose I have absolut vodka and I want to make a cocktail. I can use the KG graph of the most similar nodes to absolut, and search the KG for mixed drinks (`Q3246609`) that appear in the list of most similar nodes to absolut.\n", "\n", "Here are some drinks we can make with absolut vodka. The query starts with our similarity file (`Q332378.sim.tsv`) in clause `sim` and filters it to select the qnodes that are instances of mixed drink (`Q3246609`) using clauses `isa` and `star`. Then the first `claims` clause selects those that have vodka as an ingredient (`Q374`) and the second `claims` clause retrieves the other ingredients." ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2node1;labelnode1;descriptioningredientingredient label
0Q25535690.8900324106216431'vodka martini'@en'cocktail made with vodka and vermouth'@enQ1105343'cocktail glass'@en
1Q25535690.8900324106216431'vodka martini'@en'cocktail made with vodka and vermouth'@enQ1621080'olive'@en
2Q25535690.8900324106216431'vodka martini'@en'cocktail made with vodka and vermouth'@enQ26877166'lemon twist'@en
3Q25535690.8900324106216431'vodka martini'@en'cocktail made with vodka and vermouth'@enQ26877423'dry vermouth'@en
4Q25535690.8900324106216431'vodka martini'@en'cocktail made with vodka and vermouth'@enQ374'vodka'@en
5Q22065880.8866581916809082'Caipiroska'@en'cocktail prepared with vodka'@enQ374'vodka'@en
6Q19668830.8709858655929565'Yorsh'@en'Russian drink of beer and vodka'@enQ374'vodka'@en
7Q19668830.8709858655929565'Yorsh'@en'Russian drink of beer and vodka'@enQ44'beer'@en
8Q17230600.8683922290802002'Kamikaze'@en'cocktail of vodka, triple sec and lime juice'@enQ1105343'cocktail glass'@en
9Q17230600.8683922290802002'Kamikaze'@en'cocktail of vodka, triple sec and lime juice'@enQ3539556'triple sec'@en
10Q17230600.8683922290802002'Kamikaze'@en'cocktail of vodka, triple sec and lime juice'@enQ374'vodka'@en
11Q17230600.8683922290802002'Kamikaze'@en'cocktail of vodka, triple sec and lime juice'@enQ5361217'lime juice'@en
12Q55800530.8639326095581055'Golden Russian'@en'cocktail of vodka and Galliano'@enQ1331962'Galliano'@en
13Q55800530.8639326095581055'Golden Russian'@en'cocktail of vodka and Galliano'@enQ374'vodka'@en
14Q55800530.8639326095581055'Golden Russian'@en'cocktail of vodka and Galliano'@enQ5361217'lime juice'@en
15Q80321310.8580197095870972'Woo Woo'@en'alcoholic beverage made of vodka, peach schna...Q26877133'lime wedge'@en
16Q80321310.8580197095870972'Woo Woo'@en'alcoholic beverage made of vodka, peach schna...Q26879660'peach schnapps'@en
17Q80321310.8580197095870972'Woo Woo'@en'alcoholic beverage made of vodka, peach schna...Q374'vodka'@en
18Q80321310.8580197095870972'Woo Woo'@en'alcoholic beverage made of vodka, peach schna...Q4131010'Highball glass'@en
19Q80321310.8580197095870972'Woo Woo'@en'alcoholic beverage made of vodka, peach schna...Q865448'Cranberry juice'@en
\n", "
" ], "text/plain": [ " node1 node2 node1;label \\\n", "0 Q2553569 0.8900324106216431 'vodka martini'@en \n", "1 Q2553569 0.8900324106216431 'vodka martini'@en \n", "2 Q2553569 0.8900324106216431 'vodka martini'@en \n", "3 Q2553569 0.8900324106216431 'vodka martini'@en \n", "4 Q2553569 0.8900324106216431 'vodka martini'@en \n", "5 Q2206588 0.8866581916809082 'Caipiroska'@en \n", "6 Q1966883 0.8709858655929565 'Yorsh'@en \n", "7 Q1966883 0.8709858655929565 'Yorsh'@en \n", "8 Q1723060 0.8683922290802002 'Kamikaze'@en \n", "9 Q1723060 0.8683922290802002 'Kamikaze'@en \n", "10 Q1723060 0.8683922290802002 'Kamikaze'@en \n", "11 Q1723060 0.8683922290802002 'Kamikaze'@en \n", "12 Q5580053 0.8639326095581055 'Golden Russian'@en \n", "13 Q5580053 0.8639326095581055 'Golden Russian'@en \n", "14 Q5580053 0.8639326095581055 'Golden Russian'@en \n", "15 Q8032131 0.8580197095870972 'Woo Woo'@en \n", "16 Q8032131 0.8580197095870972 'Woo Woo'@en \n", "17 Q8032131 0.8580197095870972 'Woo Woo'@en \n", "18 Q8032131 0.8580197095870972 'Woo Woo'@en \n", "19 Q8032131 0.8580197095870972 'Woo Woo'@en \n", "\n", " node1;description ingredient \\\n", "0 'cocktail made with vodka and vermouth'@en Q1105343 \n", "1 'cocktail made with vodka and vermouth'@en Q1621080 \n", "2 'cocktail made with vodka and vermouth'@en Q26877166 \n", "3 'cocktail made with vodka and vermouth'@en Q26877423 \n", "4 'cocktail made with vodka and vermouth'@en Q374 \n", "5 'cocktail prepared with vodka'@en Q374 \n", "6 'Russian drink of beer and vodka'@en Q374 \n", "7 'Russian drink of beer and vodka'@en Q44 \n", "8 'cocktail of vodka, triple sec and lime juice'@en Q1105343 \n", "9 'cocktail of vodka, triple sec and lime juice'@en Q3539556 \n", "10 'cocktail of vodka, triple sec and lime juice'@en Q374 \n", "11 'cocktail of vodka, triple sec and lime juice'@en Q5361217 \n", "12 'cocktail of vodka and Galliano'@en Q1331962 \n", "13 'cocktail of vodka and Galliano'@en Q374 \n", "14 'cocktail of vodka and Galliano'@en 
Q5361217 \n", "15 'alcoholic beverage made of vodka, peach schna... Q26877133 \n", "16 'alcoholic beverage made of vodka, peach schna... Q26879660 \n", "17 'alcoholic beverage made of vodka, peach schna... Q374 \n", "18 'alcoholic beverage made of vodka, peach schna... Q4131010 \n", "19 'alcoholic beverage made of vodka, peach schna... Q865448 \n", "\n", " ingredient label \n", "0 'cocktail glass'@en \n", "1 'olive'@en \n", "2 'lemon twist'@en \n", "3 'dry vermouth'@en \n", "4 'vodka'@en \n", "5 'vodka'@en \n", "6 'vodka'@en \n", "7 'beer'@en \n", "8 'cocktail glass'@en \n", "9 'triple sec'@en \n", "10 'vodka'@en \n", "11 'lime juice'@en \n", "12 'Galliano'@en \n", "13 'vodka'@en \n", "14 'lime juice'@en \n", "15 'lime wedge'@en \n", "16 'peach schnapps'@en \n", "17 'vodka'@en \n", "18 'Highball glass'@en \n", "19 'Cranberry juice'@en " ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = !$kypher -i \"$ISA\" -i \"$P279STAR\" -i \"$TE\"/Q332378.sim.tsv -i \"$Q154CLAIMS\" -i \"$Q154LABEL\" \\\n", "--match 'sim: (n1)-[]->(similarity), isa: (n1)-[]->(isa), star: (isa)-[]->(class), \\\n", " claims: (n1)-[:P186]->(:Q374), claims: (n1)-[:P186]->(ingredient), label: (ingredient)-[]->(i_label)' \\\n", "--return 'distinct n1 as node1, similarity as node2, n1.label, n1.description, \\\n", " ingredient as ingredient, i_label as `ingredient label`' \\\n", "--order-by 'cast(similarity, float) desc' \\\n", "--where 'class = \"Q3246609\"' \\\n", "--limit 20 \n", "\n", "kgtk_to_dataframe(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results are good, lots of choices of cocktails. Note that the embeddings are able to generalize from a specific vodka to vodka in general. The example also illustrates that KGTK can use the results of queries to gensim within queries to the KG.\n", "\n", "**This cell sometimes does not produce results. 
Seems to be randomly working?**" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "When we try the query using the graph embeddings, and do not explicitly filter the ingredients to include vodka:" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "# Q332378 is absolut\n", "kgtk_most_similar(ge_vectors, positive=['Q332378'], kg_path=os.environ['OUT'] + \"/parts\", topn=2000, output_path=os.environ['GE'] + \"/Q332378.sim.tsv\")" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q76332900.48373061418533325similarity'sudan'@en'traditional Korean punch'@en
1Q70846860.4789865016937256similarity'Old Pal'@en'cocktail of Canadian Rye Whiskey, Vermouth an...
2Q72716940.4633059501647949similarity'Quick Fuck'@en'layered shooter (drink)'@en
3Q24069260.44280266761779785similarity'Jägerbomb'@en'A bomb shot mixed drink'@en
4Q135270310.4344600439071655similarity'shandy'@en'family of drinks made of beer mixed with a so...
5Q330765500.41808271408081055similarity'Canadian Fashion'@en'whiskey cocktail'@en
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q7633290 0.48373061418533325 similarity 'sudan'@en \n", "1 Q7084686 0.4789865016937256 similarity 'Old Pal'@en \n", "2 Q7271694 0.4633059501647949 similarity 'Quick Fuck'@en \n", "3 Q2406926 0.44280266761779785 similarity 'Jägerbomb'@en \n", "4 Q13527031 0.4344600439071655 similarity 'shandy'@en \n", "5 Q33076550 0.41808271408081055 similarity 'Canadian Fashion'@en \n", "\n", " node1;description \n", "0 'traditional Korean punch'@en \n", "1 'cocktail of Canadian Rye Whiskey, Vermouth an... \n", "2 'layered shooter (drink)'@en \n", "3 'A bomb shot mixed drink'@en \n", "4 'family of drinks made of beer mixed with a so... \n", "5 'whiskey cocktail'@en " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = !$kypher -i \"$ISA\" -i \"$P279STAR\" -i \"$GE\"/Q332378.sim.tsv \\\n", "--match 'sim: (n1)-[]->(similarity), isa: (n1)-[]->(isa), star: (isa)-[]->(class)' \\\n", "--return 'distinct n1 as node1, similarity as node2, \"similarity\" as label, n1.label, n1.description' \\\n", "--order-by 'cast(similarity, float) desc' \\\n", "--where 'class = \"Q3246609\"' \\\n", "--limit 10 \n", "\n", "kgtk_to_dataframe(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results are poor as for the most part, the retrieved cocktails do not have vodka. Let's try the query with vodka instead of absolut vodka.\n", "\n", "**This cell sometimes does not produce results. 
Seems to be randomly working, same as above?**\n", "\n", "Now let's get the qnodes that are similar to vodka (`Q374`) using the graph embeddings:" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "# Q374 vodka\n", "kgtk_most_similar(ge_vectors, positive=['Q374'], kg_path=os.environ['OUT'] + \"/parts\", topn=1000, output_path=os.environ['GE'] + \"/Q374.sim.tsv\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1node2labelnode1;labelnode1;description
0Q113280650.7957900762557983similarity'Balalaika'@en'Japanese short drink, cocktail'@en
1Q22065880.7952877283096313similarity'Caipiroska'@en'cocktail prepared with vodka'@en
2Q35620460.6846504211425781similarity'Vodka Stinger'@en'type of cocktail'@en
3Q39005770.6435666084289551similarity'Pertini'@en'cocktail drink with honey'@en
4Q48358710.6270689964294434similarity'BLT cocktail'@en'vodka cocktail with bacon, lettuce and tomato...
5Q268794800.5979952812194824similarity'Godmother'@en'cocktail'@en
6Q4559140.5960879325866699similarity'Vodka Red Bull'@en'alcoholic beverage'@en
7Q51035980.5952239036560059similarity'Chocolate Cake'@en'cocktail'@en
8Q54597450.5844936966896057similarity'flirtini'@en'cocktail containing vodka, champagne and pine...
9Q73678180.5656362771987915similarity'Rose Kennedy'@en'cocktail with vodka, club soda and a splash o...
\n", "
" ], "text/plain": [ " node1 node2 label node1;label \\\n", "0 Q11328065 0.7957900762557983 similarity 'Balalaika'@en \n", "1 Q2206588 0.7952877283096313 similarity 'Caipiroska'@en \n", "2 Q3562046 0.6846504211425781 similarity 'Vodka Stinger'@en \n", "3 Q3900577 0.6435666084289551 similarity 'Pertini'@en \n", "4 Q4835871 0.6270689964294434 similarity 'BLT cocktail'@en \n", "5 Q26879480 0.5979952812194824 similarity 'Godmother'@en \n", "6 Q455914 0.5960879325866699 similarity 'Vodka Red Bull'@en \n", "7 Q5103598 0.5952239036560059 similarity 'Chocolate Cake'@en \n", "8 Q5459745 0.5844936966896057 similarity 'flirtini'@en \n", "9 Q7367818 0.5656362771987915 similarity 'Rose Kennedy'@en \n", "\n", " node1;description \n", "0 'Japanese short drink, cocktail'@en \n", "1 'cocktail prepared with vodka'@en \n", "2 'type of cocktail'@en \n", "3 'cocktail drink with honey'@en \n", "4 'vodka cocktail with bacon, lettuce and tomato... \n", "5 'cocktail'@en \n", "6 'alcoholic beverage'@en \n", "7 'cocktail'@en \n", "8 'cocktail containing vodka, champagne and pine... \n", "9 'cocktail with vodka, club soda and a splash o... " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = !$kypher_raw -i isa -i p279star -i \"$GE\"/Q374.sim.tsv \\\n", "--match 'sim: (n1)-[]->(similarity), isa: (n1)-[]->(isa), star: (isa)-[]->(class)' \\\n", "--return 'distinct n1 as node1, similarity as node2, \"similarity\" as label, n1.label, n1.description' \\\n", "--order-by 'cast(similarity, float) desc' \\\n", "--where 'class = \"Q3246609\"' \\\n", "--limit 10 \n", "\n", "kgtk_to_dataframe(result)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The results are good. Somehow, the graph embeddings are able to retrieve the cocktails that have vodka, but cannot generalize from absolut vodka to vodka."
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "## Produce files to load in the Google Embedding Projector\n",
 "The Google Embedding Projector (https://projector.tensorflow.org) is a tool for visualizing embeddings. To use it we need two files:\n",
 "\n",
 "- a TSV file with the vectors\n",
 "- a TSV file with the metadata, in the same order as the vectors\n",
 "\n",
 "We don't want to load all the vectors in the projector because there are too many to visualize. We will load only the following types as it will be interesting to see whether they cluster properly."
] },
{ "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [
 "focus_types = {\n",
 "    \"Q3246609\": \"mixed drink\",\n",
 "    \"Q44\": \"beer\",\n",
 "    \"Q282\": \"wine\",\n",
 "    \"Q281\": \"whiskey\",\n",
 "    \"Q374\": \"vodka\",\n",
 "    \"Q6256\": \"country\",\n",
 "}"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "To do the filtering, we construct a dictionary that maps every q-node in the KG to the set of all its superclasses. We will use this dictionary later to tag each q-node with one of the focus types. For every q-node we will test if the focus type is in the set of all super-classes."
] },
{ "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [
 "classes_result = !$kypher_raw -i \"$ISA\" -i \"$Q154CLAIMS\" -i \"$TEMP\"/Q154.descendant.tsv -i \"$P279STAR\" \\\n",
 "--match 'isa: (n1)-[]->(c), P279: (c)-[]->(class), claims: ()-[]->(class), descendant: (n1)-[]->()' \\\n",
 "--return 'distinct n1 as qnode, class as class' \n",
 "\n",
 "# Map each qnode to the set of classes the query returned for it.\n",
 "class_dict = {}\n",
 "for r in classes_result[1:]:  # skip the first output line (the column header)\n",
 "    row = r.split(\"\\t\")\n",
 "    # setdefault creates the empty set the first time a qnode is seen\n",
 "    class_dict.setdefault(row[0], set()).add(row[1])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Let's look at the class_dict for Johnnie Walker (`Q502268`).
We see that Johnnie Walker has many super classes." ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'Q102205',\n", " 'Q107715',\n", " 'Q11024',\n", " 'Q11028',\n", " 'Q111352',\n", " 'Q11435',\n", " 'Q1150070',\n", " 'Q1166770',\n", " 'Q11795009',\n", " 'Q1190554',\n", " 'Q1194058',\n", " 'Q12767945',\n", " 'Q131257',\n", " 'Q13878858',\n", " 'Q1400881',\n", " 'Q1422299',\n", " 'Q154',\n", " 'Q15401930',\n", " 'Q1554231',\n", " 'Q15619164',\n", " 'Q1632297',\n", " 'Q16686448',\n", " 'Q16722960',\n", " 'Q167270',\n", " 'Q1681365',\n", " 'Q16887380',\n", " 'Q16889133',\n", " 'Q1704572',\n", " 'Q174984',\n", " 'Q1786828',\n", " 'Q17988854',\n", " 'Q187931',\n", " 'Q1914636',\n", " 'Q20817253',\n", " 'Q20937557',\n", " 'Q2095',\n", " 'Q2150504',\n", " 'Q22269697',\n", " 'Q22272508',\n", " 'Q22294683',\n", " 'Q223557',\n", " 'Q23009552',\n", " 'Q23009675',\n", " 'Q2424752',\n", " 'Q246672',\n", " 'Q25481995',\n", " 'Q26907166',\n", " 'Q27166344',\n", " 'Q281',\n", " 'Q28728771',\n", " 'Q28732711',\n", " 'Q28813620',\n", " 'Q28877',\n", " 'Q2944660',\n", " 'Q29651519',\n", " 'Q2990593',\n", " 'Q2996394',\n", " 'Q309314',\n", " 'Q31464082',\n", " 'Q3249551',\n", " 'Q337060',\n", " 'Q35120',\n", " 'Q35758',\n", " 'Q3695082',\n", " 'Q382947',\n", " 'Q386724',\n", " 'Q40050',\n", " 'Q4026292',\n", " 'Q427581',\n", " 'Q42848',\n", " 'Q43460564',\n", " 'Q4373292',\n", " 'Q4406616',\n", " 'Q4437984',\n", " 'Q46737',\n", " 'Q478798',\n", " 'Q483247',\n", " 'Q488383',\n", " 'Q52948',\n", " 'Q5371079',\n", " 'Q56139',\n", " 'Q58415929',\n", " 'Q58416391',\n", " 'Q6031064',\n", " 'Q64732777',\n", " 'Q71550118',\n", " 'Q7184903',\n", " 'Q79529',\n", " 'Q80071',\n", " 'Q8171',\n", " 'Q8205328',\n", " 'Q82799',\n", " 'Q830077',\n", " 'Q837718',\n", " 'Q921513',\n", " 'Q9332',\n", " 'Q937228',\n", " 'Q99527517'}" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [
"class_dict['Q502268']" ] },
{ "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [
 "def focus_type(qnode):\n",
 "    \"\"\"\n",
 "    Return the focus-type name for a qnode.\n",
 "\n",
 "    The first key of `focus_types` found in the qnode's class set (looked up in\n",
 "    `class_dict`) wins. Otherwise qnodes listed in `country_qnodes` are tagged\n",
 "    \"country\", and everything else is \"other\".\n",
 "    \"\"\"\n",
 "    # Look the class set up once instead of once per focus type.\n",
 "    classes = class_dict.get(qnode) or ()\n",
 "    for type_qnode, type_name in focus_types.items():\n",
 "        if type_qnode in classes:\n",
 "            return type_name\n",
 "    if qnode in country_qnodes:\n",
 "        return \"country\"\n",
 "    return \"other\""
] },
{ "cell_type": "raw", "metadata": {}, "source": [
 "# Doesn't work because partition didn't work and we don't have the derived.isa file\n",
 "country_qnodes = set()\n",
 "!$kypher -i \"$Q154ISA\" \\\n",
 "--match '(n1)-[]->(:Q6256)'"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Construct `country_qnodes`, the set of all country qnodes."
] },
{ "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [
 "country_result = !$kypher_raw -i isa -i p279star -i \"$Q154CLAIMS\" \\\n",
 "--match 'claims: (country)-[]->(), isa: (country)-[:isa]->(c), P279: (c)-[]->(:Q6256)' \\\n",
 "--return 'distinct country as country' \n",
 "\n",
 "# Skip the first output line (the column header); every other line is one qnode.\n",
 "country_qnodes = set(country_result[1:])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "Construct `alcoholic_qnodes`, the set of all alcoholic beverage qnodes."
] },
{ "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [
 "alcoholic_qnodes = set()\n",
 "# Use a context manager so the file is closed once the scan finishes.\n",
 "with open(os.environ[\"TEMP\"] + \"/Q154.descendant.tsv\", \"r\") as descendant_file:\n",
 "    for line in descendant_file:\n",
 "        # keep only the first tab-separated column of each line\n",
 "        alcoholic_qnodes.add(line.split(\"\\t\")[0])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "The `build_embedding_projector_vectors` builds the vectors file, a TSV file with one line for each vector. We do this by scanning through the full embeddings file and selecting qnodes that are in our set of `alcoholic_qnodes` or `country_qnodes`.
We also write a file of all the qnodes that we select. We will use this file later to construct the metadata file. We have to be careful to list the qnodes in the metadata file in the same order as they appear in the vectors file." ] },
{ "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [
 "def build_embedding_projector_vectors(embeddings_path):\n",
 "    \"\"\"\n",
 "    Write the Embedding Projector vectors file and the matching qnodes file.\n",
 "\n",
 "    Scans `<embeddings_path>/embeddings.txt`, skipping its first line, and keeps\n",
 "    the rows whose qnode is in `alcoholic_qnodes` or `country_qnodes`. The vector\n",
 "    components go to `projector.vectors.tsv` (tab-separated, one vector per line)\n",
 "    and the qnodes go, in the same order and under a `node1` header, to\n",
 "    `projector.qnodes.tsv`.\n",
 "    \"\"\"\n",
 "    input_path = embeddings_path + \"/embeddings.txt\"\n",
 "    vectors_path = embeddings_path + \"/projector.vectors.tsv\"\n",
 "    qnodes_path = embeddings_path + \"/projector.qnodes.tsv\"\n",
 "\n",
 "    # Each file is opened exactly once and is closed even if the scan raises.\n",
 "    with open(input_path, \"r\") as w2v_file:\n",
 "        with open(vectors_path, \"w\") as vectors_file, open(qnodes_path, \"w\") as qnodes_file:\n",
 "            qnodes_file.write(\"node1\\n\")\n",
 "            next(w2v_file, None)  # skip the first line; tolerate an empty file\n",
 "            for line in w2v_file:\n",
 "                # Strip the newline before splitting so every written row is\n",
 "                # terminated explicitly, even if the last input line is not.\n",
 "                items = line.rstrip(\"\\n\").split(\" \")\n",
 "                qnode = items[0]\n",
 "                if qnode in alcoholic_qnodes or qnode in country_qnodes:\n",
 "                    vectors_file.write(\"\\t\".join(items[1:]) + \"\\n\")\n",
 "                    qnodes_file.write(\"{}\\n\".format(qnode))"
] },
{ "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [
 "build_embedding_projector_vectors(os.environ[\"GE\"])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Let's take a peek at our qnodes file, which we use in the next step."
] },
{ "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\n", "Q2835719\n", "Q11797012\n", "Q4010098\n", "Q220275\n", "Q2945652\n", "Q3006041\n", "Q3909881\n", "Q1398382\n", "Q2835711\n" ] } ], "source": [ "!head \"$GE\"/projector.qnodes.tsv" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [
 "The `build_embedding_projector_metadata` uses a kypher query to retrieve the labels of the qnodes (in a later version we will also include the descriptions; for now we don't because the query filters out qnodes that don't have descriptions, and unfortunately, many alcoholic beverages are missing English descriptions).\n",
 "\n",
 "The idea is:\n",
 "- Retrieve the labels for all the qnodes using the kypher query. The query returns the results in arbitrary order.\n",
 "- Build a dictionary that maps each node to the metadata that we want.\n",
 "- Scan the qnodes file and for each qnode, write a metadata line in the metadata file (`projector.metadata.tsv`)\n",
 "\n",
 "Our metadata file has three columns (you can have as many as you want):\n",
 "- tag: includes the label and the focus type as it is often difficult to tell from the label what type of beverage it is\n",
 "- qnode\n",
 "- focus type"
] },
{ "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [
 "def build_embedding_projector_metadata(embeddings_path):\n",
 "    \"\"\"\n",
 "    Write `projector.metadata.tsv` for the qnodes in `projector.qnodes.tsv`.\n",
 "\n",
 "    Labels come from a kypher query over the English labels file. Rows are\n",
 "    written in the order of the qnodes file so that they line up with the\n",
 "    vectors file. Each row holds a tag (\"<label> (<focus type>)\"), the qnode and\n",
 "    the focus type; a qnode without a label is tagged with its own id.\n",
 "    \"\"\"\n",
 "    kg_path = os.environ[\"OUT\"] + \"/parts\"\n",
 "    # The shell command below reads these paths from the environment.\n",
 "    os.environ[\"_label_graph\"] = kg_path + \"/labels.en.tsv.gz\"\n",
 "    os.environ[\"_description_graph\"] = kg_path + \"/descriptions.en.tsv.gz\"\n",
 "    os.environ[\"_qnodes\"] = embeddings_path + \"/projector.qnodes.tsv\"\n",
 "\n",
 "    # Disabled variant that also returns descriptions:\n",
 "    #result = !$kypher_raw -i \"$_label_graph\" -i \"$_description_graph\" -i \"$_qnodes\" \\\n",
 "    #--match 'qnodes: (n1)-[]->(), label: (n1)-[]->(lab), description: (n1)-[]->(des)' \\\n",
 "    #--return 'distinct n1 as node1, lab as `node1;label`, des as `node1;description`' \n",
 "\n",
 "    result = !$kypher_raw -i \"$_label_graph\" -i \"$_description_graph\" -i \"$_qnodes\" \\\n",
 "    --match 'qnodes: (n1)-[]->(), label: (n1)-[]->(lab)' \\\n",
 "    --return 'distinct n1 as node1, lab as `node1;label`'\n",
 "\n",
 "    # qnode -> label; the first output line is the column header.\n",
 "    qnode_dict = {}\n",
 "    for line in result[1:]:\n",
 "        items = line.split(\"\\t\")\n",
 "        if len(items) < 2:\n",
 "            continue  # not a \"qnode<TAB>label\" row, nothing to record\n",
 "        # qnode_dict[items[0]] = \"{} ({})\".format(items[1], items[2])\n",
 "        qnode_dict[items[0]] = \"{}\".format(items[1])\n",
 "\n",
 "    metadata_path = embeddings_path + \"/projector.metadata.tsv\"\n",
 "    # Both files are closed by the context manager, even if a row fails.\n",
 "    with open(os.environ[\"_qnodes\"]) as qnodes_file, open(metadata_path, \"w\") as metadata_file:\n",
 "        metadata_file.write(\"tag\\tqnode\\ttype\\n\")\n",
 "        next(qnodes_file, None)  # skip the header line of the qnodes file\n",
 "        for line in qnodes_file:\n",
 "            qnode = line.rstrip(\"\\n\")\n",
 "            ftype = focus_type(qnode)\n",
 "            # Fall back to the qnode id when the query returned no label for it.\n",
 "            label = qnode_dict.get(qnode, qnode)\n",
 "            tag = \"{} ({})\".format(label, ftype)\n",
 "            metadata_file.write(\"{}\\t{}\\t{}\\n\".format(tag, qnode, ftype))"
] },
{ "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [
 "build_embedding_projector_metadata(os.environ[\"GE\"])"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "Check that the file sizes are correct: the metadata file has one more line because it has a header row."
] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2172 11418 92820 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/projector.metadata.tsv\n", " 2171 217100 2720129 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/graph-embedding/projector.vectors.tsv\n", " 4343 228518 2812949 total\n" ] } ], "source": [ "!wc \"$GE\"/projector.metadata.tsv \"$GE\"/projector.vectors.tsv" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.075612903\t-0.050318368\t0.298300803\t0.421432823\t-0.248541415\t0.056908730\t-0.680637777\t-0.073326267\t-0.377136022\t-0.134014219\t-0.197469816\t0.202630788\t-0.221844554\t-0.245265618\t0.065674670\t-0.152094126\t-0.199116051\t0.087020665\t0.324004859\t0.213313386\t-0.038387485\t-0.210035563\t0.164019927\t-0.103406079\t-0.427563787\t0.039796054\t-0.031909943\t-0.089524582\t0.020940976\t0.067550369\t-0.247933343\t0.429589391\t0.332343102\t0.304964006\t0.132957101\t-0.267977208\t0.027822251\t0.224115863\t0.144821435\t0.192092970\t-0.252484620\t-0.062621295\t0.487187892\t-0.134187669\t-0.354380071\t-0.010176005\t-0.099684849\t-0.069353126\t-0.344365478\t-0.135896817\t0.216197208\t0.475384742\t0.079261489\t0.195580140\t-0.030683421\t0.130388871\t0.029154558\t0.147579566\t0.127199963\t0.265521586\t0.290794969\t-0.006078408\t0.314150780\t-0.056377053\t0.767467082\t-0.341108173\t-0.338996470\t0.261381328\t-0.195630014\t-0.060107719\t-0.220251441\t-0.136010066\t-0.238110855\t-0.007777404\t0.293746799\t-0.038727939\t0.024510127\t0.249717876\t-0.326814592\t0.219048321\t-0.202190295\t-0.071530432\t-0.216471598\t-0.028967334\t-0.265928060\t0.128483817\t0.227668896\t-0.204059884\t0.067631401\t0.391532481\t0.100808188\t0.047810022\t-0.166893899\t-0.607921243\t0.167331889\t-0.073622622\t-0.402661622\t0.474418849\t-0.296137244\t0.047653601\n" ] } ], "source": [ "!head -1 
\"$GE\"/projector.vectors.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now build the projector files for the text embeddings, and check that the sizes are ok" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "build_embedding_projector_vectors(os.environ[\"TE\"])" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "build_embedding_projector_metadata(os.environ[\"TE\"])" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2730 14366 116939 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding/projector.metadata.tsv\n", " 2729 2794496 31118184 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding/projector.vectors.tsv\n", " 2730 2730 24810 /Users/pedroszekely/Downloads/kgtk-tutorial/temp/text-embedding/projector.qnodes.tsv\n", " 8189 2811592 31259933 total\n" ] } ], "source": [ "!wc \"$TE\"/projector.metadata.tsv \"$TE\"/projector.vectors.tsv \"$TE\"/projector.qnodes.tsv" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Google embedding projector\n", "- open https://projector.tensorflow.org\n", "- Load your files using the load button\n", "- configure the visualization\n", "\n", "Here we searched on the right for absolut vodka, and we see the closest vecotrs as well as the cluster where it belongs:\n", "![Google embedding projector](assets/embedding-projector.png \"Google embedding projector\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### UMAP visualization of the graph embeddings\n", "\n", "\n", "Very few vodkas, hard to see them in the visualization.\n", "\n", "\n", "![UMAP visualization](assets/graph-embedding-umap-13.png \"UMAP visualization of graph embeddings\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### UMAP visualization of the text embeddings\n", "Very few vodkas, har to see them in the 
class ComputeEmbeddings:
    """Small wrapper that loads a ``SentenceTransformer`` model and encodes one sentence at a time."""

    def __init__(self, model_name=None):
        """Load the model called ``model_name``; a falsy name selects the default model."""
        self.model_name = model_name or 'bert-large-nli-cls-token'
        self.model = SentenceTransformer(self.model_name)

    def get_vectors(self, sentence):
        """
        Encode a single sentence with the loaded model.

        ``sentence`` may be ``str`` or ``bytes``; bytes are decoded as UTF-8
        first. The sentence is passed to ``model.encode`` as a one-element
        list with the progress bar disabled, and the result of that call is
        returned unchanged.
        """
        text = sentence.decode("utf-8") if isinstance(sentence, bytes) else sentence
        return self.model.encode([text], show_progress_bar=False)
0.8316406607627869),\n", " ('Q20571254', 0.8302997350692749),\n", " ('Q460206', 0.829006552696228)]" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "te_vectors.similar_by_vector(v)" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[90mid\u001b[39m Q6119795\n", "\u001b[42mLabel\u001b[49m Jacobsen\n", "\u001b[44mDescription\u001b[49m beer brand\n", "\u001b[30m\u001b[47minstance of\u001b[49m\u001b[39m \u001b[90m(P31)\u001b[39m\u001b[90m: \u001b[39mbrewery \u001b[90m(Q131734)\u001b[39m\n" ] } ], "source": [ "!wd u Q6119795" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtk-env", "language": "python", "name": "kgtk-env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }