{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import io\n",
"import os\n",
"import subprocess\n",
"import sys\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"from IPython.display import display, HTML\n",
"\n",
"import papermill as pm\n",
"from configure_kgtk_notebooks import ConfigureKGTK"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"# Parameters\n",
"\n",
"# Folder on local machine where to create the output and temporary folders\n",
"input_path = \"/Users/pedroszekely/Downloads/kypher/iswc/inputs\"\n",
"output_path = \"/Users/pedroszekely/Downloads/kypher/iswc/ouptuts\"\n",
"project_name = \"arnold\"\n",
"files = \"all,label\""
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"files = files.split(\",\")\n",
"ck = ConfigureKGTK()\n",
"ck.configure_kgtk(input_graph_path=input_path,\n",
" output_path=output_path,\n",
" project_name=project_name)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"os.environ['KGTK_LABEL_FILE'] = \"{}\".format(os.environ['label']) \n",
"os.environ['kypher'] = \"kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\""
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EXAMPLES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/examples\n",
"USE_CASES_DIR: /Users/pedroszekely/Documents/GitHub/kgtk/use-cases\n",
"GRAPH: /Users/pedroszekely/Downloads/kypher/iswc/inputs\n",
"OUT: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold\n",
"TEMP: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold\n",
"STORE: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\n",
"kgtk: kgtk --debug\n",
"kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\n",
"GRAPH: /Users/pedroszekely/Downloads/kypher/iswc/inputs\n",
"OUT: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold\n",
"TEMP: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold\n",
"STORE: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\n",
"kgtk: kgtk --debug\n",
"kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\n",
"GRAPH: /Users/pedroszekely/Downloads/kypher/iswc/inputs\n",
"OUT: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold\n",
"TEMP: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold\n",
"STORE: /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\n",
"kgtk: kgtk --debug\n",
"kypher: kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db\n",
"all: /Users/pedroszekely/Downloads/kypher/iswc/inputs/all.tsv.gz\n",
"label: /Users/pedroszekely/Downloads/kypher/iswc/inputs/labels.en.tsv.gz\n"
]
}
],
"source": [
"ck.print_env_variables(files)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"kgtk query --graph-cache /Users/pedroszekely/Downloads/kypher/iswc/ouptuts/arnold/temp.arnold/wikidata.sqlite3.db -i \"/Users/pedroszekely/Downloads/kypher/iswc/inputs/all.tsv.gz\" --as all -i \"/Users/pedroszekely/Downloads/kypher/iswc/inputs/labels.en.tsv.gz\" --as label --limit 3\n",
"node1\tlabel\tnode2\tid\n",
"P10\tP31\tQ18610173\tP10-P31-Q18610173-85ef4d24-0\n",
"P1000\tP31\tQ18608871\tP1000-P31-Q18608871-093affb5-0\n",
"P1001\tP1647\tP276\tP1001-P1647-P276-e4e44f83-0\n"
]
}
],
"source": [
"ck.load_files_into_cache(file_list=files)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# To Do\n",
"- Do partition of the graph in to the usual Wikidata files\n",
"- Compute the derived files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Explore The Arnold Schwarzenegger Graph `Q2685`\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of nodes in the graph"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count(DISTINCT graph_1_c1.\"node1\")\n",
"218486\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[]->()' \\\n",
"--return 'count(distinct n1)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of edges in the graph (not counting qualifier edges)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count(DISTINCT graph_1_c1.\"id\")\n",
"1523735\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '()-[l]->()' \\\n",
"--return 'count(distinct l)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of qualifier edges"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count(DISTINCT graph_1_c2.\"id\")\n",
"270275\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '()-[l]->(), (l)-[q]->()' \\\n",
"--return 'count(distinct q)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of humans"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count(DISTINCT graph_1_c1.\"node1\")\n",
"10918\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P31]->(:Q5)' \\\n",
"--return 'count(distinct n1)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Number of organizations (should be updated to use P279star)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"count(DISTINCT graph_1_c1.\"node1\")\n",
"206\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P31]->(:Q43229)' \\\n",
"--return 'count(distinct n1)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Films where Schwarzenegger is a cast member"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
id | id;label |
---|
Q110397 | 'True Lies'@en |
Q15140437 | 'Terminator Genisys'@en |
Q162255 | 'The Terminator'@en |
Q170564 | 'Terminator 2: Judgment Day'@en |
Q200804 | 'Predator'@en |
Q222018 | 'Total Recall'@en |
Q2842976 | 'American Masters'@en |
Q29054009 | 'Terminator 3: Rise of the Machines'@en |
Q309003 | 'Conan the Barbarian'@en |
Q370326 | 'Eraser'@en |
Q39072454 | 'Terminator: Dark Fate'@en |
Q728267 | 'T2-3D: Battle Across Time'@en |
Q740516 | 'Conan the Destroyer'@en |
Q858840 | 'Last Action Hero'@en |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"h = !$kypher -i all \\\n",
"--match ' \\\n",
" (film)-[:P161]->(:Q2685)' \\\n",
"--return 'distinct film as id' \\\n",
"/ add-labels / html\n",
"\n",
"display(HTML(h[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"People who won the Bambi Award"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"id | id;label |
---|
Q11975 | 'Britney Spears'@en |
Q162389 | 'Tony Curtis'@en |
Q212648 | 'Rudy Giuliani'@en |
Q214574 | 'Jan Josef Liefers'@en |
Q221074 | 'Bud Spencer'@en |
Q229760 | 'Rita Ora'@en |
Q243430 | 'Terence Hill'@en |
Q2685 | 'Arnold Schwarzenegger'@en |
Q312674 | 'Giorgio Moroder'@en |
Q342617 | 'Ben Whishaw'@en |
Q450675 | 'Francis'@en |
Q60863 | 'Nadja Uhl'@en |
Q78766 | 'Elyas M\\'Barek'@en |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 3.42 ms, sys: 9.32 ms, total: 12.7 ms\n",
"Wall time: 1.15 s\n"
]
}
],
"source": [
"%%time\n",
"h = !$kypher -i all \\\n",
"--match ' \\\n",
" (film)-[:P166]->(:Q630018)' \\\n",
"--return 'distinct film as id' \\\n",
"/ add-labels / html\n",
"\n",
"display(HTML(h[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Select the subgraph that contains item to item edges"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:24:31 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT DISTINCT graph_1_c1.\"node1\" \"_aLias.node1\", graph_1_c2.\"node1\" \"_aLias.label\", graph_1_c1.\"node2\" \"_aLias.node2\", graph_1_c1.\"id\" \"_aLias.id\"\n",
" FROM graph_1 AS graph_1_c1\n",
" INNER JOIN graph_1 AS graph_1_c2\n",
" ON graph_1_c2.\"node1\" = graph_1_c1.\"label\"\n",
" AND graph_1_c1.\"label\" = graph_1_c2.\"node1\"\n",
" AND graph_1_c2.\"label\" = ?\n",
" AND graph_1_c2.\"node2\" = ?\n",
" PARAS: ['datatype', 'wikibase-item']\n",
"---------------------------------------------\n",
"CPU times: user 84.2 ms, sys: 34.2 ms, total: 118 ms\n",
"Wall time: 7.69 s\n"
]
}
],
"source": [
"%%time\n",
"!$kypher -i all \\\n",
"--match ' \\\n",
" (n1)-[r {label: property}]->(n2), \\\n",
" (property)-[l:datatype]->(:`wikibase-item`)' \\\n",
"--return 'distinct n1 as node1, property as label, n2 as node2, r as id' \\\n",
"-o \"$TEMP\"/item.edges.tsv.gz"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Compute pagerank using the undirected graph"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"node1\tlabel\tnode2\tid\n",
"P1001-P1855-Q11696-cdbf391b-0\tvertex_in_degree\t0\tP1001-P1855-Q11696-cdbf391b-0-vertex_in_degree-0\n",
"P1001-P1855-Q11696-cdbf391b-0\tvertex_out_degree\t1\tP1001-P1855-Q11696-cdbf391b-0-vertex_out_degree-1\n",
"CPU times: user 188 ms, sys: 73.5 ms, total: 261 ms\n",
"Wall time: 16.8 s\n"
]
}
],
"source": [
"%%time\n",
"!$kgtk graph-statistics -i \"$TEMP\"/item.edges.tsv.gz \\\n",
"--pagerank True \\\n",
"--hits False \\\n",
"--degrees False \\\n",
"--undirected True \\\n",
"--page-rank-property Ppagerank \\\n",
"--statistics-only True \\\n",
"-o \"$TEMP\"/pagerank.tsv.gz\n",
"\n",
"!$kypher -i \"$TEMP\"/pagerank.tsv.gz --as pagerank --limit 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Compute degrees using the directed graph"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"node1\tlabel\tnode2\tid\n",
"P1001-P1855-Q11696-cdbf391b-0\tPindegree\t0\tP1001-P1855-Q11696-cdbf391b-0-Pindegree-0\n",
"P1001-P1855-Q11696-cdbf391b-0\tPoutdegree\t1\tP1001-P1855-Q11696-cdbf391b-0-Poutdegree-1\n",
"CPU times: user 161 ms, sys: 65.2 ms, total: 227 ms\n",
"Wall time: 14.7 s\n"
]
}
],
"source": [
"%%time\n",
"!$kgtk graph-statistics -i \"$TEMP\"/item.edges.tsv.gz \\\n",
"--pagerank False \\\n",
"--hits False \\\n",
"--degrees True \\\n",
"--undirected False \\\n",
"--vertex-in-degree-property Pindegree \\\n",
"--vertex-out-degree-property Poutdegree \\\n",
"--statistics-only True \\\n",
"-o \"$TEMP\"/degrees.tsv.gz\n",
"\n",
"!$kypher -i \"$TEMP\"/degrees.tsv.gz --as degrees --limit 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Top pagerank humans, go Stalone!"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"node1 | label | node2 | node1;label |
---|
Q40026 | Ppagerank | 8.895744855562736e-05 | 'Sylvester Stallone'@en |
Q55245 | Ppagerank | 8.60081254522788e-05 | 'Laurence Olivier'@en |
Q42574 | Ppagerank | 8.053169474528737e-05 | 'James Cameron'@en |
Q76 | Ppagerank | 7.859729524225644e-05 | 'Barack Obama'@en |
Q2263 | Ppagerank | 7.455667246613304e-05 | 'Tom Hanks'@en |
Q22686 | Ppagerank | 7.389881122758954e-05 | 'Donald Trump'@en |
Q43203 | Ppagerank | 6.922374438747931e-05 | 'Clint Eastwood'@en |
Q65932 | Ppagerank | 6.83331169060919e-05 | 'Anthony Hopkins'@en |
Q35332 | Ppagerank | 6.772067757039425e-05 | 'Brad Pitt'@en |
Q7542 | Ppagerank | 6.675128769339656e-05 | 'Prince'@en |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 2.79 ms, sys: 8.17 ms, total: 11 ms\n",
"Wall time: 1.59 s\n"
]
}
],
"source": [
"%%time\n",
"h=!$kypher -i all -i pagerank \\\n",
"--match ' \\\n",
" all: (:Q5)<-[:P31]-(n1), \\\n",
" pagerank: (n1)-[l:Ppagerank]->(pagerank)' \\\n",
"--return 'n1 as node1, l.label as label, pagerank as node2' \\\n",
"--order-by 'cast(pagerank, float) desc' \\\n",
"--limit 10 \\\n",
"/ add-labels / html\n",
"\n",
"display(HTML(h[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In degree"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"node1 | label | node2 | node1;label |
---|
Q791801 | Pindegree | 16962 | 'estimation process'@en |
Q30 | Pindegree | 13437 | 'United States of America'@en |
Q5 | Pindegree | 10967 | 'human'@en |
Q6581097 | Pindegree | 7193 | 'male'@en |
Q39825 | Pindegree | 7177 | 'census'@en |
Q1860 | Pindegree | 6899 | 'English'@en |
Q15911027 | Pindegree | 3824 | 'demographic balance'@en |
Q15221623 | Pindegree | 3178 | 'bilateral relation'@en |
Q183 | Pindegree | 2984 | 'Germany'@en |
Q28378282 | Pindegree | 2674 | 'verified account'@en |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.02 ms, sys: 10.1 ms, total: 14.1 ms\n",
"Wall time: 1.32 s\n"
]
}
],
"source": [
"%%time\n",
"h=!$kypher -i all -i degrees \\\n",
"--match ' \\\n",
" degrees: (n1)-[l:Pindegree]->(degree)' \\\n",
"--return 'n1 as node1, l.label as label, degree as node2' \\\n",
"--order-by 'cast(degree, int) desc' \\\n",
"--limit 10 \\\n",
"/ add-labels / html\n",
"\n",
"display(HTML(h[0]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Out degree"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"node1 | label | node2 | node1;label |
---|
Q8704 | Poutdegree | 98 | 'Walt Disney'@en |
Q22686 | Poutdegree | 87 | 'Donald Trump'@en |
Q23505 | Poutdegree | 85 | 'George H. W. Bush'@en |
Q76 | Poutdegree | 85 | 'Barack Obama'@en |
Q9916 | Poutdegree | 81 | 'Dwight D. Eisenhower'@en |
Q38111 | Poutdegree | 73 | 'Leonardo DiCaprio'@en |
Q103876 | Poutdegree | 72 | 'Peter O\\'Toole'@en |
Q9960 | Poutdegree | 71 | 'Ronald Reagan'@en |
Q2105 | Poutdegree | 69 | 'Jacques Chirac'@en |
Q9582 | Poutdegree | 68 | 'Gerald Ford'@en |
"
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"h=!$kypher -i all -i degrees \\\n",
"--match ' \\\n",
" degrees: (n1)-[l:Poutdegree]->(degree), all: (n1)-[:P31]->(:Q5)' \\\n",
"--return 'n1 as node1, l.label as label, degree as node2' \\\n",
"--order-by 'cast(degree, int) desc' \\\n",
"--limit 10 \\\n",
"/ add-labels / html\n",
"\n",
"display(HTML(h[0]))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"q-nodes with ULAN id"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:25:02 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT count(DISTINCT graph_1_c1.\"node1\")\n",
" FROM graph_1 AS graph_1_c1\n",
" WHERE graph_1_c1.\"label\" = ?\n",
" PARAS: ['P245']\n",
"---------------------------------------------\n",
"count(DISTINCT graph_1_c1.\"node1\")\n",
"431\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P245]->(ulan_id)' \\\n",
"--return 'count(distinct n1)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sample of nodes with ULAN ids"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:25:03 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT DISTINCT graph_1_c1.\"node1\" \"_aLias.node1\", graph_1_c1.\"label\" \"_aLias.label\", graph_1_c1.\"node2\" \"_aLias.node2\"\n",
" FROM graph_1 AS graph_1_c1\n",
" WHERE graph_1_c1.\"label\" = ?\n",
" LIMIT ?\n",
" PARAS: ['P245', 10]\n",
"---------------------------------------------\n",
"| node1 | label | node2 | node1;label | label;label |\n",
"| --------- | ----- | ----------- | -------------------------------------- | ---------------------------------- |\n",
"| Q100948 | P245 | \"500224955\" | 'Rachel Carson'@en | 'Union List of Artist Names ID'@en |\n",
"| Q101771 | P245 | \"500281177\" | 'Gottfried Gruben'@en | 'Union List of Artist Names ID'@en |\n",
"| Q101791 | P245 | \"500001235\" | 'Sep Ruf'@en | 'Union List of Artist Names ID'@en |\n",
"| Q102139 | P245 | \"500256782\" | 'Margrethe II of Denmark'@en | 'Union List of Artist Names ID'@en |\n",
"| Q1024362 | P245 | \"500302331\" | 'Spanish National Research Council'@en | 'Union List of Artist Names ID'@en |\n",
"| Q1024426 | P245 | \"500286871\" | 'University of South Carolina'@en | 'Union List of Artist Names ID'@en |\n",
"| Q102711 | P245 | \"500114625\" | 'Dennis Hopper'@en | 'Union List of Artist Names ID'@en |\n",
"| Q10288082 | P245 | \"500304375\" | 'Wildenstein & Company'@en | 'Union List of Artist Names ID'@en |\n",
"| Q103876 | P245 | \"500355461\" | 'Peter O\\'Toole'@en | 'Union List of Artist Names ID'@en |\n",
"| Q1065 | P245 | \"500354403\" | 'United Nations'@en | 'Union List of Artist Names ID'@en |\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[l:P245]->(ulan_id)' \\\n",
"--return 'distinct n1 as node1, l.label as label, ulan_id as node2' \\\n",
"--limit 10 \\\n",
"/ add-labels / table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Count of q-nodes with ULAN ids and place of birth (P19)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:25:03 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT count(DISTINCT graph_1_c1.\"node1\")\n",
" FROM graph_1 AS graph_1_c1\n",
" INNER JOIN graph_1 AS graph_1_c2\n",
" ON graph_1_c1.\"node1\" = graph_1_c2.\"node1\"\n",
" AND graph_1_c1.\"label\" = ?\n",
" AND graph_1_c2.\"label\" = ?\n",
" PARAS: ['P245', 'P19']\n",
"---------------------------------------------\n",
"count(DISTINCT graph_1_c1.\"node1\")\n",
"143\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P245]->(ulan_id), (n1)-[:P19]->()' \\\n",
"--return 'count(distinct n1)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sample of q-nodes that have ULAN ids, but don't have birth place"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:25:04 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT DISTINCT graph_1_c1.\"node1\" \"_aLias.node1\"\n",
" FROM graph_1 AS graph_1_c1\n",
" LEFT JOIN graph_1 AS graph_1_c2\n",
" ON graph_1_c1.\"node1\" = graph_1_c2.\"node1\"\n",
" AND graph_1_c2.\"label\" = ?\n",
" WHERE graph_1_c1.\"label\" = ?\n",
" AND (graph_1_c2.\"node2\" IS NULL)\n",
" LIMIT ?\n",
" PARAS: ['P19', 'P245', 10]\n",
"---------------------------------------------\n",
"| node1 | node1;label |\n",
"| --------- | -------------------------------------- |\n",
"| Q100948 | 'Rachel Carson'@en |\n",
"| Q102139 | 'Margrethe II of Denmark'@en |\n",
"| Q1024362 | 'Spanish National Research Council'@en |\n",
"| Q1024426 | 'University of South Carolina'@en |\n",
"| Q102711 | 'Dennis Hopper'@en |\n",
"| Q10288082 | 'Wildenstein & Company'@en |\n",
"| Q1065 | 'United Nations'@en |\n",
"| Q1065414 | 'Vrije Universiteit Amsterdam'@en |\n",
"| Q1066442 | 'Charles Moore'@en |\n",
"| Q1068072 | 'University of Delaware'@en |\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P245]->(ulan_id)' \\\n",
"--opt '(n1)-[:P19]->(birth_place)' \\\n",
"--where: 'birth_place is null' \\\n",
"--return 'distinct n1 as node1' \\\n",
"--limit 10 \\\n",
"/ add-labels / table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Count of q-nodes with ULAN id and birth date"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:25:05 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT count(DISTINCT graph_1_c1.\"node1\")\n",
" FROM graph_1 AS graph_1_c1\n",
" INNER JOIN graph_1 AS graph_1_c2\n",
" ON graph_1_c1.\"node1\" = graph_1_c2.\"node1\"\n",
" AND graph_1_c1.\"label\" = ?\n",
" AND graph_1_c2.\"label\" = ?\n",
" PARAS: ['P245', 'P569']\n",
"---------------------------------------------\n",
"count(DISTINCT graph_1_c1.\"node1\")\n",
"240\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P245]->(ulan_id), (n1)-[:P569]->()' \\\n",
"--return 'count(distinct n1)'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sample of q-nodes that have ULAN ids, but don't have birth date (P569)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"| node1 | node1;label |\n",
"| -------- | ---------------------- |\n",
"| Q1280275 | 'Pan Painter'@en |\n",
"| Q133337 | 'Solon'@en |\n",
"| Q1405 | 'Augustus'@en |\n",
"| Q168261 | 'Ptolemy I Soter'@en |\n",
"| Q2039 | 'Titus Livius'@en |\n",
"| Q26825 | 'Herodotos'@en |\n",
"| Q43353 | 'Aristophanes'@en |\n",
"| Q5264 | 'Hippocrates'@en |\n",
"| Q577906 | 'Antimenes Painter'@en |\n",
"| Q902022 | 'Darius Painter'@en |\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[:P245]->(ulan_id), (n1)-[:P31]->(:Q5)' \\\n",
"--opt '(n1)-[:P569]->(birth_place)' \\\n",
"--where: 'birth_place is null' \\\n",
"--return 'distinct n1 as node1' \\\n",
"--limit 10 \\\n",
"/ add-labels / table"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2021-10-01 17:25:07 query]: SQL Translation:\n",
"---------------------------------------------\n",
" SELECT DISTINCT graph_1_c1.\"node1\" \"_aLias.node\", graph_1_c1.\"label\" \"_aLias.label\", graph_1_c1.\"node2\" \"_aLias.node2\"\n",
" FROM graph_1 AS graph_1_c1\n",
" WHERE graph_1_c1.\"label\" = ?\n",
" PARAS: ['P245']\n",
"---------------------------------------------\n"
]
}
],
"source": [
"!$kypher -i all \\\n",
"--match '(n1)-[l:P245]->(ulan_id)' \\\n",
"--return 'distinct n1 as node, l.label as label, ulan_id as node2' \\\n",
"-o \"$TEMP\"/ulan-qnodes.tsv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "raw",
"metadata": {},
"source": [
"%%time\n",
"!$kgtk --debug graph-embeddings -i \"$TEMP\"/item.edges.tsv.gz \\\n",
"--dimension 20 \\\n",
"--output_format kgtk \\\n",
"-o \"$OUT\"/graph-embeddings.tsv.gz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "kgtk-env",
"language": "python",
"name": "kgtk-env"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}