{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "electronic-closure", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import gzip\n", "import csv\n", "import re" ] }, { "cell_type": "markdown", "id": "sporting-cemetery", "metadata": {}, "source": [ "## Create the file `labels.en.years.tsv.gz`\n", "\n", "get instances of class: calendar year\n", "1. kgtk query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[id:P31]->(:Q3186692)' -o parts/claims.years.2.tsv.gz\n", "\n", "get instances of class: year\n", "\n", "2. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q577)' -o parts/claims.years.1.tsv.gz\n", "\n", "get instances of class: common year\n", "\n", "3. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q235729)' -o parts/claims.years.3.tsv.gz\n", "\n", "get instances of class: leap year\n", "\n", "4. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q19828)' -o parts/claims.years.4.tsv.gz\n", "\n", "get instances of class: century leap year\n", "\n", "5. kgtk --debug query -i parts/claims.wikibase-item.tsv.gz --graph-cache $GRAPH_CACHE --match '(n)-[:P31]->(:Q3311614)' -o parts/claims.years.5.tsv.gz\n", "\n", "6. kgtk cat -i claims.years.1.tsv.gz -i claims.years.2.tsv.gz -i claims.years.3.tsv.gz -i claims.years.4.tsv.gz -i claims.years.5.tsv.gz -o claims.years.tsv.gz\n", "\n", "7. kgtk --debug query -i claims.years.tsv.gz \\\n", "-i labels.en.tsv.gz \\\n", "--graph-cache $GRAPH_CACHE \\\n", "--match 'claims: (n1)-[l]->(), labels: (n1)-[m]->(n2)' \\\n", "--return 'n1, m.label, n2' -o labels.en.years.tsv.gz" ] }, { "cell_type": "markdown", "id": "universal-feedback", "metadata": {}, "source": [ "## Create the file `claims.time.year.tsv.gz`\n", "\n", "kgtk query --graph-cache $GRAPH_CACHE \\\n", "-i claims.time.tsv.gz \\\n", "-o claims.time.year.tsv.gz \\\n", "--match '(s)-[i]->(o)' \\\n", "--where 'kgtk_date_precision(o) >= 9' \\\n", "--return 's, i.label, o as node2_full, kgtk_date_year(o) as node2'" ] }, { "cell_type": "code", "execution_count": 2, "id": "outer-organizer", "metadata": {}, "outputs": [], "source": [ "claims_time_file = 'claims.time.year.tsv.gz'\n", "years_label_file = 'labels.en.years.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 3, "id": "variable-portfolio", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2_full\tnode2\n", "P1841\tP580\t^2016-01-01T00:00:00Z/9\t2016\n", "P2847\tP2669\t^2019-04-02T00:00:00Z/11\t2019\n", "P3284\tP576\t^2016-12-13T00:00:00Z/11\t2016\n", "P370\tP571\t^2013-03-29T00:00:00Z/11\t2013\n", "P6107\tP580\t^2006-03-01T00:00:00Z/10\t2006\n", "Q100\tP571\t^1630-09-07T00:00:00Z/11\t1630\n", "Q1000\tP571\t^1960-01-01T00:00:00Z/9\t1960\n", "Q10000\tP571\t^2001-06-19T00:00:00Z/11\t2001\n", "Q1000000\tP580\t^2010-01-25T00:00:00Z/11\t2010\n", "gzcat: error writing to output: Broken pipe\n", "gzcat: claims.time.year.tsv.gz: uncompress failed\n" ] } ], "source": [ "!gzcat {claims_time_file} | head" ] }, { "cell_type": "code", "execution_count": 4, "id": "external-albuquerque", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\n", "Q100594618\tlabel\t'3022'@en\n", "Q102149476\tlabel\t'2384'@en\n", "Q103831815\tlabel\t'2688'@en\n", "Q103838319\tlabel\t'2691'@en\n", "Q1044959\tlabel\t'2064'@en\n", "Q1044959\tlabel\t'2064'@en\n", "Q1044976\tlabel\t'2063'@en\n", "Q1046496\tlabel\t'2062'@en\n", "Q1062045\tlabel\t'Yang Fire Horse'@en\n", "gzcat: error writing to output: Broken pipe\n", "gzcat: labels.en.years.tsv.gz: uncompress failed\n" ] } ], "source": [ "!gzcat {years_label_file} | head" ] }, { "cell_type": "code", "execution_count": 5, "id": "alternate-ancient", "metadata": {}, "outputs": [], "source": [ "year_qnodes = {}\n", "year_properties = {}\n", "year_labels = {}\n", "_not_present_years = set()" ] }, { "cell_type": "code", "execution_count": 6, "id": "optimum-vancouver", "metadata": {}, "outputs": [], "source": [ "# P155: follows\n", "# P156: followed by\n", "\n", "def create_year_qnode(year, year_dict):\n", " year = str(year.strip())\n", "\n", " y_qnode = f\"Q{year}-Year\"\n", " prev_year = f'Q{str(int(year) - 1)}-Year'\n", " next_year = f'Q{str(int(year) + 1)}-Year'\n", " if year in year_dict:\n", " return {'qnode': year_dict[year], 'wiki_year': True}\n", " _not_present_years.add(year)\n", " if year not in year_qnodes:\n", " year_qnodes[year] = {\n", " 'qnode': y_qnode,\n", " 'P31': 'Q3186692',\n", " 'P155': prev_year,\n", " 'P156': next_year,\n", " 'label': f'\\\"{year}\\\"',\n", " 'wiki_year': False\n", " }\n", " return year_qnodes[year]" ] }, { "cell_type": "code", "execution_count": 7, "id": "opened-macro", "metadata": {}, "outputs": [], "source": [ "def create_time_properties(property):\n", " property = str(property).strip()\n", " pnode = f'{property}-Year'\n", " if property not in year_properties:\n", " year_properties[property] = {\n", " 'P31':'Q18636219',\n", " 'pnode': pnode,\n", " 'label': f'\\\"{property}-Year\\\"'\n", " }\n", " return year_properties[property]" ] }, { "cell_type": "code", "execution_count": 8, "id": "executive-google", "metadata": {}, "outputs": [], "source": [ "def min_qnode(qnodes_list):\n", " _ = [int(x[1:]) for x in qnodes_list]\n", " return f'Q{min(_)}'" ] }, { "cell_type": "code", "execution_count": 9, "id": "parallel-advocate", "metadata": {}, "outputs": [], "source": [ "year_regex = r'(\\d*)'\n", "def create_wiki_years_dict(years_label_file):\n", " _ = {}\n", " f = gzip.open(years_label_file, 'rt')\n", " for line in f:\n", " if 'node1' not in line:\n", " vals = line.strip().split('\\t')\n", " qnode = vals[0].strip()\n", " q_label = vals[2].strip()\n", " if '@' in q_label:\n", " _label = q_label.split('@')[0].replace(\"'\", \"\")\n", " try:\n", " _label = int(_label)\n", " except:\n", "\n", " _label = ''\n", " if _label != '':\n", " _[qnode] = _label\n", " freq_dict = {}\n", " for k, v in _.items():\n", " if v not in freq_dict:\n", " freq_dict[v] = list()\n", " freq_dict[v].append(k)\n", "\n", " freq_dict = {k: v for k, v in sorted(freq_dict.items(), key=lambda item: len(item[1]), reverse=True)}\n", " correct_dict = {k : min_qnode(v) for k, v in freq_dict.items()}\n", " return {str(k): v for k, v in correct_dict.items()}" ] }, { "cell_type": "code", "execution_count": 10, "id": "wireless-convenience", "metadata": {}, "outputs": [], "source": [ "year_labels = create_wiki_years_dict(years_label_file)" ] }, { "cell_type": "code", "execution_count": 11, "id": "congressional-innocent", "metadata": {}, "outputs": [], "source": [ "f = gzip.open(claims_time_file, 'rt')\n", "output = []\n", "for line in f:\n", " if 'node1' in line and 'node2' in line:\n", " continue\n", " vals = line.strip().split('\\t')\n", " assert(len(vals) == 4)\n", " year = vals[3]\n", " property = vals[1]\n", " y_qnode = create_year_qnode(year, year_labels)['qnode']\n", " y_pnode = create_time_properties(property)['pnode']\n", " output.append({\n", " 'node1': vals[0],\n", " 'label': y_pnode,\n", " 'node2': y_qnode,\n", " })\n" ] }, { "cell_type": "code", "execution_count": 12, "id": "celtic-transition", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "186 ['2286', '3443', '2760', '2543', '2510', '2396', '2385', '9822', '3523', '4004', '5355', '2266', '2566', '3606', '2563', '2986', '4301', '2459', '2269', '2518', '2366', '3870', '6969', '2267', '3718', '2341', '3841', '2387', '2371', '2864', '3190', '2305', '3576', '2365', '2646', '2774', '3016', '2425', '2370', '2315', '2913', '2399', '2989', '2358', '7962', '3998', '4401', '2367', '2763', '4226', '8888', '2470', '2468', '2738', '3179', '2393', '2319', '5079', '3049', '3494', '2362', '2260', '2355', '2731', '2356', '2608', '2904', '3483', '2276', '2303', '3955', '3454', '2622', '3056', '2633', '2265', '2359', '3993', '2893', '2293', '2841', '2552', '2250', '3031', '2980', '2457', '2490', '3599', '2897', '1000', '3550', '2452', '3425', '2282', '2506', '2375', '2318', '2499', '2524', '3009', '3982', '2987', '4000', '2364', '2332', '2348', '2655', '2484', '2716', '3237', '2333', '2485', '2974', '2964', '2374', '3173', '2323', '2203', '2340', '2551', '2233', '4126', '2336', '2532', '3566', '2593', '8113', '5108', '2344', '2263', '2230', '2312', '3012', '3378', '2528', '2517', '2357', '2335', '3114', '3595', '2565', '3617', '2420', '2369', '5000', '3758', '2289', '2268', '3890', '2493', '2926', '2474', '2372', '9701', '2327', '9152', '2394', '2346', '2368', '3302', '6963', '2857', '3729', '2379', '3501', '2329', '2264', '2378', '2511', '2991', '2679', '2929', '3881', '3961', '3313', '2473', '3400', '3257', '2865', '2235', '2597', '2373', '9999', '2391', '3333', '2347']\n" ] } ], "source": [ "print(len(_not_present_years), list(_not_present_years))" ] }, { "cell_type": "code", "execution_count": 13, "id": "municipal-worse", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "14463714" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(output)" ] }, { "cell_type": "code", "execution_count": 14, "id": "christian-struggle", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(output)" ] }, { "cell_type": "code", "execution_count": 15, "id": "favorite-variance", "metadata": {}, "outputs": [], "source": [ "df.to_csv('claims.time.year.augmented.1.tsv.gz', index=False, sep='\\t')" ] }, { "cell_type": "code", "execution_count": 16, "id": "incredible-example", "metadata": {}, "outputs": [], "source": [ "!kgtk add-id --id-style wikidata -i 'claims.time.year.augmented.1.tsv.gz' -o 'claims.time.year.augmented.2.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 17, "id": "configured-sample", "metadata": {}, "outputs": [], "source": [ "_years = []\n", "_years_labels = []\n", "_properties = []\n", "_properties_labels = []\n", "yp = ['P31', 'P155', 'P156']\n", "ypp = ['P31']" ] }, { "cell_type": "code", "execution_count": 18, "id": "consolidated-modification", "metadata": {}, "outputs": [], "source": [ "for k in year_qnodes:\n", " __ = year_qnodes[k]\n", " for p in yp:\n", " if not __['wiki_year']:\n", " _years.append({\n", " 'node1': __['qnode'],\n", " 'label': p,\n", " 'node2': __[p]\n", " })\n", " _years_labels.append({\n", " 'node1': __['qnode'],\n", " 'label': 'label',\n", " 'node2':f'\"{__[\"label\"]}\"'\n", " })\n", "df_y = pd.DataFrame(_years)\n", "df_yl = pd.DataFrame(_years_labels)\n", "df_y.to_csv('metadata.augmented.time.years.1.tsv.gz', index=False, sep='\\t')\n", "df_yl.to_csv('metadata.augmented.labels.time.years.1.tsv.gz', index=False, sep='\\t', quoting=csv.QUOTE_NONE)" ] }, { "cell_type": "code", "execution_count": 19, "id": "every-killer", "metadata": {}, "outputs": [], "source": [ "!kgtk add-id --id-style wikidata -i 'metadata.augmented.time.years.1.tsv.gz' -o 'metadata.augmented.time.years.tsv.gz'\n", "!kgtk add-id --id-style wikidata -i 'metadata.augmented.labels.time.years.1.tsv.gz' -o 'metadata.augmented.labels.time.years.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 20, "id": "considerable-picnic", "metadata": {}, "outputs": [], "source": [ "for k in year_properties:\n", " __ = year_properties[k]\n", " for p in ypp:\n", " _properties.append({\n", " 'node1': __['pnode'],\n", " 'label': p,\n", " 'node2': __[p]\n", " })\n", " _properties_labels.append({\n", " 'node1': __['pnode'],\n", " 'label': 'label',\n", " 'node2': __['label']\n", " })\n", "df_py = pd.DataFrame(_properties)\n", "df_pyl = pd.DataFrame(_properties_labels)\n", "df_py.to_csv('metadata.augmented.properties.time.years.1.tsv.gz', index=False, sep='\\t')\n", "df_pyl.to_csv('metadata.augmented.labels.properties.time.years.1.tsv.gz', index=False, sep='\\t', quoting=csv.QUOTE_NONE)" ] }, { "cell_type": "code", "execution_count": 21, "id": "extended-olive", "metadata": {}, "outputs": [], "source": [ "!kgtk add-id --id-style wikidata -i 'metadata.augmented.properties.time.years.1.tsv.gz' -o 'metadata.augmented.properties.time.years.tsv.gz'\n", "!kgtk add-id --id-style wikidata -i 'metadata.augmented.labels.properties.time.years.1.tsv.gz' -o 'metadata.augmented.labels.properties.time.years.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 22, "id": "fallen-civilian", "metadata": {}, "outputs": [], "source": [ "# concatenate to create the augmented file" ] }, { "cell_type": "code", "execution_count": 23, "id": "confident-insulin", "metadata": {}, "outputs": [], "source": [ "!kgtk cat -i 'claims.time.year.augmented.2.tsv.gz' \\\n", "-i 'metadata.augmented.time.years.tsv.gz' \\\n", "-i 'metadata.augmented.properties.time.years.tsv.gz' \\\n", "-o 'claims.time.year.augmented.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 24, "id": "existing-picking", "metadata": {}, "outputs": [], "source": [ "# delete intermediate files" ] }, { "cell_type": "code", "execution_count": 25, "id": "historical-picnic", "metadata": {}, "outputs": [], "source": [ "!rm 'claims.time.year.augmented.1.tsv.gz' \\\n", "'claims.time.year.augmented.2.tsv.gz'\\\n", "'metadata.augmented.labels.properties.time.years.1.tsv.gz' \\\n", "'metadata.augmented.labels.time.years.1.tsv.gz' \\\n", "'metadata.augmented.properties.time.years.1.tsv.gz' \\\n", "'metadata.augmented.time.years.1.tsv.gz'" ] }, { "cell_type": "code", "execution_count": 26, "id": "dependent-campbell", "metadata": {}, "outputs": [], "source": [ "# cat this file with claims.wikibase-item.tsv.gz" ] }, { "cell_type": "code", "execution_count": null, "id": "laughing-shift", "metadata": {}, "outputs": [], "source": [ "kgtk cat -i claims.time.year.augmented.tsv.gz \\\n", "-i claims.wikibase-item.tsv.gz \\\n", "/ sort --extra '--parallel 24 --buffer-size 30% --temporary-directory /tmp' -o claims.wikibase-item.augmented.tsv.gz" ] } ], "metadata": { "kernelspec": { "display_name": "kgtk-env", "language": "python", "name": "kgtk-env" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 5 }