{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "judicial-latvia", "metadata": {}, "outputs": [], "source": [ "!kgtk --debug query -i ../../gdrive-kgtk-dump-2020-12-07/claims.tsv.gz \\\n", " --match \"(x)-[r{label: property}]->(y{wikidatatype: wikidatatype})\" \\\n", " --return 'r.id as `id`, x as `node1`, property as `label`, y as `node2`, wikidatatype as `node2;wikidatatype`' \\\n", " -o ../../data/claims.edited.tsv \\\n", " --graph-cache ~/temp1.sqlite3.db" ] }, { "cell_type": "code", "execution_count": null, "id": "attended-sphere", "metadata": {}, "outputs": [], "source": [ "!kgtk --debug query -i ../../data/removed_statements.tsv \\\n", " ../../gdrive-kgtk-dump-2020-12-07/metadata.property.datatypes.tsv.gz \\\n", " --match \"removed: (x)-[r{label: property}]->(y), datatypes: (property)-[]->(datatype)\" \\\n", " --return 'r.id as `id`, x as `node1`, property as `label`, y as `node2`, datatype as `node2;wikidatatype`' \\\n", " -o ../../data/removed_statements_w_datatype.tsv --graph-cache ~/temp1.sqlite3.db" ] }, { "cell_type": "code", "execution_count": null, "id": "adjacent-disorder", "metadata": {}, "outputs": [], "source": [ "!kgtk cat -i ../../data/claims.edited.tsv \\\n", " ../../data/removed_statements_w_datatype.tsv \\\n", " -o ../../data/claims.w_removed_statements.tsv" ] }, { "cell_type": "code", "execution_count": 1, "id": "juvenile-ability", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a8ed952b33f9462f86bdcae389daaf11", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1225057250 [00:00(node2), c: (rLabel)-[:P2308]->(parent), d: (node1)-[]->(par), c: (eLabel)-[:P2303]->(eNode)\" \\\n", " --where 'nodeProp.label = rLabel and (par = parent or (rLabel = eLabel and node1 = eNode))' \\\n", " --return 'nodeProp.id, node1, nodeProp.label, node2, max(parent) as `node1;ancestor`' \\\n", " -o 
../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/mandConst001.sqlite3.db ; \\\n", " kgtk --debug ifnotexists -i ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.all.tsv \\\n", " --filter-on ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": 4, "id": "abstract-retreat", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-03-12 22:33:26 query]: SQL Translation:\r\n", "---------------------------------------------\r\n", " SELECT graph_11_c1.\"id\", graph_11_c1.\"node1\", graph_11_c1.\"label\", graph_11_c1.\"node2\"\r\n", " FROM graph_11 AS graph_11_c1, graph_14 AS graph_14_c2\r\n", " WHERE graph_11_c1.\"node1\"=graph_14_c2.\"node1\"\r\n", " AND (graph_14_c2.\"node2\" IN (?, ?, ?))\r\n", " PARAS: ['Q1238720', 'Q3331189', 'Q47461344']\r\n", "---------------------------------------------\r\n" ] } ], "source": [ "!kgtk --debug query -i ../../propertiesSplitWRemoved2/claims.P996.tsv \\\n", " ../../wikidata-20210215/derived.isastar.tsv.gz \\\n", " --match \"m: (node1)-[nodeProp]->(node2), d: (node1)-[]->(par)\" \\\n", " --where 'par in [\"Q1238720\",\"Q3331189\",\"Q47461344\"]' \\\n", " --return 'nodeProp.id, node1, nodeProp.label, node2' \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/mandConst001.sqlite3.db ; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.P996.tsv \\\n", " --filter-on ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.correct.tsv \\\n", " 
--filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": 7, "id": "strange-truck", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "81289 ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.correct.tsv\r\n" ] } ], "source": [ "!wc -l ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P996.correct.tsv" ] }, { "cell_type": "code", "execution_count": 8, "id": "finnish-hampton", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-03-12 22:49:54 sqlstore]: IMPORT graph directly into table graph_15 from /data/wd-correctness/propertiesSplit/claims.P991.tsv ...\n", "[2021-03-12 22:49:54 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_15_c1.\"id\", graph_15_c1.\"node1\", graph_15_c1.\"label\", graph_15_c1.\"node2\"\n", " FROM graph_15 AS graph_15_c1, graph_5 AS graph_5_c2\n", " WHERE graph_15_c1.\"node1\"=graph_5_c2.\"node1\"\n", " AND (graph_5_c2.\"node2\" IN (?))\n", " PARAS: ['Q40231']\n", "---------------------------------------------\n", "[2021-03-12 22:49:55 sqlstore]: CREATE INDEX on table graph_15 column node1 ...\n", "[2021-03-12 22:49:55 sqlstore]: ANALYZE INDEX on table graph_15 column node1 ...\n" ] } ], "source": [ "!kgtk --debug query -i ../../propertiesSplitWRemoved2/claims.P991.tsv \\\n", " ../../wikidata-20210215/derived.P31P279star.tsv.gz \\\n", " --match \"m: (node1)-[nodeProp]->(node2), d: (node1)-[]->(par)\" \\\n", " --where 'par in [\"Q40231\"]' \\\n", " --return 'nodeProp.id, node1, nodeProp.label, node2' \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P991.correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/mandConst001.sqlite3.db ; \\\n", 
" kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.P991.tsv \\\n", " --filter-on ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P991.correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P991.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": 9, "id": "elegant-reverse", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-03-12 23:54:56 sqlstore]: IMPORT graph directly into table graph_16 from /data/wd-correctness/propertiesSplit/claims.P965.tsv ...\n", "[2021-03-12 23:54:56 query]: SQL Translation:\n", "---------------------------------------------\n", " SELECT graph_16_c1.\"id\", graph_5_c2.\"node1\", graph_16_c1.\"label\", graph_16_c1.\"node2\"\n", " FROM graph_16 AS graph_16_c1, graph_5 AS graph_5_c2\n", " WHERE graph_16_c1.\"node1\"=graph_5_c2.\"node1\"\n", " AND (graph_5_c2.\"node2\" IN (?))\n", " PARAS: ['Q6023295']\n", "---------------------------------------------\n", "[2021-03-12 23:54:56 sqlstore]: CREATE INDEX on table graph_16 column node1 ...\n", "[2021-03-12 23:54:56 sqlstore]: ANALYZE INDEX on table graph_16 column node1 ...\n" ] } ], "source": [ "!kgtk --debug query -i ../../propertiesSplitWRemoved2/claims.P965.tsv \\\n", " ../../wikidata-20210215/derived.P31P279star.tsv.gz \\\n", " --match \"m: (node1)-[nodeProp]->(node2), d: (node1)-[]->(par)\" \\\n", " --where 'par in [\"Q6023295\"]' \\\n", " --return 'nodeProp.id, node1, nodeProp.label, node2' \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P965.correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/mandConst001.sqlite3.db ; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.P965.tsv \\\n", " --filter-on 
../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P965.correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../constraintsOP/typeConstraint/claims.type-constraints.mandatory.instanceOf.P965.incorrect.tsv" ] }, { "cell_type": "markdown", "id": "matched-strength", "metadata": {}, "source": [ "# Generate Queries" ] }, { "cell_type": "markdown", "id": "black-insured", "metadata": {}, "source": [ "## Type Constraint" ] }, { "cell_type": "markdown", "id": "interior-humor", "metadata": {}, "source": [ "### Understand Constraints File" ] }, { "cell_type": "code", "execution_count": 2, "id": "clinical-brunei", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "df = pd.read_csv('../../constraintsOP/typeConstraint/claims.type-constraints_all.tsv',sep='\\t')" ] }, { "cell_type": "code", "execution_count": 3, "id": "assured-cleaners", "metadata": {}, "outputs": [], "source": [ "df1 = df.groupby(['node1','label']).node2.apply(lambda p: p.tolist()).reset_index()" ] }, { "cell_type": "code", "execution_count": 4, "id": "sharing-evolution", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
0P1001P2308[Q102496, Q105985, Q1140371, Q1151067, Q119768...
1P1001P2309[Q30208840]
2P1002P2308[Q630010]
3P1002P2309[Q21514624]
4P1004P2308[Q2221906, Q23413, Q3947, Q41176, Q88291]
\n", "
" ], "text/plain": [ " node1 label node2\n", "0 P1001 P2308 [Q102496, Q105985, Q1140371, Q1151067, Q119768...\n", "1 P1001 P2309 [Q30208840]\n", "2 P1002 P2308 [Q630010]\n", "3 P1002 P2309 [Q21514624]\n", "4 P1004 P2308 [Q2221906, Q23413, Q3947, Q41176, Q88291]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "still-trail", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
69P105P2308[Q16521]
70P105P2309[Q21503252]
71P105P2316[Q21502408]
\n", "
" ], "text/plain": [ " node1 label node2\n", "69 P105 P2308 [Q16521]\n", "70 P105 P2309 [Q21503252]\n", "71 P105 P2316 [Q21502408]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df1[df1['node1'] == 'P105']" ] }, { "cell_type": "markdown", "id": "solid-browser", "metadata": {}, "source": [ "### Query Generator" ] }, { "cell_type": "code", "execution_count": 6, "id": "bright-impossible", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "392202b0f3984101a4787815b59edf5e", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/4810 [00:00(node2), \" + parentFile + \": (node1)-[]->(nodex), P279star: (nodex)-[]->(par)' \\\n", " --where 'par in \" + str(parents).replace(\"'\",'\"') + \" \" + exceptionPart + \"' \\\n", " --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/const120_\" + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/\" + shellFileSuffix + str(fCnt) + \".txt;\")\n", "\n", " cnt += 1\n", " except:\n", " print(\"Something failed for prop:\",prop)\n", "\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 7, "id": 
"electrical-agreement", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1465" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 8, "id": "outside-stupid", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,33):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/typeConstraintValidator\"+str(i)+\".sh\")\n", " " ] }, { "cell_type": "markdown", "id": "competitive-canvas", "metadata": {}, "source": [ "### Analyze Violations" ] }, { "cell_type": "code", "execution_count": 3, "id": "casual-perth", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e2aa42859569406cb8ee7fb237917535", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "22777e2fe5a34330b18a7bc2d970b4d0", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/334 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P13032031482610[../../allConstraintsAnalysisWRemoved2/typeCon...0.012685
P30170594928017[../../allConstraintsAnalysisWRemoved2/typeCon...0.038172
P39193393158[../../allConstraintsAnalysisWRemoved2/typeCon...0.044495
P618526912[../../allConstraintsAnalysisWRemoved2/typeCon...0.042705
P3922182440[../../allConstraintsAnalysisWRemoved2/typeCon...0.021459
\n", "" ], "text/plain": [ " correct incorrect paths \\\n", "P1303 203148 2610 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P301 705949 28017 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P3919 3393 158 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P6185 269 12 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P3922 1824 40 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "\n", " violation_ratio \n", "P1303 0.012685 \n", "P301 0.038172 \n", "P3919 0.044495 \n", "P6185 0.042705 \n", "P3922 0.021459 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "typeConstDF.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "competitive-peeing", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P538010[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P558905[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P800406[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P135402[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P5051081[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P651004[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P173402[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P601403[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P2303044[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P6001016[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P231304[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P580201[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P8738014[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P231205[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
P231104[../../allConstraintsAnalysisWRemoved2/typeCon...1.0
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P538 0 10 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P5589 0 5 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P8004 0 6 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P1354 0 2 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P5051 0 81 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P6510 0 4 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P1734 0 2 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P6014 0 3 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P2303 0 44 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P6001 0 16 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P2313 0 4 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P5802 0 1 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P8738 0 14 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P2312 0 5 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P2311 0 4 [../../allConstraintsAnalysisWRemoved2/typeCon... 
\n", "\n", " violation_ratio \n", "P538 1.0 \n", "P5589 1.0 \n", "P8004 1.0 \n", "P1354 1.0 \n", "P5051 1.0 \n", "P6510 1.0 \n", "P1734 1.0 \n", "P6014 1.0 \n", "P2303 1.0 \n", "P6001 1.0 \n", "P2313 1.0 \n", "P5802 1.0 \n", "P8738 1.0 \n", "P2312 1.0 \n", "P2311 1.0 " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "typeConstDF.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 10, "id": "backed-corruption", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['../../allConstraintsAnalysisWRemoved2/typeConstraint_Final/normal/claims.type-constraints.instanceOf.P4945.correct.tsv',\n", " '../../allConstraintsAnalysisWRemoved2/typeConstraint_Final/normal/claims.type-constraints.instanceOf.P4945.incorrect.tsv']" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "list(typeConstDF.loc['P4945'].paths)" ] }, { "cell_type": "code", "execution_count": 11, "id": "clinical-lawsuit", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1465.000000\n", "mean 0.113799\n", "std 0.226303\n", "min 0.000000\n", "25% 0.006623\n", "50% 0.022537\n", "75% 0.085443\n", "max 1.000000\n", "Name: violation_ratio, dtype: float64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "typeConstDF['violation_ratio'].describe()" ] }, { "cell_type": "code", "execution_count": 12, "id": "wanted-domestic", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Type Constraint Violation Ratios')" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "typeConstDF['violation_ratio'].plot.hist(bins=100).set_title(\"Type Constraint Violation Ratios\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "sufficient-hollywood", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Type Constraint Violation Ratios (<=0.05)')" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "typeConstDF[typeConstDF['violation_ratio'] <= 0.05].violation_ratio.plot.hist(bins=100).set_title(\"Type Constraint Violation Ratios (<=0.05)\")" ] }, { "cell_type": "code", "execution_count": null, "id": "minor-marshall", "metadata": {}, "outputs": [], "source": [ "# Count constraints whose violation ratio is strictly above the mean.\n", "# The mean is computed from the data: the previous hard-coded threshold (5.286054)\n", "# exceeded the maximum possible ratio of 1.0, so the count was always 0.\n", "ratio_mean = typeConstDF['violation_ratio'].mean()\n", "n_above_mean = int((typeConstDF['violation_ratio'] > ratio_mean).sum())\n", "print(f\"No. of constraints whose violation ratio is greater than mean :{n_above_mean}/{len(typeConstDF)}\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "special-consensus", "metadata": {}, "outputs": [], "source": [ "# typeConstDF.sort_values(by=['incorrect'],ascending=False).head(5).paths.values" ] }, { "cell_type": "code", "execution_count": 16, "id": "excited-person", "metadata": {}, "outputs": [], "source": [ "# !cat ../../allConstraintsAnalysisWRemoved2/typeConstraint/normal/claims.type-constraints.instanceOf.P953.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": 17, "id": "revolutionary-violence", "metadata": {}, "outputs": [], "source": [ "for key1 in typeConstViolations.keys():\n", " typeConstViolations[key1]['correct'] = typeConstViolations[key1]['instanceOf']['correct'] + typeConstViolations[key1]['subclass']['correct'] + typeConstViolations[key1]['instanceOfOrSubclass']['correct']\n", " typeConstViolations[key1]['incorrect'] = typeConstViolations[key1]['instanceOf']['incorrect'] + typeConstViolations[key1]['subclass']['incorrect'] + typeConstViolations[key1]['instanceOfOrSubclass']['incorrect']\n", " typeConstViolations[key1]['VR'] = typeConstViolations[key1]['incorrect'] / (typeConstViolations[key1]['correct'] + typeConstViolations[key1]['incorrect'])\n", " " ] }, { "cell_type": "code", "execution_count": 18, "id": "emotional-favorite", "metadata": { "scrolled": true }, "outputs": [ { "data": { 
"text/plain": [ "{'mandatory': {'instanceOf': {'correct': 46304082, 'incorrect': 795451},\n", " 'subclass': {'correct': 2064, 'incorrect': 53},\n", " 'instanceOfOrSubclass': {'correct': 233195, 'incorrect': 3169},\n", " 'propCount': 167,\n", " 'correct': 46539341,\n", " 'incorrect': 798673,\n", " 'VR': 0.01687170484169446},\n", " 'suggestion': {'instanceOf': {'correct': 61936, 'incorrect': 18751},\n", " 'subclass': {'correct': 0, 'incorrect': 0},\n", " 'instanceOfOrSubclass': {'correct': 24237, 'incorrect': 3458},\n", " 'propCount': 11,\n", " 'correct': 86173,\n", " 'incorrect': 22209,\n", " 'VR': 0.20491410012732741},\n", " 'normal': {'instanceOf': {'correct': 425646789, 'incorrect': 5275469},\n", " 'subclass': {'correct': 98826, 'incorrect': 13611},\n", " 'instanceOfOrSubclass': {'correct': 68370289, 'incorrect': 852276},\n", " 'propCount': 1287,\n", " 'correct': 494115904,\n", " 'incorrect': 6141356,\n", " 'VR': 0.012276395548962147}}" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "typeConstViolations" ] }, { "cell_type": "code", "execution_count": 19, "id": "aggregate-impact", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratiototal
P2093148843205927027[../../allConstraintsAnalysisWRemoved2/typeCon...0.006190149770232
P147644059166208472[../../allConstraintsAnalysisWRemoved2/typeCon...0.00470944267638
P57739990807165864[../../allConstraintsAnalysisWRemoved2/typeCon...0.00413040156671
P143337028672112955[../../allConstraintsAnalysisWRemoved2/typeCon...0.00304137141627
P121533425605316565[../../allConstraintsAnalysisWRemoved2/typeCon...0.00938233742170
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P2093 148843205 927027 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P1476 44059166 208472 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P577 39990807 165864 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P1433 37028672 112955 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "P1215 33425605 316565 [../../allConstraintsAnalysisWRemoved2/typeCon... \n", "\n", " violation_ratio total \n", "P2093 0.006190 149770232 \n", "P1476 0.004709 44267638 \n", "P577 0.004130 40156671 \n", "P1433 0.003041 37141627 \n", "P1215 0.009382 33742170 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "typeConstDF['total'] = typeConstDF['correct'] + typeConstDF['incorrect']\n", "typeConstDF.sort_values(by=['total'],ascending=False).head()" ] }, { "cell_type": "markdown", "id": "bearing-kruger", "metadata": {}, "source": [ "### Find out time required" ] }, { "cell_type": "code", "execution_count": null, "id": "assumed-toner", "metadata": {}, "outputs": [], "source": [ "# from tqdm.notebook import tqdm\n", "# import os.path\n", "\n", "# cnt = 0\n", "# fCnt = 1\n", "# for prop in tqdm(df1.node1.unique()):\n", "# try:\n", "# if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv\")):\n", "# continue\n", "# relation = df1[(df1['node1'] == prop) & (df1['label'] == 'P2309')].node2.values[0][0]\n", "# type1 = df1[(df1['node1'] == prop) & (df1['label'] == 'P2316')].node2.values\n", "\n", "# parents = df1[(df1['node1'] == prop) & (df1['label'] == 'P2308')].node2.values[0]\n", "# exceptions = df1[(df1['node1'] == prop) & (df1['label'] == 'P2303')].node2.values\n", "\n", "# # print(prop, relation, type1, parents, exceptions)\n", "\n", "# if relation == \"Q21503252\":\n", "# parentFile = \"P31P279star\"\n", "# parentTitle = 'instanceOf'\n", "# elif relation == \"Q21514624\":\n", "# parentFile = \"P279star\"\n", "# parentTitle = 'subclass'\n", "# 
else:\n", "# parentFile = \"isastar\"\n", "# parentTitle = 'instanceOfOrSubclass'\n", "\n", "# if len(type1) != 0 and type1[0][0] == \"Q21502408\":\n", "# typeVal = \"mandatory\"\n", "# elif len(type1) != 0 and type1[0][0] == \"Q62026391\":\n", "# typeVal = \"suggestion\"\n", "# else:\n", "# typeVal = \"normal\"\n", "\n", "# if len(exceptions):\n", "# exceptionPart = \"or node1 in \" + str(exceptions[0]).replace(\"'\",'\"')\n", "# else:\n", "# exceptionPart = \"\"\n", " \n", "# if cnt % 100 == 0:\n", "# fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/TimedTypeConstraintValidator\" + str(fCnt) + \".sh\",\"w\")\n", "# fCnt += 1\n", " \n", "# fOP.write(\"{ time kgtk --debug query -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", "# ../../wikidata-20210215/derived.\" + parentFile + \".tsv.gz \\\n", "# --match 'm: (node1)-[nodeProp]->(node2), d: (node1)-[]->(par)' \\\n", "# --where 'par in \" + str(parents).replace(\"'\",'\"') + \" \" + exceptionPart + \"' \\\n", "# --return 'nodeProp.id, node1, nodeProp.label, node2' \\\n", "# -o ../../allConstraintsAnalysisWRemoved2/TimedTypeConstraint/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".correct.tsv \\\n", "# --graph-cache ~/sqlite3_caches/const2123_\" + str(fCnt) + \".sqlite3.db; } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/TimedTypeConstraint_TimedTypeConstraintValidator\" + str(fCnt) + \".txt ; \\\n", "# kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", "# --filter-on ../../allConstraintsAnalysisWRemoved2/TimedTypeConstraint/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".correct.tsv \\\n", "# --filter-mode NONE \\\n", "# --input-keys node1 label \\\n", "# --filter-keys node1 label \\\n", "# -o ../../allConstraintsAnalysisWRemoved2/TimedTypeConstraint/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".incorrect.tsv\\n\")\n", "\n", "# cnt += 1\n", "# 
except:\n", "# print(\"Something failed for prop:\",prop)\n", " " ] }, { "cell_type": "code", "execution_count": 93, "id": "veterinary-fault", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "52944ea021934d23b3d4ab3fb1f091f7", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/122 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "sns.lineplot(data=pd.Series(times)).set_title(\"Distribution of times (in s) taken for type constraint checks\")" ] }, { "cell_type": "markdown", "id": "intense-computer", "metadata": {}, "source": [ "## Value Type Constraint" ] }, { "cell_type": "markdown", "id": "animated-companion", "metadata": {}, "source": [ "### Understand Constraints File" ] }, { "cell_type": "code", "execution_count": 9, "id": "static-profit", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "dfValueType = pd.read_csv('../../constraintsOP/valuetypeConstraint/claims.type-constraints_all1.tsv',sep='\\t')" ] }, { "cell_type": "code", "execution_count": 10, "id": "worthy-malawi", "metadata": {}, "outputs": [], "source": [ "dfValueType = dfValueType.groupby(['node1','label']).node2.apply(lambda p: p.tolist()).reset_index()" ] }, { "cell_type": "code", "execution_count": 11, "id": "eleven-tiffany", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
0P1000P2308[Q1241356]
1P1000P2309[Q30208840]
2P1001P2308[Q20926517, Q2881272, Q2882257, Q3624078, Q389...
3P1001P2309[Q30208840]
4P1002P2308[Q2576663]
\n", "
" ], "text/plain": [ " node1 label node2\n", "0 P1000 P2308 [Q1241356]\n", "1 P1000 P2309 [Q30208840]\n", "2 P1001 P2308 [Q20926517, Q2881272, Q2882257, Q3624078, Q389...\n", "3 P1001 P2309 [Q30208840]\n", "4 P1002 P2308 [Q2576663]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfValueType.head()" ] }, { "cell_type": "code", "execution_count": 12, "id": "expired-stuff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['P2308', 'P2309', 'P2303', 'P2316', 'P6607', 'P2304'], dtype=object)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfValueType['label'].unique()" ] }, { "cell_type": "code", "execution_count": null, "id": "imposed-newsletter", "metadata": {}, "outputs": [], "source": [ "# Labels are full property IDs with the 'P' prefix (see the unique() output above),\n", "# so the filter must use 'P2316'; the bare '2316' matched no rows.\n", "dfValueType[dfValueType['label'] == 'P2316']" ] }, { "cell_type": "code", "execution_count": 14, "id": "answering-alabama", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
330P1659P2308[Q18616576]
331P1659P2309[Q21503252]
332P1659P2316[Q21502408]
\n", "
" ], "text/plain": [ " node1 label node2\n", "330 P1659 P2308 [Q18616576]\n", "331 P1659 P2309 [Q21503252]\n", "332 P1659 P2316 [Q21502408]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfValueType[dfValueType['node1'] == 'P1659']" ] }, { "cell_type": "code", "execution_count": 15, "id": "danish-blackberry", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
node1labelnode2
2031P991P2308[Q5, Q7210356]
2032P991P2309[Q21503252]
\n", "
" ], "text/plain": [ " node1 label node2\n", "2031 P991 P2308 [Q5, Q7210356]\n", "2032 P991 P2309 [Q21503252]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfValueType[dfValueType.node1 == 'P991']" ] }, { "cell_type": "markdown", "id": "digital-harvard", "metadata": {}, "source": [ "### Query Generator" ] }, { "cell_type": "code", "execution_count": 16, "id": "white-badge", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "080f8e771b7448de82088862b4330e8b", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/932 [00:00(node2), \" + parentFile + \": (node2)-[]->(nodex), P279star: (nodex)-[]->(par)' \\\n", " --where 'par in \" + str(parents).replace(\"'\",'\"') + \" \" + exceptionPart + \"' \\\n", " --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/const112_\" + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + typeVal + \"/claims.type-constraints.\" + parentTitle + \".\"+ prop +\".incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/\" + shellFileSuffix + str(fCnt) + \".txt;\\n\")\n", " \n", " cnt += 1\n", " except:\n", " print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 17, "id": "qualified-cursor", "metadata": 
{}, "outputs": [ { "data": { "text/plain": [ "904" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 18, "id": "simplified-cameroon", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,9):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/valueTypeConstraintValidator\"+str(i)+\".sh\")\n", " " ] }, { "cell_type": "markdown", "id": "spectacular-warner", "metadata": {}, "source": [ "### Analyze Violations" ] }, { "cell_type": "code", "execution_count": 20, "id": "valid-defense", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "38d78b0ecfdc40f596565c00c4b4fbd8", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "cf62ec681d004b5c84cbcfa4e5968788", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/216 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P85245264[../../allConstraintsAnalysisWRemoved2/valuety...0.000883
P85316004[../../allConstraintsAnalysisWRemoved2/valuety...0.002494
P23024791826[../../allConstraintsAnalysisWRemoved2/valuety...0.000542
P309275347[../../allConstraintsAnalysisWRemoved2/valuety...0.000928
P30961119310[../../allConstraintsAnalysisWRemoved2/valuety...0.000893
\n", "" ], "text/plain": [ " correct incorrect paths \\\n", "P852 4526 4 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P853 1600 4 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P2302 47918 26 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P3092 7534 7 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P3096 11193 10 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "\n", " violation_ratio \n", "P852 0.000883 \n", "P853 0.002494 \n", "P2302 0.000542 \n", "P3092 0.000928 \n", "P3096 0.000893 " ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valTypeConstDF.head()" ] }, { "cell_type": "code", "execution_count": 26, "id": "neural-trail", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P50080341961[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P610409808[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P254501378[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P26680179[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P7374044[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P3028015[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P2839015[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P3027013[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P2127012[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P538010[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P224106[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P442506[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P619105[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P653305[../../allConstraintsAnalysisWRemoved2/valuety...1.0
P653405[../../allConstraintsAnalysisWRemoved2/valuety...1.0
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P5008 0 341961 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P6104 0 9808 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P2545 0 1378 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P2668 0 179 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P7374 0 44 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P3028 0 15 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P2839 0 15 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P3027 0 13 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P2127 0 12 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P538 0 10 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P2241 0 6 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P4425 0 6 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P6191 0 5 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P6533 0 5 [../../allConstraintsAnalysisWRemoved2/valuety... \n", "P6534 0 5 [../../allConstraintsAnalysisWRemoved2/valuety... 
\n", "\n", " violation_ratio \n", "P5008 1.0 \n", "P6104 1.0 \n", "P2545 1.0 \n", "P2668 1.0 \n", "P7374 1.0 \n", "P3028 1.0 \n", "P2839 1.0 \n", "P3027 1.0 \n", "P2127 1.0 \n", "P538 1.0 \n", "P2241 1.0 \n", "P4425 1.0 \n", "P6191 1.0 \n", "P6533 1.0 \n", "P6534 1.0 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valTypeConstDF.sort_values(by=['violation_ratio','incorrect'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 27, "id": "cutting-polyester", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 904.000000\n", "mean 0.098485\n", "std 0.214803\n", "min 0.000000\n", "25% 0.001492\n", "50% 0.011225\n", "75% 0.063950\n", "max 1.000000\n", "Name: violation_ratio, dtype: float64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valTypeConstDF['violation_ratio'].describe()" ] }, { "cell_type": "code", "execution_count": 28, "id": "alert-receiver", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Value Type Constraint Violation Ratios')" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEICAYAAACwDehOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAaZElEQVR4nO3de5gdVZnv8e8PEgjIJUIiA0mgQUBAUYkB4vGZkQFULkKYERSO4TYR1ME5OuoAokdxxmHwGRVkxhuCQ7jIRRgxAh7lJhwdA4YBuXMIGMgFSLgkEK4G3vPHWr0omt3d1emuvbs7v8/z7KerVtWuelft6nr3Wqv23ooIzMzMANbqdABmZjZ8OCmYmVnhpGBmZoWTgpmZFU4KZmZWOCmYmVnhpDDKSApJ23Y6DkskfVTSr9q4v19IOrLGel35XBmzmvs5SdJZq/PcdpK0paSVktbudCwjhZPCMCPp/0j6xxblMyQ9urr/xIOM6a78j7VS0suSXqjMn9SmGDaSdLqkh/N+H8jzExrc5zmSvjaYbUTEBRHx/pr7O0rSb/pY/n1J57Yof4ekFyVtEhH7RsTswcTcYvt7SFpULYuIUyLiY0O5n7yvo/I5tlLS05L+IOmDA3j+Akl7V+J8OCI2iIiXhzrW0cpJYfiZDcyUpB7lhwMXRMSqdgcUEW/N/1gbAP8X+FT3fESc0vT+Ja0DXAu8FdgH2Ah4N/AEsFvT++8jrnYn6NnAX0t6Q4/yw4ErIuLJNsfTlN/lc2088F3gIknjOxrRmiQi/BhGD2A9YAXwF5WyNwIvAO8gXQR/BywHHgH+HVinsm4A2+bpXwMfqyw7CvhNZX4H4GrgSeA+4MM14ivbBO4EDqgsGws8DuwCdOVYjgWW5Fg/X1l3LeBE4AHSxf0SYJNe9vkx4DFggz7i2jHHthy4Cziwsuwc4DvAlcAzwE3Am/MyAacBS4GngTuAt+W4/wS8BKwEfp7XXwCcANwOvAiMqdTjGeBu4K/6OOYBfAK4P8f6nRzDjvk1fjnvb3kv9bwPOKIyv3Y+vjNavD5rAV8CHsr1OxfYOC/rfn3G5PmjgXtyHR4EPp7L3wA8D7yS41oJbAGcDJxfiePAfNyX5xh2rCxbAHw+H7MVwMXAuF7q1/N4rZ/j3DXPvxm4jnTOPA5cAIzPy87LcT6f4zy+RT23AOaQzvn5wDGVfe0GzMvnwWPAtzp9PejEo+MB+NHiRYEfAmdV5j8O3Jan3wVMzxejrvyP/JnKurWSQv5nX5gvBmNIF/LHgZ36ia160TkeuLiybAZwR57u/me8MO9rZ2AZsHde/mlgLjAZWBf4AXBhL/u8CJjdR0xj8z/4ScA6wJ6ki9tb8vJzeLVVMSZfSC7Kyz4A3EJ6V9p9cd688ryv9djXAuA2YAqwXi47JF9s1gI+Ajxb2UY55pXX54q8vy3zMdmn1bq91PWLwDWV+Q/kbYxt8fr8TT4u2wAbAP8JnNfj9em+WO5PuuAKeC/wHDA1L9sDWNQjjpPJSQHYPtf5ffm1OD7vd53KMbs5H6NNSOfsJ3qpXzkGpIR3HCkxvymXbZv3sy4wEbgROL3H67N3Zb5nPW8ktT7GAe/Mx27PvOx3wOF5egNgeqevBZ14uPtoeJoNHCxpXJ4/IpcREbdExNyIWBURC0gX0/euxj4+CCyIiP/I27oVuIx0gavrfGA/SRvl+cNJ79aqvhoRz0bEHcB/AIfl8k8AX4yIRRHxIukic3AvXTKbkloavZlO+ic+NSJeiojrSBfewyrr/DQibo7U/XYB6YIAqTWwIanVpIi4JyL62hfAGRGxMCKeB4iIn0TEkoh4JSIuJrUC+urWOjUilkfEw8D1lVjqOA94r6TJef4I4McR8acW636U9G73wYhYCXwBOLTVMY6IKyPigUhuAH4F/HnNmD4CXBkRV+c4vkFq8f6Pyjpn5GP0JPBz+q7zdEnLSS2nbwAzI2JpjnN+3s+LEbEM+BY1z39JU4D3ACd
ExAsRcRtwFukYQjoXtpU0ISJWRsTcOtsdbZwUhqGI+A3pXftBkt5MusD8GEDS9pKuyIPOTwOnAKsz2LoVsLuk5d0P0kXkzwYQ5xLgt8CHcp/vvqQLbtXCyvRDpHeL3fv/aWXf95C6TjZrsasngM37CGULYGFEvNJjX5Mq849Wpp8jJRFyAvl3UjfOUklnVpJcb6p1QtIRkm6r1OVt9P2atIyljpxIbiSNO20AHETqFmplC9Jx6PYQqaX0umMsaV9JcyU9meuwXz916HU/+XVYSI3j34u5ETGe1G06h0pykrSZpIskLc7n//kDjPPJiHimUlY9T2aRWj33Svr9QAa4RxMnheHrXNI7mJnALyPisVz+PeBeYLuI2IjUZdJzULrbs6Q+2W7VC/5C4IaIGF95bBARnxxgnLNzjIeQBggX91g+pTK9Jan/u3v/+/bY/7gWzwe4BvhAiwHWbkuAKZKq5/OWQKttvU5EnBER7wJ2Il0U/qF7UW9P6Z6QtBWpu+9TwKb5YnYnvb8mfYZSc73ZpFbZh4A/RsQtvay3hJR8u20JrCL1lxeS1iW1Er8BbJbrcBWv1qG/uF6zn3yTxBRqHv/e5NbNJ4HDJe2Si0/J8eycz/+ZvPZY9xXrEmATSRtWysp5EhH3R8RhwJuArwOX9nHOjVpOCsPXucDewDHkrqNsQ9JA2EpJO5D+aXpzG+lulfXzZxdmVZZdAWwv6XBJY/NjV0k7DjDOy4GppDGCVu9Y/3fe/1tJ4xcX5/LvA/+cL6pImihpRi/7OI+URC6TtIOktSRtmu+V3480cPwccHyuxx7AAaSxiD7lOu8uaSwpib5AGqyEdPHcpp9NvIF0IVqWt3c0qaWwOh4DJue7rfpyGeli9lVee270dCHw95K2zq2KU0hjQD3vYFuH1Ee/DFglaV+gehvtY8CmkjbuZT+XAPtL2isfx8+RBuH/q5969Ct3N50FfDkXbUgaRF4haRKvJvBqrC1fs4hYmGP6F0njJL2d9D9xPoCkmZIm5pbO8vy0V1ptazRzUhim8njBf5EuOnMqiz4P/E/SQOoPefUi28pppEG6x0gXj9K1k5vQ7wcOJb2DepT07mjdAcb5POkitTVpILOnG0iDjtcC34iI7g9yfTvX61eSniENOu/eyz5eJCXIe0l3Sz1NGricANwUES+RksC+pG6375Lu0Lm3RhU2Ih3Hp0hdCU8A/5qXnQ3slLuFLu8ltruBb5IGKR8jDaj/tsZ+W7mOdAfPo5Ie722liHiWdMwn8/ruuqofkRLqjcAfSQnv71ps7xngf5Eu7k+Rzq85leX3khLMg/lYbNHj+feR3rH/G+n4H0C6K+2lfupb1+mksau3kxLhVNJdTFfy+nPuX4Av5Tg/32Jbh5EGn5cAPwW+EhHX5GX7AHdJWkk6Pw/tHjdakyjCP7JjgyPpy8D2ETGzUtZFuhCNbfHO1MyGqbZ/OtZGF0mbkJrgh3c6FjMbPHcf2WqTdAypr/8XEXFjp+Mxs8Fz95GZmRVuKZiZWTGixxQmTJgQXV1dnQ7DzGxEueWWWx6PiImtlo3opNDV1cW8efM6HYaZ2Ygi6aHelrn7yMzMCicFMzMrnBTMzKxwUjAzs8JJwczMCicFMzMrnBTMzKxwUjAzs8JJwczMihH9iebB6DrxyjK94NT9OxiJmdnw4ZaCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVjScFSWtLulXSFXl+a0k3SZov6WJJ6+TydfP8/Ly8q+nYzMzstdrRUvg0cE9l/uvAaRGxLfAUMCuXzwKeyuWn5fXMzKyNGk0KkiYD+wNn5XkBewKX5lVmAwfl6Rl5nrx8r7y+mZm1SdMthdOB44FX8vymwPKIWJXnFwGT8vQkYCFAXr4ir/8ako6VNE/SvGXLljUYupnZmqexpCDpg8DSiLhlKLcbEWdGxLS
ImDZx4sSh3LSZ2RqvyZ/jfA9woKT9gHHARsC3gfGSxuTWwGRgcV5/MTAFWCRpDLAx8ESD8ZmZWQ+NtRQi4gsRMTkiuoBDgesi4qPA9cDBebUjgZ/l6Tl5nrz8uoiIpuIzM7PX68TnFE4APitpPmnM4OxcfjawaS7/LHBiB2IzM1ujNdl9VETEr4Ff5+kHgd1arPMCcEg74jEzs9b8iWYzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrHBSMDOzwknBzMwKJwUzMyucFMzMrGgsKUgaJ+lmSX+QdJekr+byrSXdJGm+pIslrZPL183z8/PyrqZiMzOz1ppsKbwI7BkR7wDeCewjaTrwdeC0iNgWeAqYldefBTyVy0/L65mZWRs1lhQiWZlnx+ZHAHsCl+by2cBBeXpGnicv30uSmorPzMxer9ExBUlrS7oNWApcDTwALI+IVXmVRcCkPD0JWAiQl68ANm2xzWMlzZM0b9myZU2Gb2a2xmk0KUTEyxHxTmAysBuwwxBs88yImBYR0yZOnDjYzZmZWUVb7j6KiOXA9cC7gfGSxuRFk4HFeXoxMAUgL98YeKId8ZmZWdLk3UcTJY3P0+sB7wPuISWHg/NqRwI/y9Nz8jx5+XUREU3FZ2Zmrzem/1VW2+bAbElrk5LPJRFxhaS7gYskfQ24FTg7r382cJ6k+cCTwKENxmZmZi00lhQi4nZglxblD5LGF3qWvwAc0lQ8ZmbWP3+i2czMCicFMzMrnBTMzKxwUjAzs8JJwczMilpJQdLOTQdiZmadV7el8N38Ndh/K2njRiMyM7OOqZUUIuLPgY+SvobiFkk/lvS+RiMzM7O2qz2mEBH3A18CTgDeC5wh6V5Jf91UcGZm1l51xxTeLuk00ncX7QkcEBE75unTGozPzMzaqO7XXPwbcBZwUkQ8310YEUskfamRyMzMrO3qJoX9gecj4mUASWsB4yLiuYg4r7HozMysreqOKVwDrFeZXz+XmZnZKFI3KYyr/N4yeXr9ZkIyM7NOqZsUnpU0tXtG0ruA5/tY38zMRqC6YwqfAX4iaQkg4M+AjzQVlJmZdUatpBARv5e0A/CWXHRfRPypubDMzKwTBvLLa7sCXfk5UyUREec2EpWZmXVEraQg6TzgzcBtwMu5OAAnBTOzUaRuS2EasFNERJPBmJlZZ9W9++hO0uCymZmNYnVbChOAuyXdDLzYXRgRBzYSlZmZdUTdpHByk0GYmdnwUPeW1BskbQVsFxHXSFofWLvZ0MzMrN3qfnX2McClwA9y0STg8oZiMjOzDqk70Hwc8B7gaSg/uPOmpoIyM7POqJsUXoyIl7pnJI0hfU7BzMxGkbpJ4QZJJwHr5d9m/gnw8+bCMjOzTqibFE4ElgF3AB8HriL9XrOZmY0ide8+egX4YX6YmdkoVfe7j/5IizGEiNhmyCMyM7OOGch3H3UbBxwCbDL04ZiZWSfVGlOIiCcqj8URcTqwf7OhmZlZu9XtPppamV2L1HIYyG8xmJnZCFD3wv7NyvQqYAHw4SGPxszMOqru3Ud/2XQgZmbWeXW7jz7b1/KI+NbQhGNmZp00kLuPdgXm5PkDgJuB+5sIyszMOqNuUpgMTI2IZwAknQxcGREzmwrMzMzar+7XXGwGvFSZfymX9UrSFEnXS7pb0l2SPp3LN5F0taT789835nJJOkPSfEm397jjyczM2qBuUjgXuFnSybmVcBMwu5/nrAI+FxE7AdOB4yTtRPoepWsjYjvg2jwPsC+wXX4cC3xvIBUxM7PBq/vhtX8Gjgaeyo+jI+K
Ufp7zSET8d55+BriH9OM8M3g1ocwGDsrTM4BzI5kLjJe0+cCqY2Zmg1G3pQCwPvB0RHwbWCRp67pPlNQF7EJqYWwWEY/kRY/yajfUJGBh5WmLclnPbR0raZ6kecuWLRtA+GZm1p+6P8f5FeAE4Au5aCxwfs3nbgBcBnwmIp6uLouIYIA/1hMRZ0bEtIiYNnHixIE81czM+lG3pfBXwIHAswARsQTYsL8nSRpLSggXRMR/5uLHuruF8t+luXwxMKXy9Mm5zMzM2qRuUnip+q5e0hv6e4IkAWcD9/T4cNsc4Mg8fSTws0r5EfkupOnAiko3k5mZtUHdzylcIukHpMHfY4C/of8f3HkPcDhwh6TbctlJwKl5e7OAh3j1O5SuAvYD5gPPkQa2zcysjfpNCvkd/8XADsDTwFuAL0fE1X09LyJ+A6iXxXu1WD+A4/qLx8zMmtNvUoiIkHRVROwM9JkIzMxsZKs7pvDfknZtNBIzM+u4umMKuwMzJS0g3YEkUiPi7U0FZmZm7ddnUpC0ZUQ8DHygTfGYmVkH9ddSuJz07agPSbosIj7UhpjMzKxD+htTqN49tE2TgZiZWef1lxSil2kzMxuF+us+eoekp0kthvXyNLw60LxRo9GZmVlb9ZkUImLtdgViZmadN5CvzjYzs1HOScHMzAonBTMzK5wUzMyscFIwM7PCScHMzAonBTMzK5wUzMyscFIwM7PCScHMzAonBTMzK5wUzMyscFIwM7PCScHMzAonBTMzK5wUzMyscFIwM7PCScHMzAonBTMzK5wUzMyscFIwM7PCScHMzAonBTMzK5wUzMyscFIwM7PCScHMzAonBTMzK5wUzMyscFIwM7OisaQg6UeSlkq6s1K2iaSrJd2f/74xl0vSGZLmS7pd0tSm4jIzs9412VI4B9inR9mJwLURsR1wbZ4H2BfYLj+OBb7XYFxmZtaLxpJCRNwIPNmjeAYwO0/PBg6qlJ8byVxgvKTNm4rNzMxaa/eYwmYR8UiefhTYLE9PAhZW1luUy15H0rGS5kmat2zZsuYiNTNbA43p1I4jIiTFajzvTOBMgGnTpg34+a10nXhlmV5w6v5DsUkzsxGp3S2Fx7q7hfLfpbl8MTClst7kXGZmZm3U7qQwBzgyTx8J/KxSfkS+C2k6sKLSzWRmZm3SWPeRpAuBPYAJkhYBXwFOBS6RNAt4CPhwXv0qYD9gPvAccHRTcZmZWe8aSwoRcVgvi/ZqsW4AxzUVi5mZ1eNPNJuZWeGkYGZmhZOCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVTgpmZlY4KZiZWeGkYGZmhZOCmZkVTgpmZlaM6XQAw03XiVeW6QWn7t/BSMzM2s8tBTMzK5wUzMyscFIwM7PCYwp98PiCma1p3FIwM7PCLYWaqq0GcMvBzEYntxTMzKxwUjAzs8JJwczMCicFMzMrPNC8mny7qpm1WzuuO24pmJlZ4aRgZmaFu4+GgLuSzGy0cEvBzMwKtxQa5BaEmY00bimYmVnhlsIQ6/kdSf2VV1sQQ7XOSDWaWlb+riwbqYZVUpC0D/BtYG3grIg4tcMhNa63i/xA1xno+r0lGl+8rJvPizXTsEkKktYGvgO8D1gE/F7SnIi4u7ORjQwDTRx1ttPbhaCvfdV5zkAvML09d6Db9EXOWvF58VrDJikAuwHzI+JBAEkXATMAJ4UGNNFCWZ19DyZBDFUMvamTgIbyGA00pqHa12DqM9DuzKF8QzGYYz+YNxFNJJHh1N2oiOjYzqskHQzsExEfy/OHA7tHxKd6rHcscGyefQtw32rucgLw+Go+d6RyndcMrvOaYTB13ioiJrZaMJxaCrVExJnAmYPdjqR5ETFtCEIaMVznNYPrvGZoqs7D6ZbUxcCUyvzkXGZmZm0ynJLC74HtJG0taR3gUGBOh2MyM1ujDJvuo4h
YJelTwC9Jt6T+KCLuanCXg+6CGoFc5zWD67xmaKTOw2ag2czMOm84dR+ZmVmHOSmYmVkx6pOCpH0k3SdpvqQTWyxfV9LFeflNkro6EOaQqlHnz0q6W9Ltkq6VtFUn4hxK/dW5st6HJIWkEX/7Yp06S/pwfq3vkvTjdsc41Gqc21tKul7Srfn83q8TcQ4VST+StFTSnb0sl6Qz8vG4XdLUQe80IkbtgzRg/QCwDbAO8Adgpx7r/C3w/Tx9KHBxp+NuQ53/Elg/T39yTahzXm9D4EZgLjCt03G34XXeDrgVeGOef1On425Dnc8EPpmndwIWdDruQdb5L4CpwJ29LN8P+AUgYDpw02D3OdpbCuWrMyLiJaD7qzOqZgCz8/SlwF6S1MYYh1q/dY6I6yPiuTw7l/SZkJGszusM8E/A14EX2hlcQ+rU+RjgOxHxFEBELG1zjEOtTp0D2ChPbwwsaWN8Qy4ibgSe7GOVGcC5kcwFxkvafDD7HO1JYRKwsDK/KJe1XCciVgErgE3bEl0z6tS5ahbpncZI1m+dc7N6SkQ084VO7Vfndd4e2F7SbyXNzd9CPJLVqfPJwExJi4CrgL9rT2gdM9D/934Nm88pWPtJmglMA97b6ViaJGkt4FvAUR0Opd3GkLqQ9iC1Bm+UtHNELO9kUA07DDgnIr4p6d3AeZLeFhGvdDqwkWK0txTqfHVGWUfSGFKT84m2RNeMWl8XImlv4IvAgRHxYptia0p/dd4QeBvwa0kLSH2vc0b4YHOd13kRMCci/hQRfwT+HylJjFR16jwLuAQgIn4HjCN9cdxoNeRfDzTak0Kdr86YAxyZpw8Gros8gjNC9VtnSbsAPyAlhJHezwz91DkiVkTEhIjoiogu0jjKgRExrzPhDok65/blpFYCkiaQupMebGOMQ61OnR8G9gKQtCMpKSxra5TtNQc4It+FNB1YERGPDGaDo7r7KHr56gxJ/wjMi4g5wNmkJuZ80oDOoZ2LePBq1vlfgQ2An+Qx9Ycj4sCOBT1INes8qtSs8y+B90u6G3gZ+IeIGLGt4Jp1/hzwQ0l/Txp0Pmokv8mTdCEpsU/I4yRfAcYCRMT3SeMm+wHzgeeAowe9zxF8vMzMbIiN9u4jMzMbACcFMzMrnBTMzKxwUjAzs8JJwczMCicFMzMrnBTMzKz4/y3XvwAqR1ZgAAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "valTypeConstDF['violation_ratio'].plot.hist(bins=100).set_title(\"Value Type Constraint Violation Ratios\")" ] }, { "cell_type": "code", "execution_count": 29, "id": "italian-motel", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Value Type Constraint Violation Ratios (<=0.04)')" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "valTypeConstDF[valTypeConstDF['violation_ratio'] <= 0.04].violation_ratio.plot.hist(bins=100).set_title(\"Value Type Constraint Violation Ratios (<=0.04)\")" ] }, { "cell_type": "code", "execution_count": null, "id": "prescription-ceramic", "metadata": {}, "outputs": [], "source": [ "# Compare against the mean computed from the data. The previous fixed cutoff (3.950680) is above\n", "# the maximum ratio reported by describe() (1.0), so it always counted 0 constraints.\n", "mean_violation_ratio = valTypeConstDF['violation_ratio'].mean()\n", "print(f\"No. of constraints whose violation ratio is greater than mean :{(valTypeConstDF['violation_ratio'] > mean_violation_ratio).sum()}/{len(valTypeConstDF)}\")" ] }, { "cell_type": "code", "execution_count": 31, "id": "quiet-gardening", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# valTypeConstDF.sort_values(by=['violation_ratio'],ascending=False).head().paths.values" ] }, { "cell_type": "code", "execution_count": 32, "id": "documentary-pipeline", "metadata": {}, "outputs": [], "source": [ "# !head ../../allConstraintsAnalysisWRemoved2/typeConstraint/normal/claims.type-constraints.instanceOf.P7535.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": 33, "id": "tutorial-mineral", "metadata": {}, "outputs": [], "source": [ "for key1 in valueTypeConstViolations.keys():\n", " valueTypeConstViolations[key1]['correct'] = valueTypeConstViolations[key1]['instanceOf']['correct'] + valueTypeConstViolations[key1]['subclass']['correct'] + valueTypeConstViolations[key1]['instanceOfOrSubclass']['correct']\n", " valueTypeConstViolations[key1]['incorrect'] = valueTypeConstViolations[key1]['instanceOf']['incorrect'] + valueTypeConstViolations[key1]['subclass']['incorrect'] + valueTypeConstViolations[key1]['instanceOfOrSubclass']['incorrect']\n", " valueTypeConstViolations[key1]['VR'] = valueTypeConstViolations[key1]['incorrect'] / (valueTypeConstViolations[key1]['correct'] + valueTypeConstViolations[key1]['incorrect'])\n", " " ] }, { "cell_type": "code",
"execution_count": 34, "id": "satellite-concern", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/plain": [ "{'mandatory': {'instanceOf': {'correct': 11564885, 'incorrect': 8245},\n", " 'subclass': {'correct': 55983, 'incorrect': 28},\n", " 'instanceOfOrSubclass': {'correct': 13090, 'incorrect': 137},\n", " 'propCount': 108,\n", " 'correct': 11633958,\n", " 'incorrect': 8410,\n", " 'VR': 0.0007223616363956198},\n", " 'suggestion': {'instanceOf': {'correct': 46189, 'incorrect': 659},\n", " 'subclass': {'correct': 127, 'incorrect': 20},\n", " 'instanceOfOrSubclass': {'correct': 0, 'incorrect': 0},\n", " 'propCount': 5,\n", " 'correct': 46316,\n", " 'incorrect': 679,\n", " 'VR': 0.01444834556867752},\n", " 'normal': {'instanceOf': {'correct': 94112173, 'incorrect': 842434},\n", " 'subclass': {'correct': 4674914, 'incorrect': 9777},\n", " 'instanceOfOrSubclass': {'correct': 77686561, 'incorrect': 289299},\n", " 'propCount': 791,\n", " 'correct': 176473648,\n", " 'incorrect': 1141510,\n", " 'VR': 0.006426872643381034}}" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "valueTypeConstViolations" ] }, { "cell_type": "markdown", "id": "traditional-shakespeare", "metadata": {}, "source": [ "### Find out time required" ] }, { "cell_type": "code", "execution_count": 78, "id": "spoken-symphony", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "878ab763f4fa4cb9a540c8bf86ea76ec", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/297 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "sns.lineplot(data=pd.Series(times)).set_title(\"Distribution of times (in s) taken for value type constraint checks\")" ] }, { "cell_type": "markdown", "id": "motivated-sympathy", "metadata": {}, "source": [ "## Item Requires Statement Constraint" ] }, { "cell_type": "markdown", "id": 
"chubby-glass", "metadata": {}, "source": [ "### Understand Constraints File" ] }, { "cell_type": "code", "execution_count": 35, "id": "funny-batch", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import math\n", "dfItemRequires = pd.read_csv('../../constraintsOP/itemRequiresConstraint/claims.type-constraints_all.tsv',sep='\\t')" ] }, { "cell_type": "code", "execution_count": 36, "id": "original-expression", "metadata": {}, "outputs": [], "source": [ "dfItemRequires = dfItemRequires.groupby(['id','node1','label']).node2.apply(lambda p: p.tolist()).reset_index()" ] }, { "cell_type": "code", "execution_count": 37, "id": "adequate-symphony", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['P2306', 'P2305', 'P2316', 'P2304', 'P2303', 'P6607', 'P4155',\n", " 'P31', 'P2916', 'P4680', 'P2308'], dtype=object)" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires['label'].unique()" ] }, { "cell_type": "code", "execution_count": 38, "id": "infrared-canal", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "P2306 7182\n", "P2305 2540\n", "P2316 2523\n", "P2303 422\n", "P6607 14\n", "P2304 14\n", "P2916 5\n", "P4680 2\n", "P4155 1\n", "P2308 1\n", "P31 1\n", "Name: label, dtype: int64" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires['label'].value_counts()" ] }, { "cell_type": "code", "execution_count": 39, "id": "focused-karen", "metadata": {}, "outputs": [], "source": [ "#Reference: https://stackoverflow.com/a/17298454\n", "# dfItemRequires.pivot_table('node2', ['node1', 'id'], 'label')\n", "dfItemRequires = dfItemRequires.pivot(index=['node1','id'], columns='label', values='node2')" ] }, { "cell_type": "code", "execution_count": 40, "id": "private-boundary", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2304P2305P2306P2308P2316P2916P31P4155P4680P6607
node1id
P1006P1006-P2302-Q21503247-0451ef47-0NaNNaNNaN[P214]NaNNaNNaNNaNNaNNaNNaN
P1010P1010-P2302-Q21503247-56183614-0NaNNaNNaN[P31]NaNNaNNaNNaNNaNNaNNaN
P1010-P2302-Q21503247-fd256eaf-0NaNNaN[Q794][P17]NaN[Q21502408]NaNNaNNaNNaNNaN
P1015P1015-P2302-Q21503247-20e3bfc5-0NaNNaNNaN[P31]NaNNaNNaNNaNNaNNaNNaN
P1017P1017-P2302-Q21503247-bbac2ce3-0NaNNaNNaN[P214]NaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ "label P2303 P2304 P2305 P2306 P2308 \\\n", "node1 id \n", "P1006 P1006-P2302-Q21503247-0451ef47-0 NaN NaN NaN [P214] NaN \n", "P1010 P1010-P2302-Q21503247-56183614-0 NaN NaN NaN [P31] NaN \n", " P1010-P2302-Q21503247-fd256eaf-0 NaN NaN [Q794] [P17] NaN \n", "P1015 P1015-P2302-Q21503247-20e3bfc5-0 NaN NaN NaN [P31] NaN \n", "P1017 P1017-P2302-Q21503247-bbac2ce3-0 NaN NaN NaN [P214] NaN \n", "\n", "label P2316 P2916 P31 P4155 P4680 \\\n", "node1 id \n", "P1006 P1006-P2302-Q21503247-0451ef47-0 NaN NaN NaN NaN NaN \n", "P1010 P1010-P2302-Q21503247-56183614-0 NaN NaN NaN NaN NaN \n", " P1010-P2302-Q21503247-fd256eaf-0 [Q21502408] NaN NaN NaN NaN \n", "P1015 P1015-P2302-Q21503247-20e3bfc5-0 NaN NaN NaN NaN NaN \n", "P1017 P1017-P2302-Q21503247-bbac2ce3-0 NaN NaN NaN NaN NaN \n", "\n", "label P6607 \n", "node1 id \n", "P1006 P1006-P2302-Q21503247-0451ef47-0 NaN \n", "P1010 P1010-P2302-Q21503247-56183614-0 NaN \n", " P1010-P2302-Q21503247-fd256eaf-0 NaN \n", "P1015 P1015-P2302-Q21503247-20e3bfc5-0 NaN \n", "P1017 P1017-P2302-Q21503247-bbac2ce3-0 NaN " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires.head()" ] }, { "cell_type": "code", "execution_count": 41, "id": "conceptual-schedule", "metadata": {}, "outputs": [], "source": [ "dfItemRequires = dfItemRequires.droplevel(1)" ] }, { "cell_type": "code", "execution_count": 42, "id": "third-hayes", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2304P2305P2306P2308P2316P2916P31P4155P4680P6607
node1
P1006NaNNaNNaN[P214]NaNNaNNaNNaNNaNNaNNaN
P1010NaNNaNNaN[P31]NaNNaNNaNNaNNaNNaNNaN
P1010NaNNaN[Q794][P17]NaN[Q21502408]NaNNaNNaNNaNNaN
P1015NaNNaNNaN[P31]NaNNaNNaNNaNNaNNaNNaN
P1017NaNNaNNaN[P214]NaNNaNNaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ "label P2303 P2304 P2305 P2306 P2308 P2316 P2916 P31 P4155 P4680 \\\n", "node1 \n", "P1006 NaN NaN NaN [P214] NaN NaN NaN NaN NaN NaN \n", "P1010 NaN NaN NaN [P31] NaN NaN NaN NaN NaN NaN \n", "P1010 NaN NaN [Q794] [P17] NaN [Q21502408] NaN NaN NaN NaN \n", "P1015 NaN NaN NaN [P31] NaN NaN NaN NaN NaN NaN \n", "P1017 NaN NaN NaN [P214] NaN NaN NaN NaN NaN NaN \n", "\n", "label P6607 \n", "node1 \n", "P1006 NaN \n", "P1010 NaN \n", "P1010 NaN \n", "P1015 NaN \n", "P1017 NaN " ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires.head()" ] }, { "cell_type": "markdown", "id": "shaped-companion", "metadata": {}, "source": [ "However, there is one anomaly where the property does not have a co-dependency constraint associated with it, but still has a link to this constraint." ] }, { "cell_type": "code", "execution_count": 43, "id": "indian-journal", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2304P2305P2306P2308P2316P2916P31P4155P4680P6607
node1
P5447NaNNaN[Q55426051][P5446]NaNNaNNaNNaNNaN[Q46466783]NaN
P5448NaNNaN[Q55426051][P5446]NaNNaNNaNNaNNaN[Q46466783]NaN
\n", "
" ], "text/plain": [ "label P2303 P2304 P2305 P2306 P2308 P2316 P2916 P31 P4155 \\\n", "node1 \n", "P5447 NaN NaN [Q55426051] [P5446] NaN NaN NaN NaN NaN \n", "P5448 NaN NaN [Q55426051] [P5446] NaN NaN NaN NaN NaN \n", "\n", "label P4680 P6607 \n", "node1 \n", "P5447 [Q46466783] NaN \n", "P5448 [Q46466783] NaN " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires[dfItemRequires['P4680'].apply(lambda p: type(p) == list)]" ] }, { "cell_type": "code", "execution_count": 44, "id": "discrete-template", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2304P2305P2306P2308P2316P2916P31P4155P4680P6607
node1
P1010NaNNaN[Q794][P17]NaN[Q21502408]NaNNaNNaNNaNNaN
P1045NaNNaN[Q20808382, Q28218485, Q3044918][P39]NaNNaNNaNNaNNaNNaNNaN
P1045NaNNaN[Q82955][P106]NaNNaNNaNNaNNaNNaNNaN
P1045NaNNaN[Q5][P31]NaN[Q21502408]NaNNaNNaNNaNNaN
P1045NaNNaN[Q142, Q71084][P27]NaNNaNNaNNaNNaNNaNNaN
....................................
P980NaNNaN[Q34][P17]NaN[Q21502408]NaNNaNNaNNaNNaN
P981NaNNaN[Q55][P17]NaN[Q21502408]NaNNaNNaNNaNNaN
P981NaNNaN[Q1852859][P31]NaNNaNNaNNaNNaNNaNNaN
P988NaNNaN[Q928][P17]NaN[Q21502408]NaNNaNNaNNaNNaN
P990[Q49678, Q853715]NaN[Q5][P31]NaNNaNNaNNaNNaNNaNNaN
\n", "

2540 rows × 11 columns

\n", "
" ], "text/plain": [ "label P2303 P2304 P2305 P2306 \\\n", "node1 \n", "P1010 NaN NaN [Q794] [P17] \n", "P1045 NaN NaN [Q20808382, Q28218485, Q3044918] [P39] \n", "P1045 NaN NaN [Q82955] [P106] \n", "P1045 NaN NaN [Q5] [P31] \n", "P1045 NaN NaN [Q142, Q71084] [P27] \n", "... ... ... ... ... \n", "P980 NaN NaN [Q34] [P17] \n", "P981 NaN NaN [Q55] [P17] \n", "P981 NaN NaN [Q1852859] [P31] \n", "P988 NaN NaN [Q928] [P17] \n", "P990 [Q49678, Q853715] NaN [Q5] [P31] \n", "\n", "label P2308 P2316 P2916 P31 P4155 P4680 P6607 \n", "node1 \n", "P1010 NaN [Q21502408] NaN NaN NaN NaN NaN \n", "P1045 NaN NaN NaN NaN NaN NaN NaN \n", "P1045 NaN NaN NaN NaN NaN NaN NaN \n", "P1045 NaN [Q21502408] NaN NaN NaN NaN NaN \n", "P1045 NaN NaN NaN NaN NaN NaN NaN \n", "... ... ... ... ... ... ... ... \n", "P980 NaN [Q21502408] NaN NaN NaN NaN NaN \n", "P981 NaN [Q21502408] NaN NaN NaN NaN NaN \n", "P981 NaN NaN NaN NaN NaN NaN NaN \n", "P988 NaN [Q21502408] NaN NaN NaN NaN NaN \n", "P990 NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[2540 rows x 11 columns]" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires[dfItemRequires['P2305'].apply(lambda p: type(p) == list)]" ] }, { "cell_type": "markdown", "id": "forced-christmas", "metadata": {}, "source": [ "### Query Generator" ] }, { "cell_type": "markdown", "id": "acquired-floor", "metadata": {}, "source": [ "#### Version 1 - Mandatory + Suggestion + Normal" ] }, { "cell_type": "code", "execution_count": 29, "id": "turkish-establishment", "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "28d37088e10e43daa81f2da30f5d8be3", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3147 [00:00(node2)\"]\n", " commandWhere = \" --where '\"\n", " commandWhere = []\n", " \n", " # Version 1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", " constSet = mandatory + suggestion + 
normal\n", " \n", " if len(constSet) == 0:\n", " continue\n", " excptns = set()\n", " for (rowNo, constraint) in enumerate(constSet):\n", " prop2 = constraint['P2306']\n", " \n", " if type(prop2) != list:\n", " continue\n", " prop2 = prop2[0]\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv\")):\n", " print(f\"File: ../../propertiesSplitWRemoved2/claims.{prop2}.tsv does not exist\")\n", " continue\n", " \n", " commandOtherFiles += \"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv \"\n", " if type(constraint['P2303']) == list: # Exceptions present\n", " if len(excptns) == 0:\n", " excptns = set(constraint['P2303'])\n", " else:\n", " excptns = excptns.intersection(set(constraint['P2303']))\n", " if type(constraint['P2305']) == list:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->(node2_{prop2})\"]\n", " commandWhere += [\"node2_\" + prop2 + \" in \" + str(list(constraint['P2305'])).replace(\"'\",'\"')]\n", " else:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->()\"]\n", "# print(commandMatchMoreFiles)\n", " if len(commandWhere) == 0:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\"\n", " else:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\" + \" --where '\"+(\" and \".join(commandWhere))+\"'\"\n", " \n", " if cnt % 100 == 0:\n", " if fOP:\n", " fOP.close()\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " \n", " if len(excptns) == 0:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk 
--debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", " fOP.write(command)\n", " else:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv\\n\"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 'node1 in \" + str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + 
subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt; \\n\"\n", " fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 30, "id": "peripheral-herald", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "534" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 31, "id": "incorporated-logistics", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "6" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fCnt" ] }, { 
"cell_type": "code", "execution_count": 123, "id": "welcome-welding", "metadata": {}, "outputs": [], "source": [ "# from tqdm.notebook import tqdm\n", "# import os.path\n", "# import os\n", "# folderName = 'codependencyConstraint'\n", "# for prop in tqdm(dfItemRequires.index.unique()):\n", "# for subFolderName in ['Mand_Sugg_Normal', 'Mand_Normal', 'Mand', 'Normal']:\n", "# if os.path.isfile(\"../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv\") and \\\n", "# os.path.isfile(\"../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv\"):\n", "# os.system(\"kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", "# ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv \\\n", "# -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_w_exceptions.tsv\")" ] }, { "cell_type": "code", "execution_count": 32, "id": "optimum-blowing", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,28):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/codepConst_MSN_Validator_new2_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "indoor-verse", "metadata": {}, "source": [ "#### Version 2 - Mandatory + Normal" ] }, { "cell_type": "code", "execution_count": 33, "id": "furnished-paradise", "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2c4f963cc8324623abcb436adbc83b2b", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3147 [00:00(node2)\"]\n", " commandWhere = \" --where '\"\n", " commandWhere = []\n", " \n", " # Version 2 
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", " constSet = mandatory + normal\n", " \n", " if len(constSet) == 0:\n", " continue\n", " excptns = set()\n", " for (rowNo, constraint) in enumerate(constSet):\n", " prop2 = constraint['P2306']\n", " \n", " if type(prop2) != list:\n", " continue\n", " prop2 = prop2[0]\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv\")):\n", " print(f\"File: ../../propertiesSplitWRemoved2/claims.{prop2}.tsv does not exist\")\n", " continue\n", " \n", " commandOtherFiles += \"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv \"\n", " if type(constraint['P2303']) == list: # Exceptions present\n", " if len(excptns) == 0:\n", " excptns = set(constraint['P2303'])\n", " else:\n", " excptns = excptns.intersection(set(constraint['P2303']))\n", " if type(constraint['P2305']) == list:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->(node2_{prop2})\"]\n", " commandWhere += [\"node2_\" + prop2 + \" in \" + str(list(constraint['P2305'])).replace(\"'\",'\"')]\n", " else:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->()\"]\n", "# print(commandMatchMoreFiles)\n", " if len(commandWhere) == 0:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\"\n", " else:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\" + \" --where '\"+(\" and \".join(commandWhere))+\"'\"\n", " \n", " if cnt % 100 == 0:\n", " if fOP:\n", " fOP.close()\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " \n", " if len(excptns) == 0:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", 
" --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", " fOP.write(command)\n", " else:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv\\n\"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 'node1 in \" + 
str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt; \\n\"\n", " fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 34, "id": "searching-individual", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "475" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 44, "id": "silver-clarity", "metadata": {}, "outputs": [], "source": [ "# import os\n", 
"# for i in range(1,26):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/codepConst_MN_Validator_new2_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "prescription-access", "metadata": {}, "source": [ "#### Version 3 - Mandatory" ] }, { "cell_type": "code", "execution_count": 35, "id": "married-porter", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "be961871162c4726aefe3f576c1abcbe", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3147 [00:00(node2)\"]\n", " commandWhere = \" --where '\"\n", " commandWhere = []\n", " \n", " # Version 3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", " constSet = mandatory\n", " \n", " if len(constSet) == 0:\n", " continue\n", " excptns = set()\n", " for (rowNo, constraint) in enumerate(constSet):\n", " prop2 = constraint['P2306']\n", " \n", " if type(prop2) != list:\n", " continue\n", " prop2 = prop2[0]\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv\")):\n", " print(f\"File: ../../propertiesSplitWRemoved2/claims.{prop2}.tsv does not exist\")\n", " continue\n", " \n", " commandOtherFiles += \"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv \"\n", " if type(constraint['P2303']) == list: # Exceptions present\n", " if len(excptns) == 0:\n", " excptns = set(constraint['P2303'])\n", " else:\n", " excptns = excptns.intersection(set(constraint['P2303']))\n", " if type(constraint['P2305']) == list:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->(node2_{prop2})\"]\n", " commandWhere += [\"node2_\" + prop2 + \" in \" + str(list(constraint['P2305'])).replace(\"'\",'\"')]\n", " else:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->()\"]\n", "# print(commandMatchMoreFiles)\n", " if len(commandWhere) == 0:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\"\n", " 
else:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\" + \" --where '\"+(\" and \".join(commandWhere))+\"'\"\n", " \n", " if cnt % 100 == 0:\n", " if fOP:\n", " fOP.close()\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " \n", " if len(excptns) == 0:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", " fOP.write(command)\n", " else:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + 
\"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv\\n\"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 'node1 in \" + str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv ) } 2>> 
../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt; \\n\"\n", " fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 36, "id": "according-blackberry", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "79" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 45, "id": "extraordinary-drawing", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,12):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/codepConst_M_Validator_new2_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "subsequent-brown", "metadata": {}, "source": [ "#### Version 4 - Normal" ] }, { "cell_type": "code", "execution_count": 38, "id": "operational-migration", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1e24472c9c45421fb77d68bd305ccfa7", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3147 [00:00(node2)\"]\n", " commandWhere = \" --where '\"\n", " commandWhere = []\n", " \n", " # Version 4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", " constSet = normal\n", " \n", " if len(constSet) == 0:\n", " continue\n", " excptns = set()\n", " for (rowNo, constraint) in enumerate(constSet):\n", " prop2 = constraint['P2306']\n", " \n", " if type(prop2) != list:\n", " continue\n", " prop2 = prop2[0]\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv\")):\n", " print(f\"File: ../../propertiesSplitWRemoved2/claims.{prop2}.tsv does not exist\")\n", " continue\n", " \n", " commandOtherFiles += \"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv \"\n", " if type(constraint['P2303']) == list: # 
Exceptions present\n", " if len(excptns) == 0:\n", " excptns = set(constraint['P2303'])\n", " else:\n", " excptns = excptns.intersection(set(constraint['P2303']))\n", " if type(constraint['P2305']) == list:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->(node2_{prop2})\"]\n", " commandWhere += [\"node2_\" + prop2 + \" in \" + str(list(constraint['P2305'])).replace(\"'\",'\"')]\n", " else:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->()\"]\n", "# print(commandMatchMoreFiles)\n", " if len(commandWhere) == 0:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\"\n", " else:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\" + \" --where '\"+(\" and \".join(commandWhere))+\"'\"\n", " \n", " if cnt % 100 == 0:\n", " if fOP:\n", " fOP.close()\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " \n", " if len(excptns) == 0:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> 
../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", " fOP.write(command)\n", " else:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv\\n\"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 'node1 in \" + str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + 
\".incorrect_w_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt; \\n\"\n", " fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 39, "id": "harmful-binary", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "424" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 46, "id": "advance-married", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,21):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/codepConst_N_Validator_new2_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "ranging-journal", "metadata": {}, "source": [ "#### Version 5 - Suggestion" ] }, { "cell_type": "code", "execution_count": 41, "id": "missing-jordan", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "21d2dfa4582b40d09106a8adf878cdde", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3147 [00:00(node2)\"]\n", " commandWhere = \" --where '\"\n", " commandWhere = []\n", " 
\n", " # Version 5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n", " constSet = suggestion\n", " \n", " if len(constSet) == 0:\n", " continue\n", " excptns = set()\n", " for (rowNo, constraint) in enumerate(constSet):\n", " prop2 = constraint['P2306']\n", " \n", " if type(prop2) != list:\n", " continue\n", " prop2 = prop2[0]\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv\")):\n", " print(f\"File: ../../propertiesSplitWRemoved2/claims.{prop2}.tsv does not exist\")\n", " continue\n", " \n", " commandOtherFiles += \"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv \"\n", " if type(constraint['P2303']) == list: # Exceptions present\n", " if len(excptns) == 0:\n", " excptns = set(constraint['P2303'])\n", " else:\n", " excptns = excptns.intersection(set(constraint['P2303']))\n", " if type(constraint['P2305']) == list:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->(node2_{prop2})\"]\n", " commandWhere += [\"node2_\" + prop2 + \" in \" + str(list(constraint['P2305'])).replace(\"'\",'\"')]\n", " else:\n", " commandMatchMoreFiles += [f\"{prop2}: (node1)-[]->()\"]\n", "# print(commandMatchMoreFiles)\n", " if len(commandWhere) == 0:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\"\n", " else:\n", " command = commandInit + commandOtherFiles + commandMatch + (\", \".join(commandMatchMoreFiles)) + \"'\" + \" --where '\"+(\" and \".join(commandWhere))+\"'\"\n", " \n", " if cnt % 100 == 0:\n", " if fOP:\n", " fOP.close()\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " \n", " if len(excptns) == 0:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop 
+\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", " fOP.write(command)\n", " else:\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv\\n\"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 
'node1 in \" + str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect_w_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/timeLog_\" + shellFileSuffix + str(fCnt) + \".txt; \\n\"\n", " fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 42, "id": "soviet-forth", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "97" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 46, "id": "racial-stationery", "metadata": {}, "outputs": [], "source": [ "# 
import os\n", "# for i in range(1,5):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/codepConst_S_Validator_new_3_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "structural-envelope", "metadata": {}, "source": [ "### Merge all correct/incorrect outputs" ] }, { "cell_type": "code", "execution_count": 17, "id": "joined-invention", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "57dcbdd4c8014c9288dbb92b331a05a6", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# import os\n", "# from tqdm.notebook import tqdm\n", "\n", "# for folder in tqdm(iter(['Mand_Sugg_Normal', 'Mand_Normal', 'Mand', 'Normal'])):\n", "# folderPath = \"../../allConstraintsAnalysisWRemoved2/codependencyConstraint/\" + folder + \"/\"\n", "# correct_files_list = \" \".join([folderPath + f for f in filter(lambda f: \".correct.\" in f, os.listdir(folderPath))])\n", "# incorrect_files_list = \" \".join([folderPath + f for f in filter(lambda f: \".incorrect.\" in f, os.listdir(folderPath))])\n", "# # print(files_list)\n", "# os.system(\"{ kgtk cat -i \"+ correct_files_list + \" -o \"+folderPath+\"claims.all.correctSuperSet.tsv -v True; } 2> \"+folderPath+\"claims.all.correctSuperSet.log\")\n", "# os.system(\"{ kgtk cat -i \"+ incorrect_files_list + \" -o \"+folderPath+\"claims.all.incorrectSuperSet.tsv -v True; } 2> \"+folderPath+\"claims.all.incorrectSuperSet.log\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "stopped-bolivia", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "68395f72036a469fad8908d916303bcd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# import os\n", "# from tqdm.notebook import tqdm\n", "\n", "# for folder in 
tqdm(iter(['Mand_Sugg_Normal', 'Mand_Normal', 'Mand', 'Normal'])):\n", "# folderPath = \"../../allConstraintsAnalysisWRemoved2/codependencyConstraint_Final/\" + folder + \"/\"\n", "# correct_files_list = \" \".join([folderPath + f for f in filter(lambda f: \".correct.\" in f, os.listdir(folderPath))])\n", "# incorrect_files_list = \" \".join([folderPath + f for f in filter(lambda f: \".incorrect.\" in f, os.listdir(folderPath))])\n", "# # print(files_list)\n", "# os.system(\"{ kgtk cat -i \"+ correct_files_list + \" -o \"+folderPath+\"claims.all.correctSuperSet.tsv; } 2> \"+folderPath+\"claims.all.correctSuperSet.log\")\n", "# os.system(\"{ kgtk cat -i \"+ incorrect_files_list + \" -o \"+folderPath+\"claims.all.incorrectSuperSet.tsv; } 2> \"+folderPath+\"claims.all.incorrectSuperSet.log\")" ] }, { "cell_type": "code", "execution_count": null, "id": "criminal-central", "metadata": {}, "outputs": [], "source": [ "# for folder in tqdm(iter(['Mand_Sugg_Normal', 'Mand_Normal', 'Mand', 'Normal'])):\n", "# folderPath = \"../../allConstraintsAnalysisWRemoved2/codependencyConstraint/\" + folder + \"/\"\n", "# folderPathNew = \"../../allConstraintsAnalysisWRemoved2/codependencyConstraint_Final/\" + folder + \"/\"\n", "# os.system(f\"screen -dm kgtk ifnotexists -i {folderPathNew}claims.all.correctSuperSet.tsv --filter-on {folderPath}claims.all.correctSuperSet.tsv -o {folderPathNew}claims.all.correctSuperSet.diff.tsv\")\n", "# os.system(f\"screen -dm kgtk ifnotexists -i {folderPathNew}claims.all.incorrectSuperSet.tsv --filter-on {folderPath}claims.all.incorrectSuperSet.tsv -o {folderPathNew}claims.all.incorrectSuperSet.diff.tsv\")\n", " " ] }, { "cell_type": "markdown", "id": "homeless-pleasure", "metadata": {}, "source": [ "### Analyze Violations" ] }, { "cell_type": "code", "execution_count": 71, "id": "welcome-dependence", "metadata": { "scrolled": true }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "d00113c7ab5a4ed7a7b582d4991877b2", 
"version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4a8045f2c85240ba92343d7ff646f249", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1206 [00:00= 3.539484)}/{len(codepConstDF1)}\")" ] }, { "cell_type": "markdown", "id": "greater-genetics", "metadata": {}, "source": [ "#### Version 2 - Mand Normal" ] }, { "cell_type": "code", "execution_count": null, "id": "constant-chance", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "codepConstDF2 = pd.DataFrame(codepConstViolations['Mand_Normal']).T" ] }, { "cell_type": "code", "execution_count": null, "id": "included-adjustment", "metadata": {}, "outputs": [], "source": [ "codepConstDF2" ] }, { "cell_type": "code", "execution_count": null, "id": "fundamental-knowing", "metadata": {}, "outputs": [], "source": [ "codepConstDF2['violation_ratio'] = codepConstDF2.apply(lambda p: p.incorrect / (p.correct + p.incorrect), axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "harmful-discipline", "metadata": {}, "outputs": [], "source": [ "codepConstDF2.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "unlikely-chamber", "metadata": { "scrolled": false }, "outputs": [], "source": [ "codepConstDF2.sort_values(by=['incorrect'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "violent-match", "metadata": { "scrolled": true }, "outputs": [], "source": [ "codepConstDF2['violation_ratio'].describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "educational-thickness", "metadata": {}, "outputs": [], "source": [ "codepConstDF2['violation_ratio'].plot.hist(bins=100).set_title(\"Co-Dependency Constraint - Version 2 - Violation Ratios\")" ] }, { "cell_type": "code", "execution_count": null, "id": "latin-mitchell", 
"metadata": { "scrolled": true }, "outputs": [], "source": [ "codepConstDF2[codepConstDF2['violation_ratio'] <= 0.5].violation_ratio.plot.hist(bins=100).set_title(\"Co-Dependency Constraint - Version 2 - Violation Ratios <= 0.5\")" ] }, { "cell_type": "code", "execution_count": null, "id": "asian-forwarding", "metadata": {}, "outputs": [], "source": [ "# The ratio is incorrect / (correct + incorrect), so it never exceeds 1 and the old fixed\n", "# cut-off of 2.290915 always produced a count of 0; compare against the current mean instead.\n", "print(f\"No. of properties whose violation ratio is greater than mean: {sum(codepConstDF2['violation_ratio'] >= codepConstDF2['violation_ratio'].mean())}/{len(codepConstDF2)}\")" ] }, { "cell_type": "markdown", "id": "destroyed-flash", "metadata": {}, "source": [ "#### Version 3 - Mand" ] }, { "cell_type": "code", "execution_count": null, "id": "consecutive-plenty", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "codepConstDF3 = pd.DataFrame(codepConstViolations['Mand']).T" ] }, { "cell_type": "code", "execution_count": null, "id": "digital-mileage", "metadata": {}, "outputs": [], "source": [ "codepConstDF3" ] }, { "cell_type": "code", "execution_count": null, "id": "formed-battle", "metadata": {}, "outputs": [], "source": [ "codepConstDF3['violation_ratio'] = codepConstDF3.apply(lambda p: p.incorrect / p.correct if p.correct != 0 else p.incorrect/100, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "numerous-construction", "metadata": {}, "outputs": [], "source": [ "codepConstDF3.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "identified-marble", "metadata": {}, "outputs": [], "source": [ "codepConstDF3.loc['P1713']" ] }, { "cell_type": "code", "execution_count": null, "id": "established-mounting", "metadata": {}, "outputs": [], "source": [ "!head ../../allConstraintsAnalysisWRemoved2/codependencyConstraint_Final/Mand/claims.P1713.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": null, "id": "naval-functionality", "metadata": {}, "outputs": [], "source": [ "!cat 
../../allConstraintsAnalysisWRemoved2/codependencyConstraint/Mand/claims.P1713.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": null, "id": "imposed-bibliography", "metadata": { "scrolled": false }, "outputs": [], "source": [ "codepConstDF3.sort_values(by=['incorrect'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "emotional-crown", "metadata": { "scrolled": true }, "outputs": [], "source": [ "codepConstDF3['violation_ratio'].describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "certain-freeze", "metadata": {}, "outputs": [], "source": [ "codepConstDF3['violation_ratio'].plot.hist(bins=100).set_title(\"Co-Dependency Constraint - Version 3 - Violation Ratios\")" ] }, { "cell_type": "code", "execution_count": null, "id": "cooperative-ownership", "metadata": { "scrolled": true }, "outputs": [], "source": [ "codepConstDF3[codepConstDF3['violation_ratio'] <= 0.0005].violation_ratio.plot.hist(bins=100).set_title(\"Co-Dependency Constraint - Version 3 - Violation Ratios <= 0.0005\")" ] }, { "cell_type": "code", "execution_count": null, "id": "studied-inclusion", "metadata": {}, "outputs": [], "source": [ "print(f\"No. 
of properties whose violation ratio is greater than mean: {sum(codepConstDF3['violation_ratio'] >= codepConstDF3['violation_ratio'].mean())}/{len(codepConstDF3)}\")" ] }, { "cell_type": "markdown", "id": "protective-brazil", "metadata": {}, "source": [ "#### Version 4 - Normal" ] }, { "cell_type": "code", "execution_count": null, "id": "laughing-pressing", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "codepConstDF4 = pd.DataFrame(codepConstViolations['Normal']).T" ] }, { "cell_type": "code", "execution_count": null, "id": "loving-swift", "metadata": {}, "outputs": [], "source": [ "codepConstDF4" ] }, { "cell_type": "code", "execution_count": null, "id": "north-christian", "metadata": {}, "outputs": [], "source": [ "codepConstDF4['violation_ratio'] = codepConstDF4.apply(lambda p: p.incorrect / p.correct if p.correct != 0 else p.incorrect/100, axis=1)" ] }, { "cell_type": "code", "execution_count": null, "id": "closing-causing", "metadata": {}, "outputs": [], "source": [ "codepConstDF4.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "weighted-input", "metadata": {}, "outputs": [], "source": [ "# list(codepConstDF4.sort_values(by=['violation_ratio'],ascending=False).head(5).paths)" ] }, { "cell_type": "code", "execution_count": null, "id": "brief-effect", "metadata": { "scrolled": false }, "outputs": [], "source": [ "codepConstDF4.sort_values(by=['incorrect'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": null, "id": "wireless-passenger", "metadata": { "scrolled": true }, "outputs": [], "source": [ "codepConstDF4['violation_ratio'].describe()" ] }, { "cell_type": "code", "execution_count": null, "id": "civilian-arnold", "metadata": {}, "outputs": [], "source": [ "codepConstDF4['violation_ratio'].plot.hist(bins=100).set_title(\"Co-Dependency Constraint - Version 4 - Violation Ratios\")" ] }, { "cell_type": "code", "execution_count": null, "id": "threaded-cooler", "metadata": { 
"scrolled": true }, "outputs": [], "source": [ "codepConstDF4[codepConstDF4['violation_ratio'] <= 0.5].violation_ratio.plot.hist(bins=100).set_title(\"Co-Dependency Constraint - Version 4 - Violation Ratios <= 0.5\")" ] }, { "cell_type": "code", "execution_count": null, "id": "olympic-charlotte", "metadata": {}, "outputs": [], "source": [
    "# Compare against the mean of the current data rather than a number copied from an earlier run.\n",
    "print(f\"No. of properties whose violation ratio is greater than mean: {sum(codepConstDF4['violation_ratio'] >= codepConstDF4['violation_ratio'].mean())}/{len(codepConstDF4)}\")" ] }, { "cell_type": "markdown", "id": "published-affiliate", "metadata": {}, "source": [ "### Find out time required" ] }, { "cell_type": "code", "execution_count": null, "id": "aggregate-conservative", "metadata": {}, "outputs": [], "source": [
    "import pandas as pd\n",
    "import os\n",
    "from tqdm.notebook import tqdm\n",
    "\n",
    "def extractTimes(filename):\n",
    "    \"\"\"Return the wall-clock durations, in seconds, recorded in one time log.\n",
    "\n",
    "    Only lines of the form `real <M>m<S>s` (the shape written by the shell's `time`\n",
    "    keyword used in the generated scripts) are parsed; each contributes 60 * M + S.\n",
    "    Every other line is ignored.\n",
    "    \"\"\"\n",
    "    times = []\n",
    "    with open(filename) as f:\n",
    "        for line in f:\n",
    "            parts = line.split()\n",
    "            # Require the exact two-field shape so that other log lines which merely\n",
    "            # contain the word \"real\" are skipped instead of raising.\n",
    "            if len(parts) == 2 and parts[0] == \"real\" and parts[1].endswith(\"s\") and \"m\" in parts[1]:\n",
    "                mins, sec = parts[1][:-1].split(\"m\")\n",
    "                times.append(60 * int(mins) + float(sec))\n",
    "    return times\n",
    "\n",
    "# NOTE: this cell deliberately does not touch codepConstViolations; resetting it here\n",
    "# would wipe the results that the analysis cells read.\n",
    "times = []\n",
    "timesVersion = {\"MSN\": [], \"MN\": [], \"M\": [], \"N\": [], \"S\": []}\n",
    "filePath = '/data/wd-correctness/propertiesSplitWRemoved2/checkViolations/exec_logs/'\n",
    "for filename in tqdm(os.listdir(filePath)):\n",
    "    if filename.startswith(\"timeLog_codepConst_\"):\n",
    "        # The third underscore-separated field of the file name selects the version bucket.\n",
    "        ver = filename.split('_')[2]\n",
    "        tempTimes = extractTimes(filePath + filename)\n",
    "        times += tempTimes\n",
    "        # setdefault keeps an unexpected version label from raising KeyError.\n",
    "        timesVersion.setdefault(ver, []).extend(tempTimes)\n",
    "print(pd.Series(times).describe())" ] }, { "cell_type": "code", "execution_count": null, "id": "hearing-treasury", "metadata": {}, "outputs": [], "source": [ "print(pd.Series(timesVersion['MSN']).describe())" ] }, { "cell_type": "code", 
"execution_count": null, "id": "animal-vocabulary", "metadata": {}, "outputs": [], "source": [ "print(pd.Series(timesVersion['MN']).describe())" ] }, { "cell_type": "code", "execution_count": null, "id": "gentle-accessory", "metadata": {}, "outputs": [], "source": [ "print(pd.Series(timesVersion['M']).describe())" ] }, { "cell_type": "code", "execution_count": null, "id": "fresh-namibia", "metadata": {}, "outputs": [], "source": [ "print(pd.Series(timesVersion['N']).describe())" ] }, { "cell_type": "markdown", "id": "industrial-parcel", "metadata": {}, "source": [ "## Symmetric Constraint (Q21510862)\n", "\n", "This constraint says that if a property carries this constraint, then for every statement `(node1)-[prop]->(node2)` the reverse statement `(node2)-[prop]->(node1)` must also be present, apart from a few listed exceptions." ] }, { "cell_type": "markdown", "id": "silent-fundamentals", "metadata": {}, "source": [ "### Constraints File" ] }, { "cell_type": "code", "execution_count": 48, "id": "known-wednesday", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-04-13 18:58:46 query]: SQL Translation:\r\n", "---------------------------------------------\r\n", " SELECT *\r\n", " FROM graph_1 AS graph_1_c1\r\n", " WHERE graph_1_c1.\"label\"=?\r\n", " AND graph_1_c1.\"node2\"=?\r\n", " PARAS: ['P2302', 'Q21510862']\r\n", "---------------------------------------------\r\n" ] } ], "source": [ "!kgtk --debug query -i ../../gdrive-kgtk-dump-2020-12-07/claims.properties.tsv.gz \\\n", " ../../gdrive-kgtk-dump-2020-12-07/qualifiers.properties.tsv.gz \\\n", " --match \"p: (nodeProp1)-[nodePropEdge:P2302]->(:Q21510862)\" \\\n", " -o ../../constraintsOP/symmetricConstraint/claims.constraints_list.tsv \\\n", " --graph-cache ~/sqlite3_caches/temp1345.valuetype.sqlite3.db" ] }, { "cell_type": "code", "execution_count": 49, "id": "legal-diamond", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "qualiDF = 
pd.read_csv(\"../../gdrive-kgtk-dump-2020-12-07/qualifiers.properties.tsv.gz\",sep='\\t')\n", "constDF = pd.read_csv(\"../../constraintsOP/symmetricConstraint/claims.constraints_list.tsv\",sep='\\t')" ] }, { "cell_type": "code", "execution_count": 50, "id": "exceptional-morris", "metadata": {}, "outputs": [], "source": [ "constDF2 = constDF.set_index('id').join(qualiDF.set_index('node1'),rsuffix='_qualifier').drop(columns=['id', 'node2;wikidatatype_qualifier', 'rank', 'node2', 'label', 'node2;wikidatatype']).rename(columns={'label_qualifier':'label', 'node2_qualifier': 'node2'})\n", "constDF2 = constDF2.reset_index()\n", "constDF2 = constDF2.rename(columns={'index':'id'})\n", "constDF2['label'] = constDF2.label.fillna(\"P2316\")\n", "constDF2['node2'] = constDF2.node2.fillna(\"Normal\")" ] }, { "cell_type": "code", "execution_count": 51, "id": "burning-involvement", "metadata": {}, "outputs": [], "source": [ "constDF2.to_csv(\"../../constraintsOP/symmetricConstraint/claims.constraints_all.tsv\",sep=\"\\t\",index=False)" ] }, { "cell_type": "code", "execution_count": 52, "id": "naval-identification", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import math\n", "dfItemRequires = pd.read_csv('../../constraintsOP/symmetricConstraint/claims.constraints_all.tsv',sep='\\t')" ] }, { "cell_type": "code", "execution_count": 53, "id": "considered-madison", "metadata": {}, "outputs": [], "source": [ "dfItemRequires = dfItemRequires.groupby(['id','node1','label']).node2.apply(lambda p: p.tolist()).reset_index()" ] }, { "cell_type": "code", "execution_count": 54, "id": "alone-cattle", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['P2316', 'P2303'], dtype=object)" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires['label'].unique()" ] }, { "cell_type": "code", "execution_count": 55, "id": "mighty-ordinary", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "P2316 42\n", 
"P2303 3\n", "Name: label, dtype: int64" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires['label'].value_counts()" ] }, { "cell_type": "code", "execution_count": 56, "id": "sensitive-alliance", "metadata": {}, "outputs": [], "source": [ "#Reference: https://stackoverflow.com/a/17298454\n", "# dfItemRequires.pivot_table('node2', ['node1', 'id'], 'label')\n", "dfItemRequires = dfItemRequires.pivot(index=['node1','id'], columns='label', values='node2')" ] }, { "cell_type": "code", "execution_count": 57, "id": "tender-valley", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2316
node1id
P1322P1322-P2302-Q21510862-85dea891-0NaN[Normal]
P1327P1327-P2302-Q21510862-a3c3a094-0NaN[Normal]
P1382P1382-P2302-Q21510862-f6bcfecf-0NaN[Normal]
P1560P1560-P2302-Q21510862-fabecaeb-0NaN[Q21502408]
P1639P1639-P2302-Q21510862-384edcd4-0NaN[Q21502408]
\n", "
" ], "text/plain": [ "label P2303 P2316\n", "node1 id \n", "P1322 P1322-P2302-Q21510862-85dea891-0 NaN [Normal]\n", "P1327 P1327-P2302-Q21510862-a3c3a094-0 NaN [Normal]\n", "P1382 P1382-P2302-Q21510862-f6bcfecf-0 NaN [Normal]\n", "P1560 P1560-P2302-Q21510862-fabecaeb-0 NaN [Q21502408]\n", "P1639 P1639-P2302-Q21510862-384edcd4-0 NaN [Q21502408]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires.head()" ] }, { "cell_type": "code", "execution_count": 58, "id": "cellular-canal", "metadata": {}, "outputs": [], "source": [ "dfItemRequires = dfItemRequires.droplevel(1)" ] }, { "cell_type": "code", "execution_count": 59, "id": "desperate-poster", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2316
node1
P1322NaN[Normal]
P1327NaN[Normal]
P1382NaN[Normal]
P1560NaN[Q21502408]
P1639NaN[Q21502408]
\n", "
" ], "text/plain": [ "label P2303 P2316\n", "node1 \n", "P1322 NaN [Normal]\n", "P1327 NaN [Normal]\n", "P1382 NaN [Normal]\n", "P1560 NaN [Q21502408]\n", "P1639 NaN [Q21502408]" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires.head()" ] }, { "cell_type": "markdown", "id": "primary-netherlands", "metadata": {}, "source": [ "### Query Generator" ] }, { "cell_type": "code", "execution_count": 60, "id": "pointed-haven", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "5ff09bb499d044ecaa4605a4ab390068", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from tqdm.notebook import tqdm\n", "import os.path\n", "import os\n", "\n", "cnt = 0\n", "fCnt = 0\n", "\n", "folderName = 'symmetricConstraint'\n", "shellFileSuffix = 'symmConst_Validator_'\n", "graph_cache_prefix = 'symm_03'\n", "\n", "for row in tqdm(dfItemRequires.iterrows()):\n", "# try:\n", " prop = row[0]\n", " constraint = row[1]\n", " mandatory = []\n", " suggestion = []\n", " normal = []\n", " prop = str(prop)\n", " \n", " if type(constraint['P2316']) == list:\n", " if constraint['P2316'][0] == 'Q21502408':\n", " sfname = 'mandatory'\n", " elif constraint['P2316'][0] == 'Q62026391':\n", " sfname = 'suggestion'\n", " elif constraint['P2316'][0] == 'Normal':\n", " sfname = 'normal'\n", " else:\n", " sfname = 'normal'\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv\")):\n", " continue\n", " \n", " command = \"{ time ( kgtk --debug query -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " ../../propertiesSplitWRemoved2/claims.\"+ prop +\".copy2.tsv \\\n", " --match 'tsv: (node1)-[nodeProp]->(node2), copy2: (node2)-[]->(node1)' \"\n", " \n", " os.system(\"cp ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv ../../propertiesSplitWRemoved2/claims.\"+ 
prop +\".copy2.tsv\")\n", " \n", " if cnt % 60 == 0:\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " command\n", " if type(constraint['P2303']) != list: # Exceptions not present\n", " commandRest = \" --return 'nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/\"+graph_cache_prefix+\"_\" + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", "# print(command)\n", " fOP.write(command)\n", " else:\n", " excptns = constraint['P2303']\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\"+graph_cache_prefix+\"_\" + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys 
node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv; \"\n", "# print(command)\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 'node1 in \" + str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\" + prop + \".correct_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\"+graph_cache_prefix+\"_\" + str(fCnt) + \".sqlite3.db; \"\n", "# print(command)\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\" + prop + \".correct_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", "# print(command)\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + sfname + \"/claims.\"+ prop +\".correct.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", "# print(command)\n", " 
fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)" ] }, { "cell_type": "code", "execution_count": 61, "id": "polar-canada", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "39" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 62, "id": "virtual-disney", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,2):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/symmConst_Validator_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "coral-cheese", "metadata": {}, "source": [ "### Analyze Violations" ] }, { "cell_type": "code", "execution_count": 47, "id": "governmental-backup", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e13ba4b56db84a0f997467ec87fdcec4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c01a49837d8a448ab5a1f234a1214fe5", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/13 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P1639210525[../../allConstraintsAnalysisWRemoved2/symmetr...0.011737
P1560348815[../../allConstraintsAnalysisWRemoved2/symmetr...0.004282
P336418131[../../allConstraintsAnalysisWRemoved2/symmetr...0.000551
P2152800[../../allConstraintsAnalysisWRemoved2/symmetr...0.000000
P61852820[../../allConstraintsAnalysisWRemoved2/symmetr...0.000000
\n", "" ], "text/plain": [ " correct incorrect paths \\\n", "P1639 2105 25 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P1560 3488 15 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P3364 1813 1 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P2152 80 0 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P6185 282 0 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "\n", " violation_ratio \n", "P1639 0.011737 \n", "P1560 0.004282 \n", "P3364 0.000551 \n", "P2152 0.000000 \n", "P6185 0.000000 " ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "symmConstDF1 = pd.DataFrame(symmConstViolations['mandatory']).T\n", "symmConstDF1['violation_ratio'] = symmConstDF1.apply(lambda p: p.incorrect / (p.incorrect + p.correct), axis=1)\n", "symmConstDF1.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 53, "id": "gross-extraction", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P27891052016590[../../allConstraintsAnalysisWRemoved2/symmetr...0.058949
P188953333824740[../../allConstraintsAnalysisWRemoved2/symmetr...0.044331
P1971808641737[../../allConstraintsAnalysisWRemoved2/symmetr...0.009513
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P2789 105201 6590 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P1889 533338 24740 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P197 180864 1737 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "\n", " violation_ratio \n", "P2789 0.058949 \n", "P1889 0.044331 \n", "P197 0.009513 " ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "symmConstDF2 = pd.DataFrame(symmConstViolations['suggestion']).T\n", "symmConstDF2['violation_ratio'] = symmConstDF2.apply(lambda p: p.incorrect / (p.incorrect + p.correct), axis=1)\n", "symmConstDF2.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 54, "id": "heavy-scout", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P518805[../../allConstraintsAnalysisWRemoved2/symmetr...1.000000
P597401[../../allConstraintsAnalysisWRemoved2/symmetr...1.000000
P17061284[../../allConstraintsAnalysisWRemoved2/symmetr...0.875000
P2652500836[../../allConstraintsAnalysisWRemoved2/symmetr...0.625749
P521424146[../../allConstraintsAnalysisWRemoved2/symmetr...0.256140
P684120917693437304[../../allConstraintsAnalysisWRemoved2/symmetr...0.221346
P30321743316[../../allConstraintsAnalysisWRemoved2/symmetr...0.153473
P1382110751657[../../allConstraintsAnalysisWRemoved2/symmetr...0.130145
P2293147361969[../../allConstraintsAnalysisWRemoved2/symmetr...0.117869
P13277954706[../../allConstraintsAnalysisWRemoved2/symmetr...0.081524
P4545464[../../allConstraintsAnalysisWRemoved2/symmetr...0.080000
P45111072790[../../allConstraintsAnalysisWRemoved2/symmetr...0.066599
P5306730382[../../allConstraintsAnalysisWRemoved2/symmetr...0.053712
P34032174112[../../allConstraintsAnalysisWRemoved2/symmetr...0.048994
P46026570612622[../../allConstraintsAnalysisWRemoved2/symmetr...0.045349
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P5188 0 5 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P5974 0 1 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P1706 12 84 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P2652 500 836 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P521 424 146 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P684 12091769 3437304 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P3032 1743 316 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P1382 11075 1657 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P2293 14736 1969 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P1327 7954 706 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P4545 46 4 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P451 11072 790 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P530 6730 382 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P3403 2174 112 [../../allConstraintsAnalysisWRemoved2/symmetr... \n", "P460 265706 12622 [../../allConstraintsAnalysisWRemoved2/symmetr... 
\n", "\n", " violation_ratio \n", "P5188 1.000000 \n", "P5974 1.000000 \n", "P1706 0.875000 \n", "P2652 0.625749 \n", "P521 0.256140 \n", "P684 0.221346 \n", "P3032 0.153473 \n", "P1382 0.130145 \n", "P2293 0.117869 \n", "P1327 0.081524 \n", "P4545 0.080000 \n", "P451 0.066599 \n", "P530 0.053712 \n", "P3403 0.048994 \n", "P460 0.045349 " ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "symmConstDF3 = pd.DataFrame(symmConstViolations['normal']).T\n", "symmConstDF3['violation_ratio'] = symmConstDF3.apply(lambda p: p.incorrect / (p.incorrect + p.correct), axis=1)\n", "symmConstDF3.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 55, "id": "sexual-blowing", "metadata": {}, "outputs": [], "source": [ "# !head ../../allConstraintsAnalysisWRemoved2/symmetricConstraint/normal/claims.P3032.incorrect.tsv\n", "\n" ] }, { "cell_type": "code", "execution_count": 56, "id": "legitimate-aspect", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Symmetric Normal Constraint - Violation Ratios')" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "symmConstDF3.sort_values(by=['violation_ratio'],ascending=False)['violation_ratio'].plot.hist(bins=100).set_title(\"Symmetric Normal Constraint - Violation Ratios\")" ] }, { "cell_type": "markdown", "id": "unlikely-sewing", "metadata": {}, "source": [ "### Find out time required" ] }, { "cell_type": "code", "execution_count": 11, "id": "southern-reasoning", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "01675fcd83284c8ab2aa683f43fef458", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/108 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "sns.lineplot(data=pd.Series(times)).set_title(\"Distribution of times (in s) taken for symmetric constraint checks\")" ] }, { "cell_type": "markdown", "id": "informed-animal", "metadata": {}, "source": [ "## Inverse Constraint (Q21510855)\n", "\n", "This constraint says that if a property `prop` carries this constraint and names `invProp` as its inverse property (qualifier P2306), then every statement `(node1)-[prop]->(node2)` must have a matching inverse statement `(node2)-[invProp]->(node1)`, apart from a few listed exceptions (qualifier P2303)." ] }, { "cell_type": "markdown", "id": "dramatic-manchester", "metadata": {}, "source": [ "### Constraints File" ] }, { "cell_type": "code", "execution_count": 63, "id": "leading-server", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-04-13 19:00:13 query]: SQL Translation:\r\n", "---------------------------------------------\r\n", " SELECT *\r\n", " FROM graph_1 AS graph_1_c1\r\n", " WHERE graph_1_c1.\"label\"=?\r\n", " AND graph_1_c1.\"node2\"=?\r\n", " PARAS: ['P2302', 'Q21510855']\r\n", "---------------------------------------------\r\n" ] } ], "source": [ "!kgtk --debug query -i ../../gdrive-kgtk-dump-2020-12-07/claims.properties.tsv.gz \\\n", " ../../gdrive-kgtk-dump-2020-12-07/qualifiers.properties.tsv.gz \\\n", " 
--match \"p: (nodeProp1)-[nodePropEdge:P2302]->(:Q21510855)\" \\\n", " -o ../../constraintsOP/inverseConstraint/claims.constraints_list.tsv \\\n", " --graph-cache ~/sqlite3_caches/temp1345.valuetype.sqlite3.db" ] }, { "cell_type": "code", "execution_count": 64, "id": "offshore-sudan", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "id\tnode1\tlabel\tnode2\trank\tnode2;wikidatatype\r\n", "P1026-P2302-Q21510855-adc83b86-0\tP1026\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1029-P2302-Q21510855-6b55e057-0\tP1029\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P115-P2302-Q21510855-f7aa0b78-0\tP115\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1151-P2302-Q21510855-0d9aa9c6-0\tP1151\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1204-P2302-Q21510855-e3d53bb6-0\tP1204\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1283-P2302-Q21510855-0e7699bb-0\tP1283\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1308-P2302-Q21510855-2aba96b7-0\tP1308\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1365-P2302-Q21510855-c809b758-0\tP1365\tP2302\tQ21510855\tnormal\twikibase-item\r\n", "P1366-P2302-Q21510855-eee12ef8-0\tP1366\tP2302\tQ21510855\tnormal\twikibase-item\r\n" ] } ], "source": [ "!head ../../constraintsOP/inverseConstraint/claims.constraints_list.tsv" ] }, { "cell_type": "code", "execution_count": 65, "id": "received-colonial", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "qualiDF = pd.read_csv(\"../../gdrive-kgtk-dump-2020-12-07/qualifiers.properties.tsv.gz\",sep='\\t')\n", "constDF = pd.read_csv(\"../../constraintsOP/inverseConstraint/claims.constraints_list.tsv\",sep='\\t')" ] }, { "cell_type": "code", "execution_count": 66, "id": "overall-expense", "metadata": {}, "outputs": [], "source": [ "constDF2 = constDF.set_index('id').join(qualiDF.set_index('node1'),rsuffix='_qualifier').drop(columns=['id', 'node2;wikidatatype_qualifier', 'rank', 'node2', 'label', 
'node2;wikidatatype']).rename(columns={'label_qualifier':'label', 'node2_qualifier': 'node2'})\n", "constDF2 = constDF2.reset_index()\n", "constDF2 = constDF2.rename(columns={'index':'id'})\n", "constDF2['label'] = constDF2.label.fillna(\"P2316\")\n", "constDF2['node2'] = constDF2.node2.fillna(\"Normal\")" ] }, { "cell_type": "code", "execution_count": 67, "id": "valid-throat", "metadata": {}, "outputs": [], "source": [ "constDF2.to_csv(\"../../constraintsOP/inverseConstraint/claims.constraints_all.tsv\",sep=\"\\t\",index=False)" ] }, { "cell_type": "code", "execution_count": 68, "id": "focused-pennsylvania", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import math\n", "dfItemRequires = pd.read_csv('../../constraintsOP/inverseConstraint/claims.constraints_all.tsv',sep='\\t')" ] }, { "cell_type": "code", "execution_count": 69, "id": "moved-rental", "metadata": {}, "outputs": [], "source": [ "dfItemRequires = dfItemRequires.groupby(['id','node1','label']).node2.apply(lambda p: p.tolist()).reset_index()" ] }, { "cell_type": "code", "execution_count": 70, "id": "attached-rings", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['P2306', 'P2316', 'P4155', 'P2303'], dtype=object)" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires['label'].unique()" ] }, { "cell_type": "code", "execution_count": 71, "id": "loving-mileage", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "P2306 110\n", "P2316 10\n", "P2303 2\n", "P4155 1\n", "Name: label, dtype: int64" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires['label'].value_counts()" ] }, { "cell_type": "code", "execution_count": 72, "id": "local-forty", "metadata": {}, "outputs": [], "source": [ "#Reference: https://stackoverflow.com/a/17298454\n", "# dfItemRequires.pivot_table('node2', ['node1', 'id'], 'label')\n", "dfItemRequires = 
dfItemRequires.pivot(index=['node1','id'], columns='label', values='node2')" ] }, { "cell_type": "code", "execution_count": 73, "id": "pressed-upset", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2306P2316P4155
node1id
P1026P1026-P2302-Q21510855-adc83b86-0NaN[P50]NaNNaN
P1029P1029-P2302-Q21510855-6b55e057-0NaN[P5096]NaNNaN
P115P115-P2302-Q21510855-f7aa0b78-0NaN[P466]NaNNaN
P1151P1151-P2302-Q21510855-0d9aa9c6-0NaN[P1204][Q21502408]NaN
P1204P1204-P2302-Q21510855-e3d53bb6-0NaN[P1151]NaNNaN
\n", "
" ], "text/plain": [ "label P2303 P2306 P2316 P4155\n", "node1 id \n", "P1026 P1026-P2302-Q21510855-adc83b86-0 NaN [P50] NaN NaN\n", "P1029 P1029-P2302-Q21510855-6b55e057-0 NaN [P5096] NaN NaN\n", "P115 P115-P2302-Q21510855-f7aa0b78-0 NaN [P466] NaN NaN\n", "P1151 P1151-P2302-Q21510855-0d9aa9c6-0 NaN [P1204] [Q21502408] NaN\n", "P1204 P1204-P2302-Q21510855-e3d53bb6-0 NaN [P1151] NaN NaN" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires.head()" ] }, { "cell_type": "code", "execution_count": 74, "id": "extra-stomach", "metadata": {}, "outputs": [], "source": [ "dfItemRequires = dfItemRequires.droplevel(1)" ] }, { "cell_type": "code", "execution_count": 75, "id": "seeing-marine", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelP2303P2306P2316P4155
node1
P1026NaN[P50]NaNNaN
P1029NaN[P5096]NaNNaN
P115NaN[P466]NaNNaN
P1151NaN[P1204][Q21502408]NaN
P1204NaN[P1151]NaNNaN
\n", "
" ], "text/plain": [ "label P2303 P2306 P2316 P4155\n", "node1 \n", "P1026 NaN [P50] NaN NaN\n", "P1029 NaN [P5096] NaN NaN\n", "P115 NaN [P466] NaN NaN\n", "P1151 NaN [P1204] [Q21502408] NaN\n", "P1204 NaN [P1151] NaN NaN" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dfItemRequires.head()" ] }, { "cell_type": "markdown", "id": "composite-cutting", "metadata": {}, "source": [ "### Query Generator" ] }, { "cell_type": "code", "execution_count": 76, "id": "acoustic-belarus", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "4db7288263d84bb0b8b61c4e3345a56c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from tqdm.notebook import tqdm\n", "import os.path\n", "import os\n", "\n", "cnt = 0\n", "fCnt = 0\n", "fOP = None\n", "\n", "folderName = 'inverseConstraint_Final'\n", "shellFileSuffix = 'invConst_Validator_new3_'\n", "graph_cache_file_prefix = \"inv_4_\"\n", "\n", "for prop, constraint in tqdm(dfItemRequires.iterrows()):\n", "# try:\n", " \n", " if type(constraint['P2316']) == list:\n", " if constraint['P2316'][0] == 'Q21502408':\n", " subFolderName = \"mandatory\"\n", " elif constraint['P2316'][0] == 'Q62026391':\n", " subFolderName = \"suggestion\"\n", " else:\n", " subFolderName = \"normal\"\n", " \n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv\")):\n", " continue\n", " \n", " prop2 = constraint['P2306']\n", "\n", " if type(prop2) != list:\n", " continue\n", " prop2 = prop2[0]\n", "\n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv\")):\n", " print(f\"File: ../../propertiesSplitWRemoved2/claims.{prop2}.tsv does not exist\")\n", " continue\n", " \n", " if cnt % 40 == 0:\n", " if fOP:\n", " fOP.close()\n", " fCnt += 1\n", " fOP = open(\"../../propertiesSplitWRemoved2/checkViolations/\" + 
shellFileSuffix + str(fCnt) + \".sh\",\"w\")\n", " \n", " \n", " command = \"{ time ( kgtk --debug query -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " ../../propertiesSplitWRemoved2/claims.\"+ prop2 +\".tsv \\\n", " --match '\"+ \\\n", " f\"{prop}: (node1)-[nodeProp]->(node2), {prop2}: (node2)-[]->(node1)' \"\n", "\n", " if type(constraint['P2303']) != list: # Exceptions not present\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/\" + shellFileSuffix + str(fCnt) + \".txt;\\n\"\n", "# print(command)\n", " fOP.write(command)\n", " else:\n", " excptns = set(constraint['P2303'])\n", " commandRest = \" --return 'distinct nodeProp.id, node1 as `node1`, nodeProp.label as `label`, node2 as `node2`' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \\\n", " kgtk --debug ifnotexists -i ../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv \\\n", " --filter-on 
../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \"\n", " \n", " commandOPFile = \"-o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".\"\n", " \n", " command += commandRest + commandOPFile + \"incorrect_wo_exceptions.tsv; \"\n", "# print(command)\n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug query -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --match '(node1)-[]->()' --where 'node1 in \" + str(list(excptns)).replace(\"'\",'\"') + \"' \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --graph-cache ~/sqlite3_caches/\" + str(graph_cache_file_prefix) + str(fCnt) + \".sqlite3.db; \"\n", "# print(command) \n", " fOP.write(command)\n", " \n", " command = \" kgtk --debug ifnotexists -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_wo_exceptions.tsv \\\n", " --filter-on ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\" + prop + \".incorrect_w_exceptions.tsv \\\n", " --filter-mode NONE \\\n", " --input-keys node1 label \\\n", " --filter-keys node1 label \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".incorrect.tsv; \"\n", "# print(command)\n", " fOP.write(command)\n", " \n", " command = \" kgtk cat -i ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct_wo_exceptions.tsv \\\n", " ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop 
+\".incorrect_w_exceptions.tsv \\\n", " -o ../../allConstraintsAnalysisWRemoved2/\" + folderName + \"/\" + subFolderName + \"/claims.\"+ prop +\".correct.tsv ) } 2>> ../../propertiesSplitWRemoved2/checkViolations/exec_logs/\" + shellFileSuffix + str(fCnt) + \".txt; \\n\"\n", "# print(command)\n", " fOP.write(command)\n", " \n", " \n", " cnt += 1\n", "# except:\n", "# print(\"Something failed for prop:\",prop)\n", "if fOP:\n", " fOP.close()" ] }, { "cell_type": "code", "execution_count": 77, "id": "large-climb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "110" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt" ] }, { "cell_type": "code", "execution_count": 78, "id": "involved-vietnamese", "metadata": {}, "outputs": [], "source": [ "# import os\n", "# for i in range(1,7):\n", "# os.system(\"screen -dm sh ../../propertiesSplitWRemoved2/checkViolations/invConst_Validator_new3_\"+str(i)+\".sh\")" ] }, { "cell_type": "markdown", "id": "retired-audio", "metadata": {}, "source": [ "### Analyze Violations" ] }, { "cell_type": "code", "execution_count": 57, "id": "specified-evanescence", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a1a12be02d794481802d7761a975afcc", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0it [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f69d40508fec4092844cf5e53811c7c3", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/12 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P267383967[../../allConstraintsAnalysisWRemoved2/inverse...0.073951
P41472868[../../allConstraintsAnalysisWRemoved2/inverse...0.027211
P41492864[../../allConstraintsAnalysisWRemoved2/inverse...0.013793
P2033187925[../../allConstraintsAnalysisWRemoved2/inverse...0.013130
P450177922[../../allConstraintsAnalysisWRemoved2/inverse...0.012215
P1151180317[../../allConstraintsAnalysisWRemoved2/inverse...0.009341
\n", "" ], "text/plain": [ " correct incorrect paths \\\n", "P2673 839 67 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P4147 286 8 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P4149 286 4 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P2033 1879 25 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P450 1779 22 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P1151 1803 17 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "\n", " violation_ratio \n", "P2673 0.073951 \n", "P4147 0.027211 \n", "P4149 0.013793 \n", "P2033 0.013130 \n", "P450 0.012215 \n", "P1151 0.009341 " ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invConstDF1 = pd.DataFrame(invConstViolations['mandatory']).T\n", "invConstDF1['violation_ratio'] = invConstDF1.apply(lambda p: p.incorrect / (p.correct + p.incorrect), axis=1)\n", "invConstDF1.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 63, "id": "valid-symposium", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P143436775003[../../allConstraintsAnalysisWRemoved2/inverse...0.576382
P155103664753103[../../allConstraintsAnalysisWRemoved2/inverse...0.048730
P156103663640868[../../allConstraintsAnalysisWRemoved2/inverse...0.037928
P62974030240[../../allConstraintsAnalysisWRemoved2/inverse...0.003231
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P1434 3677 5003 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P155 1036647 53103 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P156 1036636 40868 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P629 74030 240 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "\n", " violation_ratio \n", "P1434 0.576382 \n", "P155 0.048730 \n", "P156 0.037928 \n", "P629 0.003231 " ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invConstDF2 = pd.DataFrame(invConstViolations['suggestion']).T\n", "invConstDF2['violation_ratio'] = invConstDF2.apply(lambda p: p.incorrect / (p.correct + p.incorrect), axis=1)\n", "invConstDF2.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 64, "id": "resident-mustang", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
correctincorrectpathsviolation_ratio
P160513190[../../allConstraintsAnalysisWRemoved2/inverse...0.935961
P34486054575[../../allConstraintsAnalysisWRemoved2/inverse...0.883205
P92615[../../allConstraintsAnalysisWRemoved2/inverse...0.833333
P92515[../../allConstraintsAnalysisWRemoved2/inverse...0.833333
P10294902397[../../allConstraintsAnalysisWRemoved2/inverse...0.830274
P115694824721[../../allConstraintsAnalysisWRemoved2/inverse...0.780606
P51342354[../../allConstraintsAnalysisWRemoved2/inverse...0.701299
P38161427[../../allConstraintsAnalysisWRemoved2/inverse...0.658537
P128314052423[../../allConstraintsAnalysisWRemoved2/inverse...0.632968
P8625915[../../allConstraintsAnalysisWRemoved2/inverse...0.625000
P51328190[../../allConstraintsAnalysisWRemoved2/inverse...0.526316
P42527412938[../../allConstraintsAnalysisWRemoved2/inverse...0.517345
P2512221159[../../allConstraintsAnalysisWRemoved2/inverse...0.418421
P167764[../../allConstraintsAnalysisWRemoved2/inverse...0.400000
P25781111622[../../allConstraintsAnalysisWRemoved2/inverse...0.358915
\n", "
" ], "text/plain": [ " correct incorrect paths \\\n", "P1605 13 190 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P3448 605 4575 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P926 1 5 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P925 1 5 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P1029 490 2397 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P115 6948 24721 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P5134 23 54 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P3816 14 27 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P1283 1405 2423 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P8625 9 15 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P5132 81 90 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P425 2741 2938 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P2512 221 159 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P1677 6 4 [../../allConstraintsAnalysisWRemoved2/inverse... \n", "P2578 1111 622 [../../allConstraintsAnalysisWRemoved2/inverse... 
\n", "\n", " violation_ratio \n", "P1605 0.935961 \n", "P3448 0.883205 \n", "P926 0.833333 \n", "P925 0.833333 \n", "P1029 0.830274 \n", "P115 0.780606 \n", "P5134 0.701299 \n", "P3816 0.658537 \n", "P1283 0.632968 \n", "P8625 0.625000 \n", "P5132 0.526316 \n", "P425 0.517345 \n", "P2512 0.418421 \n", "P1677 0.400000 \n", "P2578 0.358915 " ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "invConstDF3 = pd.DataFrame(invConstViolations['normal']).T\n", "invConstDF3['violation_ratio'] = invConstDF3.apply(lambda p: p.incorrect / (p.correct + p.incorrect), axis=1)\n", "invConstDF3.sort_values(by=['violation_ratio'],ascending=False).head(15)" ] }, { "cell_type": "code", "execution_count": 65, "id": "dietary-venue", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "head: cannot open ‘../../allConstraintsAnalysisWRemoved2/inverseConstraint/normal/claims.P925.incorrect.tsv’ for reading: No such file or directory\r\n" ] } ], "source": [ "!head ../../allConstraintsAnalysisWRemoved2/inverseConstraint/normal/claims.P925.incorrect.tsv" ] }, { "cell_type": "code", "execution_count": 66, "id": "entire-gauge", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Symmetric Normal Constraint - Violation Ratios')" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "invConstDF3.sort_values(by=['violation_ratio'],ascending=False)['violation_ratio'].plot.hist(bins=100).set_title(\"Inverse Normal Constraint - Violation Ratios\")" ] }, { "cell_type": "markdown", "id": "working-stable", "metadata": {}, "source": [ "### Find out time required" ] }, { "cell_type": "code", "execution_count": 55, "id": "saved-twelve", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2e8a241c831b4968ae22d06c22c6e85e", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/122 [00:00" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "sns.lineplot(data=pd.Series(times)).set_title(\"Distribution of times (in s) taken for inverse constraint checks\")" ] }, { "cell_type": "code", "execution_count": null, "id": "numerical-month", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "stuck-criticism", "metadata": {}, "source": [ "# Analysis on properties with constraints" ] }, { "cell_type": "code", "execution_count": 26, "id": "driven-reference", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[2021-04-03 09:14:12 query]: SQL Translation:\r\n", "---------------------------------------------\r\n", " SELECT *\r\n", " FROM graph_1 AS graph_1_c1\r\n", " WHERE graph_1_c1.\"label\"=?\r\n", " PARAS: ['P2302']\r\n", "---------------------------------------------\r\n" ] } ], "source": [ "!kgtk --debug query -i ../../gdrive-kgtk-dump-2020-12-07/claims.properties.tsv.gz \\\n", " ../../gdrive-kgtk-dump-2020-12-07/qualifiers.properties.tsv.gz \\\n", " --match \"p: (nodeProp1)-[nodePropEdge:P2302]->()\" \\\n", " -o ../../constraintsOP/claims.constraints_list.tsv \\\n", " --graph-cache ~/sqlite3_caches/temp1345.valuetype.sqlite3.db" ] }, { "cell_type": "code", 
"execution_count": 39, "id": "exciting-focus", "metadata": {}, "outputs": [], "source": [ "!kgtk unique -i ../../gdrive-kgtk-dump-2020-12-07/claims.properties.tsv.gz --column node1 -o ../../constraintsOP/claims.propList.tsv" ] }, { "cell_type": "code", "execution_count": 42, "id": "flush-romania", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "node1\tlabel\tnode2\r\n", "P10\tcount\t17\r\n", "P1000\tcount\t10\r\n", "P1001\tcount\t26\r\n", "P1002\tcount\t9\r\n", "P1003\tcount\t20\r\n", "P1004\tcount\t33\r\n", "P1005\tcount\t21\r\n", "P1006\tcount\t26\r\n", "P1007\tcount\t19\r\n" ] } ], "source": [ "!head ../../constraintsOP/claims.propList.tsv" ] }, { "cell_type": "code", "execution_count": 43, "id": "chemical-harris", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "props = pd.read_csv(\"../../constraintsOP/claims.constraints_list.tsv\", sep='\\t')" ] }, { "cell_type": "code", "execution_count": 44, "id": "higher-underground", "metadata": {}, "outputs": [], "source": [ "props2 = props.groupby(['node1']).node2.apply(list)" ] }, { "cell_type": "code", "execution_count": 45, "id": "light-appreciation", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8100" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(props2)" ] }, { "cell_type": "code", "execution_count": 48, "id": "yellow-helmet", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2336, 8100)" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cnt = 0\n", "totalCnt = 0\n", "for prop in props2.index:\n", " totalCnt += 1\n", " if not(os.path.isfile(\"../../propertiesSplitWRemoved2/claims.\"+ prop +\".tsv\")):\n", " continue\n", " else:\n", " cnt += 1\n", "cnt, totalCnt" ] }, { "cell_type": "code", "execution_count": 50, "id": "detected-skiing", "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "node1\n", "P10 
[Q21502404, Q21510851, Q21510852, Q52004125, Q...\n", "P1000 [Q21510856, Q21510865, Q53869507]\n", "P1001 [Q21502838, Q21503250, Q21510865, Q25796498]\n", "P1002 [Q21503250, Q21510865]\n", "P1003 [Q19474404, Q21502404, Q21502410, Q21510851, Q...\n", " ... \n", "P1563 [Q19474404, Q21502404, Q21502410, Q21503247, Q...\n", "P1564 [Q19474404, Q21502404, Q21502410, Q21503247, Q...\n", "P1565 [Q19474404, Q21502404, Q21502410, Q21503247, Q...\n", "P1566 [Q19474404, Q21502404, Q21502410, Q21502838, Q...\n", "P1567 [Q19474404, Q21502404, Q21502410, Q21502838, Q...\n", "Name: node2, Length: 500, dtype: object" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "props2.head(500)" ] },
{ "cell_type": "code", "execution_count": 32, "id": "processed-perfume", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "# Keep the raw frame under its own name so the grouping cell below stays re-runnable.\n", "propList = pd.read_csv(\"../../constraintsOP/claims.propList.tsv\", sep='\\t')" ] },
{ "cell_type": "code", "execution_count": 33, "id": "increasing-graphics", "metadata": {}, "outputs": [], "source": [ "# Derive props2 from propList rather than from props2 itself: the self-referential\n", "# form fails on a second run because props2 is then already a grouped Series.\n", "props2 = propList.groupby(['node1']).node2.apply(list)" ] },
{ "cell_type": "code", "execution_count": 34, "id": "posted-ukraine", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "8193" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(props2)" ] },
{ "cell_type": "code", "execution_count": 35, "id": "fifth-provision", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2415, 8193)" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Count the properties in props2 that have a per-property claims file on disk.\n", "# Displays (number of properties with a file, total number of properties).\n", "claimsFiles = [\"../../propertiesSplitWRemoved2/claims.\" + prop + \".tsv\" for prop in props2.index]\n", "totalCnt = len(claimsFiles)\n", "cnt = sum(os.path.isfile(path) for path in claimsFiles)\n", "cnt, totalCnt" ] },
{ "cell_type": "code", "execution_count": 50, "id": "married-heating", "metadata": { "scrolled": true }, "outputs": [ { "data": { 
"text/plain": [ "node1\n", "P10 [Q21502404, Q21510851, Q21510852, Q52004125, Q...\n", "P1000 [Q21510856, Q21510865, Q53869507]\n", "P1001 [Q21502838, Q21503250, Q21510865, Q25796498]\n", "P1002 [Q21503250, Q21510865]\n", "P1003 [Q19474404, Q21502404, Q21502410, Q21510851, Q...\n", " ... \n", "P1563 [Q19474404, Q21502404, Q21502410, Q21503247, Q...\n", "P1564 [Q19474404, Q21502404, Q21502410, Q21503247, Q...\n", "P1565 [Q19474404, Q21502404, Q21502410, Q21503247, Q...\n", "P1566 [Q19474404, Q21502404, Q21502410, Q21502838, Q...\n", "P1567 [Q19474404, Q21502404, Q21502410, Q21502838, Q...\n", "Name: node2, Length: 500, dtype: object" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "props2.head(500)" ] }, { "cell_type": "code", "execution_count": null, "id": "magnetic-conditions", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgtkEnv", "language": "python", "name": "kgtkenv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" }, "toc": { "base_numbering": 1, "nav_menu": {}, "number_sections": true, "sideBar": true, "skip_h1_title": false, "title_cell": "Table of Contents", "title_sidebar": "Contents", "toc_cell": false, "toc_position": { "height": "calc(100% - 180px)", "left": "10px", "top": "150px", "width": "318px" }, "toc_section_display": true, "toc_window_display": true }, "varInspector": { "cols": { "lenName": 16, "lenType": 16, "lenVar": 40 }, "kernels_config": { "python": { "delete_cmd_postfix": "", "delete_cmd_prefix": "del ", "library": "var_list.py", "varRefreshCmd": "print(var_dic_list())" }, "r": { "delete_cmd_postfix": ") ", "delete_cmd_prefix": "rm(", "library": "var_list.r", "varRefreshCmd": "cat(var_dic_list()) " } }, "oldHeight": 122, "position": { "height": "40px", "left": 
"1170px", "right": "20px", "top": "120px", "width": "250px" }, "types_to_exclude": [ "module", "function", "builtin_function_or_method", "instance", "_Feature" ], "varInspector_section_display": "none", "window_display": false } }, "nbformat": 4, "nbformat_minor": 5 }