{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Meta functionalities of the EpiGraphDB platform" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In this notebook we show the following aspects of the EpiGraphDB platform, and how to use the API to get the information:\n", "\n", "1. Metadata: meta nodes and meta edges, and the overall schema.\n", "2. Search for a specific node under a meta node.\n", "3. Cypher: how to query the database directly using Neo4j Cypher.\n", "\n", "For detailed documentation on the API endpoints please visit:\n", "\n", "- The Swagger interface: http://api.epigraphdb.org\n", "- The sections regarding API endpoints on the documentation site: http://docs.epigraphdb.org/api/api-endpoints/" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pprint import pformat\n", "\n", "import networkx as nx\n", "import pandas as pd\n", "import requests" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "API_URL = \"https://api.epigraphdb.org\"\n", "\n", "requests.get(f\"{API_URL}/ping\").json()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Metadata" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we query the metadata information using the endpoint `GET /meta/schema`; the result will be used for downstream processing." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "dict_keys(['nodes', 'edges', 'connections']) \n", "\n", "# nodes:\n", "{'Disease': {'count': 21829,\n", " 'properties': {'definition': {'indexed': False,\n", " 'type': 'STRING',\n", " 'unique': False},\n", " 'doid': {'indexed': False,\n", " 'type': 'LIST',\n", " 'unique': False},\n", " 'efo': {'indexed': False,\n", " 'type': 'LIST',\n", " 'unique': False},\n", " 'icd10': {'indexed': False,\n", " 'type': 'LIST',\n", " 'unique': False},\n", " 'icd9': {'indexed': False,\n", " 'type': 'LIST',\n", " 'unique': False},\n", " 'id': {'indexed': True,\n", " 'type': 'STRING',\n", " \n", "\n", "# edges:\n", "{'BN_GEN_COR': {'count': 904832,\n", " 'properties': {'gcov_int': {'array': False, 'type': 'FLOAT'},\n", " 'gcov_int_se': {'array': False, 'type': 'FLOAT'},\n", " 'h2_int': {'array': False, 'type': 'FLOAT'},\n", " 'h2_int_se': {'array': False, 'type': 'FLOAT'},\n", " 'h2_obs': {'array': False, 'type': 'FLOAT'},\n", " 'h2_obs_se': {'array': False, 'type': 'FLOAT'},\n", " 'p': {'array': False, 'type': 'FLOAT'},\n", " 'rg': {'array': False, 'type': 'FLOAT'},\n", " 'se': {'array': False, 'type': 'FLOAT'},\n", " 'z': {'array': False, 'type': 'FLOAT'}}},\n", " 'CPIC': {'count': 355,\n", " 'properties': {'cpic_level': {'array': False, 'type': 'STRING'},\n", " 'guideline': {'array': False, 'type': 'STRING'},\n", " 'pgx_on_fda_ \n", "\n", "# connections:\n", "[{'count': 2486,\n", " 'from_node': 'Drug',\n", " 'rel': 'OPENTARGETS_DRUG_TO_DISEASE',\n", " 'to_node': 'Disease'},\n", " {'count': 3414,\n", " 'from_node': 'Disease',\n", " 'rel': 'MONDO_MAP_UMLS',\n", " 'to_node': 'SemmedTerm'},\n", " {'count': 626,\n", " 'from_node': 'Protein',\n", " 'rel': 'PROTEIN_TO_DISEASE',\n", " 'to_node': 'Disease'},\n", " {'count': 2822,\n", " 'from_node': 'Disease',\n", " 'rel': 'MONDO_MAP_EFO',\n", " 'to_node': 'Efo'},\n", " {'count': 541,\n", " 'from_node': 
'Pathway',\n", " 'rel': 'PATHWAY_TO_DISEASE',\n", " 'to_node': 'Disease'},\n", " {'count': 41706,\n", " 'from_node': 'SemmedTerm',\n", " 'rel': 'SEM_GENE',\n", " 'to_node': 'Gene'},\n", " {'count': 3428531,\n", " 'from_node': 'SemmedTriple',\n", " 'rel': 'SEM_SUB',\n", " 'to_node': 'SemmedTerm'},\n", " {'count': 2081,\n", " 'from_node': 'Gwas',\n", " 'rel': 'METAMAP_LITE',\n", " 'to_node': 'SemmedTerm'},\n", " {'count': 3428531,\n", " 'from_node': 'SemmedTerm',\n", " 'rel': 'SEM_PREDICATE',\n", " 'to_node': 'SemmedTerm'},\n", " {'count': 3428531,\n", " 'from_node': 'SemmedTriple',\n", " 'rel': 'SEM_OBJ',\n", " 'to_node': 'SemmedTerm'},\n", " {'count': 12488,\n", " 'from_n \n", "\n" ] } ], "source": [ "endpoint = \"/meta/schema\"\n", "params = {\"graphviz\": False, \"plot\": False}\n", "r = requests.get(f\"{API_URL}{endpoint}\", params=params)\n", "r.raise_for_status()\n", "metadata = r.json()\n", "\n", "# Preview of metadata information\n", "keys = metadata.keys()\n", "print(pformat(keys), \"\\n\")\n", "for key in list(keys):\n", " print(f\"# {key}:\")\n", " print(pformat(metadata[key])[:1000], \"\\n\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Meta nodes\n", "\n", "We can extract the specific meta node information as a pandas dataframe from the metadata." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countproperties
Disease21,829{'doid': {'type': 'LIST', 'indexed': False, 'u...
Drug2,455{'molecule_type': {'type': 'STRING', 'indexed'...
Efo25,390{'type': {'type': 'STRING', 'indexed': False, ...
Event11,868{'name': {'type': 'STRING', 'indexed': False, ...
Gene59,171{'druggability_tier': {'type': 'STRING', 'inde...
Gwas31,773{'note': {'type': 'STRING', 'indexed': False, ...
Literature29,137,785{'pubmed_id': {'type': 'STRING', 'indexed': Tr...
Pathway2,180{'name': {'type': 'STRING', 'indexed': False, ...
Protein21,543{'uniprot_id': {'type': 'STRING', 'indexed': T...
SemmedTerm103,967{'name': {'type': 'STRING', 'indexed': True, '...
SemmedTriple3,428,531{'subject_id': {'type': 'STRING', 'indexed': F...
Tissue53{'tissue': {'type': 'STRING', 'indexed': True,...
Variant88,176{'name': {'type': 'STRING', 'indexed': True, '...
\n", "
" ], "text/plain": [ " count properties\n", "Disease 21,829 {'doid': {'type': 'LIST', 'indexed': False, 'u...\n", "Drug 2,455 {'molecule_type': {'type': 'STRING', 'indexed'...\n", "Efo 25,390 {'type': {'type': 'STRING', 'indexed': False, ...\n", "Event 11,868 {'name': {'type': 'STRING', 'indexed': False, ...\n", "Gene 59,171 {'druggability_tier': {'type': 'STRING', 'inde...\n", "Gwas 31,773 {'note': {'type': 'STRING', 'indexed': False, ...\n", "Literature 29,137,785 {'pubmed_id': {'type': 'STRING', 'indexed': Tr...\n", "Pathway 2,180 {'name': {'type': 'STRING', 'indexed': False, ...\n", "Protein 21,543 {'uniprot_id': {'type': 'STRING', 'indexed': T...\n", "SemmedTerm 103,967 {'name': {'type': 'STRING', 'indexed': True, '...\n", "SemmedTriple 3,428,531 {'subject_id': {'type': 'STRING', 'indexed': F...\n", "Tissue 53 {'tissue': {'type': 'STRING', 'indexed': True,...\n", "Variant 88,176 {'name': {'type': 'STRING', 'indexed': True, '..." ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "meta_node_df = pd.DataFrame.from_dict(metadata[\"nodes\"], orient=\"index\")\n", "\n", "(\n", " meta_node_df.sort_index().assign(\n", " count=lambda df: df[\"count\"].apply(lambda x: f\"{x:,}\")\n", " )\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Meta relationships and connections\n", "\n", "We can also extract the meta relationship (edge) information, and the connections." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countpropertiesfrom_nodeto_node
MONDO_MAP_EFO2,822NoneDiseaseEfo
MONDO_MAP_UMLS3,414NoneDiseaseSemmedTerm
OPENTARGETS_DRUG_TO_DISEASE2,486NoneDrugDisease
CPIC355{'pharmgkb_level_of_evidence': {'array': False...DrugGene
OPENTARGETS_DRUG_TO_TARGET6,024{'phase': {'array': False, 'type': 'STRING'}, ...DrugGene
EFO_CHILD_OF43,154NoneEfoEfo
PRECEDING_EVENT10,418NoneEventEvent
INTACT_INTERACTS_WITH_GENE_GENE2{'intact_confidence_score': {'array': False, '...GeneGene
XQTL_MULTI_SNP_MR3,098,049{'p': {'array': False, 'type': 'FLOAT'}, 'se':...GeneGwas
XQTL_SINGLE_SNP_MR_GENE_GWAS8,703,863{'p': {'array': False, 'type': 'FLOAT'}, 'rsid...GeneGwas
GENE_TO_LITERATURE771NoneGeneLiterature
INTACT_INTERACTS_WITH_GENE_PROTEIN1,451{'intact_confidence_score': {'array': False, '...GeneProtein
GENE_TO_PROTEIN20,762NoneGeneProtein
EXPRESSED_IN861,552{'tpm': {'array': False, 'type': 'FLOAT'}}GeneTissue
GWAS_NLP_EFO6,936{'score': {'array': False, 'type': 'FLOAT'}}GwasEfo
BN_GEN_COR904,832{'p': {'array': False, 'type': 'FLOAT'}, 'se':...GwasGwas
PRS132,703{'p': {'array': False, 'type': 'FLOAT'}, 'r2':...GwasGwas
MR583,619{'b': {'array': False, 'type': 'FLOAT'}, 'se':...GwasGwas
OBS_COR17,932{'cor': {'array': False, 'type': 'FLOAT'}}GwasGwas
GWAS_NLP30,838,964{'score': {'array': False, 'type': 'FLOAT'}}GwasGwas
GWAS_TO_LIT19,079,468NoneGwasLiterature
METAMAP_LITE2,081{'mmi_score': {'array': False, 'type': 'FLOAT'}}GwasSemmedTerm
GWAS_SEM9,075,020{'globalTotal': {'array': False, 'type': 'INTE...GwasSemmedTriple
TOPHITS122,730{'pval': {'array': False, 'type': 'FLOAT'}, 'b...GwasVariant
GWAS_TO_VARIANT26,521{'se': {'array': False, 'type': 'FLOAT'}, 'nca...GwasVariant
PATHWAY_TO_DISEASE541NonePathwayDisease
EVENT_IN_PATHWAY12,488NonePathwayEvent
PATHWAY_TO_LITERATURE8,952NonePathwayLiterature
PROTEIN_TO_DISEASE626NoneProteinDisease
PROTEIN_IN_EVENT13,484NoneProteinEvent
PROTEIN_TO_LITERATURE107,315NoneProteinLiterature
PROTEIN_IN_PATHWAY9,955NoneProteinPathway
INTACT_NOT_INTERACTS_WITH699{'intact_confidence_score': {'array': False, '...ProteinProtein
STRING_INTERACT_WITH390,222{'score': {'array': False, 'type': 'INTEGER'}}ProteinProtein
INTACT_INTERACTS_WITH_PROTEIN_PROTEIN187,426{'intact_confidence_score': {'array': False, '...ProteinProtein
SEM_GENE41,706NoneSemmedTermGene
SEM_PREDICATE3,428,531{'count': {'array': False, 'type': 'INTEGER'},...SemmedTermSemmedTerm
SEM_TO_LIT6,127,985NoneSemmedTripleLiterature
SEM_OBJ3,428,531NoneSemmedTripleSemmedTerm
SEM_SUB3,428,531NoneSemmedTripleSemmedTerm
VARIANT_TO_GENE59,157{'feature_type': {'array': False, 'type': 'STR...VariantGene
XQTL_SINGLE_SNP_MR_SNP_GENE41,564NoneVariantGene
\n", "
" ], "text/plain": [ " count \\\n", "MONDO_MAP_EFO 2,822 \n", "MONDO_MAP_UMLS 3,414 \n", "OPENTARGETS_DRUG_TO_DISEASE 2,486 \n", "CPIC 355 \n", "OPENTARGETS_DRUG_TO_TARGET 6,024 \n", "EFO_CHILD_OF 43,154 \n", "PRECEDING_EVENT 10,418 \n", "INTACT_INTERACTS_WITH_GENE_GENE 2 \n", "XQTL_MULTI_SNP_MR 3,098,049 \n", "XQTL_SINGLE_SNP_MR_GENE_GWAS 8,703,863 \n", "GENE_TO_LITERATURE 771 \n", "INTACT_INTERACTS_WITH_GENE_PROTEIN 1,451 \n", "GENE_TO_PROTEIN 20,762 \n", "EXPRESSED_IN 861,552 \n", "GWAS_NLP_EFO 6,936 \n", "BN_GEN_COR 904,832 \n", "PRS 132,703 \n", "MR 583,619 \n", "OBS_COR 17,932 \n", "GWAS_NLP 30,838,964 \n", "GWAS_TO_LIT 19,079,468 \n", "METAMAP_LITE 2,081 \n", "GWAS_SEM 9,075,020 \n", "TOPHITS 122,730 \n", "GWAS_TO_VARIANT 26,521 \n", "PATHWAY_TO_DISEASE 541 \n", "EVENT_IN_PATHWAY 12,488 \n", "PATHWAY_TO_LITERATURE 8,952 \n", "PROTEIN_TO_DISEASE 626 \n", "PROTEIN_IN_EVENT 13,484 \n", "PROTEIN_TO_LITERATURE 107,315 \n", "PROTEIN_IN_PATHWAY 9,955 \n", "INTACT_NOT_INTERACTS_WITH 699 \n", "STRING_INTERACT_WITH 390,222 \n", "INTACT_INTERACTS_WITH_PROTEIN_PROTEIN 187,426 \n", "SEM_GENE 41,706 \n", "SEM_PREDICATE 3,428,531 \n", "SEM_TO_LIT 6,127,985 \n", "SEM_OBJ 3,428,531 \n", "SEM_SUB 3,428,531 \n", "VARIANT_TO_GENE 59,157 \n", "XQTL_SINGLE_SNP_MR_SNP_GENE 41,564 \n", "\n", " properties \\\n", "MONDO_MAP_EFO None \n", "MONDO_MAP_UMLS None \n", "OPENTARGETS_DRUG_TO_DISEASE None \n", "CPIC {'pharmgkb_level_of_evidence': {'array': False... \n", "OPENTARGETS_DRUG_TO_TARGET {'phase': {'array': False, 'type': 'STRING'}, ... \n", "EFO_CHILD_OF None \n", "PRECEDING_EVENT None \n", "INTACT_INTERACTS_WITH_GENE_GENE {'intact_confidence_score': {'array': False, '... \n", "XQTL_MULTI_SNP_MR {'p': {'array': False, 'type': 'FLOAT'}, 'se':... \n", "XQTL_SINGLE_SNP_MR_GENE_GWAS {'p': {'array': False, 'type': 'FLOAT'}, 'rsid... \n", "GENE_TO_LITERATURE None \n", "INTACT_INTERACTS_WITH_GENE_PROTEIN {'intact_confidence_score': {'array': False, '... 
\n", "GENE_TO_PROTEIN None \n", "EXPRESSED_IN {'tpm': {'array': False, 'type': 'FLOAT'}} \n", "GWAS_NLP_EFO {'score': {'array': False, 'type': 'FLOAT'}} \n", "BN_GEN_COR {'p': {'array': False, 'type': 'FLOAT'}, 'se':... \n", "PRS {'p': {'array': False, 'type': 'FLOAT'}, 'r2':... \n", "MR {'b': {'array': False, 'type': 'FLOAT'}, 'se':... \n", "OBS_COR {'cor': {'array': False, 'type': 'FLOAT'}} \n", "GWAS_NLP {'score': {'array': False, 'type': 'FLOAT'}} \n", "GWAS_TO_LIT None \n", "METAMAP_LITE {'mmi_score': {'array': False, 'type': 'FLOAT'}} \n", "GWAS_SEM {'globalTotal': {'array': False, 'type': 'INTE... \n", "TOPHITS {'pval': {'array': False, 'type': 'FLOAT'}, 'b... \n", "GWAS_TO_VARIANT {'se': {'array': False, 'type': 'FLOAT'}, 'nca... \n", "PATHWAY_TO_DISEASE None \n", "EVENT_IN_PATHWAY None \n", "PATHWAY_TO_LITERATURE None \n", "PROTEIN_TO_DISEASE None \n", "PROTEIN_IN_EVENT None \n", "PROTEIN_TO_LITERATURE None \n", "PROTEIN_IN_PATHWAY None \n", "INTACT_NOT_INTERACTS_WITH {'intact_confidence_score': {'array': False, '... \n", "STRING_INTERACT_WITH {'score': {'array': False, 'type': 'INTEGER'}} \n", "INTACT_INTERACTS_WITH_PROTEIN_PROTEIN {'intact_confidence_score': {'array': False, '... \n", "SEM_GENE None \n", "SEM_PREDICATE {'count': {'array': False, 'type': 'INTEGER'},... \n", "SEM_TO_LIT None \n", "SEM_OBJ None \n", "SEM_SUB None \n", "VARIANT_TO_GENE {'feature_type': {'array': False, 'type': 'STR... 
\n", "XQTL_SINGLE_SNP_MR_SNP_GENE None \n", "\n", " from_node to_node \n", "MONDO_MAP_EFO Disease Efo \n", "MONDO_MAP_UMLS Disease SemmedTerm \n", "OPENTARGETS_DRUG_TO_DISEASE Drug Disease \n", "CPIC Drug Gene \n", "OPENTARGETS_DRUG_TO_TARGET Drug Gene \n", "EFO_CHILD_OF Efo Efo \n", "PRECEDING_EVENT Event Event \n", "INTACT_INTERACTS_WITH_GENE_GENE Gene Gene \n", "XQTL_MULTI_SNP_MR Gene Gwas \n", "XQTL_SINGLE_SNP_MR_GENE_GWAS Gene Gwas \n", "GENE_TO_LITERATURE Gene Literature \n", "INTACT_INTERACTS_WITH_GENE_PROTEIN Gene Protein \n", "GENE_TO_PROTEIN Gene Protein \n", "EXPRESSED_IN Gene Tissue \n", "GWAS_NLP_EFO Gwas Efo \n", "BN_GEN_COR Gwas Gwas \n", "PRS Gwas Gwas \n", "MR Gwas Gwas \n", "OBS_COR Gwas Gwas \n", "GWAS_NLP Gwas Gwas \n", "GWAS_TO_LIT Gwas Literature \n", "METAMAP_LITE Gwas SemmedTerm \n", "GWAS_SEM Gwas SemmedTriple \n", "TOPHITS Gwas Variant \n", "GWAS_TO_VARIANT Gwas Variant \n", "PATHWAY_TO_DISEASE Pathway Disease \n", "EVENT_IN_PATHWAY Pathway Event \n", "PATHWAY_TO_LITERATURE Pathway Literature \n", "PROTEIN_TO_DISEASE Protein Disease \n", "PROTEIN_IN_EVENT Protein Event \n", "PROTEIN_TO_LITERATURE Protein Literature \n", "PROTEIN_IN_PATHWAY Protein Pathway \n", "INTACT_NOT_INTERACTS_WITH Protein Protein \n", "STRING_INTERACT_WITH Protein Protein \n", "INTACT_INTERACTS_WITH_PROTEIN_PROTEIN Protein Protein \n", "SEM_GENE SemmedTerm Gene \n", "SEM_PREDICATE SemmedTerm SemmedTerm \n", "SEM_TO_LIT SemmedTriple Literature \n", "SEM_OBJ SemmedTriple SemmedTerm \n", "SEM_SUB SemmedTriple SemmedTerm \n", "VARIANT_TO_GENE Variant Gene \n", "XQTL_SINGLE_SNP_MR_SNP_GENE Variant Gene " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "meta_rel_df = pd.DataFrame.from_dict(metadata[\"edges\"], orient=\"index\").merge(\n", " pd.DataFrame.from_dict(\n", " {_[\"rel\"]: _ for _ in metadata[\"connections\"]}, orient=\"index\"\n", " )[[\"from_node\", \"to_node\"]],\n", " left_index=True,\n", " right_index=True,\n", 
")\n", "\n", "(\n", " meta_rel_df.sort_values(by=[\"from_node\", \"to_node\"]).assign(\n", " count=lambda df: df[\"count\"].apply(lambda x: f\"{x:,}\")\n", " )\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Schema plot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can generate a network diagram of the graph db schema using `networkx`." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "graph = nx.from_pandas_edgelist(\n", " meta_rel_df, source=\"from_node\", target=\"to_node\"\n", ")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "nx.draw(\n", " G=graph,\n", " pos=nx.kamada_kawai_layout(graph),\n", " with_labels=True,\n", " node_color=\"white\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A detailed version of the schema plot can be obtained from the API:\n", "\n", "![schema_plot](http://ieu-mrbssd1.epi.bris.ac.uk:28046/meta/schema?graphviz=true&plot=true)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Search for specific node" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Users can use [the explorer on the Web UI](http://dev.epigraphdb.org/explore) to search for a specific node by:\n", "\n", "- fuzzy matching by the \"name\" field.\n", "- exact matching by the \"ID\" field if you know its ID (e.g. the ID of a GWAS from the IEU GWAS Database).\n", "\n", "Here we show how these are done at the API level using `Gwas` nodes as an example." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "First we need to know what the \"ID\" and \"name\" fields are for the meta nodes using `GET /meta/nodes/id-name-schema`:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Disease': {'id': 'id', 'name': 'label'},\n", " 'Drug': {'id': 'label', 'name': 'label'},\n", " 'Efo': {'id': 'id', 'name': 'value'},\n", " 'Event': {'id': 'reactome_id', 'name': 'name'},\n", " 'Gene': {'id': 'ensembl_id', 'name': 'name'},\n", " 'Tissue': {'id': 'tissue', 'name': 'tissue'},\n", " 'Gwas': {'id': 'id', 'name': 'trait'},\n", " 'Literature': {'id': 'pubmed_id', 'name': 'pubmed_id'},\n", " 'Pathway': {'id': 'reactome_id', 'name': 'name'},\n", " 'Protein': {'id': 'uniprot_id', 'name': 'uniprot_id'},\n", " 'SemmedTerm': {'id': 'id', 'name': 'name'},\n", " 'Variant': {'id': 'name', 'name': 'name'}}" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "r = 
requests.get(f\"{API_URL}/meta/nodes/id-name-schema\")\n", "r.raise_for_status()\n", "\n", "meta_node_fields = r.json()\n", "meta_node_fields" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Fuzzy matching" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Here we search for nodes that contain \"body mass index\" in their traits." ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'metadata': {'empty_results': False,\n", " 'query': 'MATCH (node: Gwas) WHERE node.trait =~ \"(?i).*body '\n", " 'mass index.*\" RETURN node LIMIT 10;',\n", " 'total_seconds': 0.009114},\n", " 'results': [{'node': {'access': 'public',\n", " 'author': 'Hoffmann TJ',\n", " 'category': 'NA',\n", " 'consortium': 'NA',\n", " 'id': 'ebi-a-GCST006368',\n", " 'mr': '1',\n", " 'note': 'NA',\n", " 'nsnp': '27854527',\n", " 'pmid': '30108127',\n", " 'population': 'European',\n", " 'priority': '0',\n", " 'sample_size': '315347',\n", " 'sex': 'NA',\n", " 'subcategory': 'NA',\n", " 'trait': 'Body mass index',\n", " 'unit': 'NA',\n", " 'year': '2018'}},\n", " {'node': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-2',\n", " 'mr': '1',\n", " 'nsnp': '2555511',\n", " 'pmid': '25673413',\n", " 'population': 'Mixed',\n", " 'priority': '1',\n", " 'sample_size': '339224',\n", " 'sd': '4.77',\n", " 'sex': 'Males and Females',\n", " 'subcategory': 'Anthropometric',\n", " 'trait': 'Body mass index',\n", " 'unit': 'NA',\n", " 'year': '2015'}},\n", " {'node': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-785',\n", " 'mr': '1',\n", " 'nsnp': '2477659',\n", " 'pmid': '25673413',\n", " 'population': 'European',\n", " 'priority': '2',\n", " 'sample_size': '152893',\n", " 'sd': '4.77',\n", " 'sex': 'Males',\n", " 'subcategory': 'Anthropometric',\n", " 
'trait': 'Body mass index',\n", " 'unit': 'NA',\n", " 'year': '2015'}},\n", " {'node': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-835',\n", " 'mr': '1',\n", " 'nsnp': '2554668',\n", " 'pmid': '25673413',\n", " 'population': 'European',\n", " 'priority': '3',\n", " 'sample_size': '322154',\n", " 'sd': '4.77',\n", " 'sex': 'Males and Females',\n", "\n" ] } ], "source": [ "name = \"body mass index\"\n", "\n", "r = requests.get(f\"{API_URL}/meta/nodes/Gwas/search\", params={\"name\": name})\n", "r.raise_for_status()\n", "\n", "print(pformat(r.json())[:3000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Exact matching" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similarly, we can exact match a specific node by its ID." ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'metadata': {'empty_results': False,\n", " 'query': 'MATCH (node: Gwas {id: \"ieu-a-2\"}) RETURN node LIMIT '\n", " '10;',\n", " 'total_seconds': 0.002578},\n", " 'results': [{'node': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-2',\n", " 'mr': '1',\n", " 'nsnp': '2555511',\n", " 'pmid': '25673413',\n", " 'population': 'Mixed',\n", " 'priority': '1',\n", " 'sample_size': '339224',\n", " 'sd': '4.77',\n", " 'sex': 'Males and Females',\n", " 'subcategory': 'Anthropometric',\n", " 'trait': 'Body mass index',\n", " 'unit': 'NA',\n", " 'year': '2015'}}]}\n" ] } ], "source": [ "id = \"ieu-a-2\"\n", "\n", "r = requests.get(f\"{API_URL}/meta/nodes/Gwas/search\", params={\"id\": id})\n", "r.raise_for_status()\n", "\n", "print(pformat(r.json())[:3000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Cypher (advanced)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Advanced users that are familiar 
with Neo4j Cypher can query the database using Cypher directly." ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'metadata': {'empty_results': False,\n", " 'query': 'MATCH (exposure:Gwas)-[mr:MR]->(outcome:Gwas) WHERE '\n", " 'exposure.trait = \"Body mass index\" RETURN exposure, '\n", " 'outcome, mr LIMIT 2',\n", " 'total_seconds': 0.049648},\n", " 'results': [{'exposure': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-2',\n", " 'mr': '1',\n", " 'nsnp': '2555511',\n", " 'pmid': '25673413',\n", " 'population': 'Mixed',\n", " 'priority': '1',\n", " 'sample_size': '339224',\n", " 'sd': '4.77',\n", " 'sex': 'Males and Females',\n", " 'subcategory': 'Anthropometric',\n", " 'trait': 'Body mass index',\n", " 'unit': 'NA',\n", " 'year': '2015'},\n", " 'mr': {'b': 0.0030348598957061768,\n", " 'ci_low': -0.002742477459833026,\n", " 'ci_upp': 0.008812196552753448,\n", " 'log10pval': 1.0,\n", " 'method': 'Weighted median',\n", " 'moescore': 0.7799999713897705,\n", " 'nsnp': 77,\n", " 'pval': 0.3032084107398987,\n", " 'se': 0.002947675297036767,\n", " 'selection': 'DF + HF'},\n", " 'outcome': {'access': 'public',\n", " 'author': 'Neale',\n", " 'category': 'NA',\n", " 'consortium': 'Neale Lab',\n", " 'id': 'ukb-a-99',\n", " 'mr': '1',\n", " 'ncase': '8718',\n", " 'ncontrol': '328441',\n", " 'note': 'NA',\n", " 'nsnp': '10894596',\n", " 'population': 'European',\n", " 'priority': '1',\n", " 'sample_size': '337159',\n", " 'sex': 'Males and Females',\n", " 'subcategory': 'NA',\n", " 'trait': 'Non-cancer illness code self-reported: '\n", " 'eczema/dermatitis',\n", " 'unit': 'SD',\n", " 'year': '2017'}},\n", " {'exposure': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-2',\n", " 'mr': '1',\n", " 'nsnp': '2555511',\n", " 'pmid': 
'25673413',\n", " 'population': 'Mixed',\n", " 'priority': '1',\n", " 'sample_size': '339224',\n", "\n" ] } ], "source": [ "query = \"\"\"\n", " MATCH (exposure:Gwas)-[mr:MR]->(outcome:Gwas) \n", " WHERE exposure.trait = \"Body mass index\"\n", " RETURN exposure, outcome, mr LIMIT 2\n", "\"\"\"\n", "\n", "r = requests.post(f\"{API_URL}/cypher\", json={\"query\": query})\n", "r.raise_for_status()\n", "\n", "print(pformat(r.json())[:3000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, we provide an endpoint `POST /cypher/builder/plain` that assists users in building simple Cypher queries." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'metadata': {'empty_results': False,\n", " 'query': 'MATCH (source_node:Gwas) -[rel:MR]- (target_node:Gwas) '\n", " \"WHERE source_node.trait = 'Body mass index' RETURN \"\n", " 'source_node, rel, target_node LIMIT 2',\n", " 'total_seconds': 0.035151},\n", " 'results': [{'rel': {'b': 0.0030348598957061768,\n", " 'ci_low': -0.002742477459833026,\n", " 'ci_upp': 0.008812196552753448,\n", " 'log10pval': 1.0,\n", " 'method': 'Weighted median',\n", " 'moescore': 0.7799999713897705,\n", " 'nsnp': 77,\n", " 'pval': 0.3032084107398987,\n", " 'se': 0.002947675297036767,\n", " 'selection': 'DF + HF'},\n", " 'source_node': {'access': 'public',\n", " 'author': 'Locke AE',\n", " 'category': 'Risk factor',\n", " 'consortium': 'NA',\n", " 'id': 'ieu-a-2',\n", " 'mr': '1',\n", " 'nsnp': '2555511',\n", " 'pmid': '25673413',\n", " 'population': 'Mixed',\n", " 'priority': '1',\n", " 'sample_size': '339224',\n", " 'sd': '4.77',\n", " 'sex': 'Males and Females',\n", " 'subcategory': 'Anthropometric',\n", " 'trait': 'Body mass index',\n", " 'unit': 'NA',\n", " 'year': '2015'},\n", " 'target_node': {'access': 'public',\n", " 'author': 'Neale',\n", " 'category': 'NA',\n", " 'consortium': 'Neale Lab',\n", " 'id': 'ukb-a-99',\n", " 'mr': 
'1',\n", " 'ncase': '8718',\n", " 'ncontrol': '328441',\n", " 'note': 'NA',\n", " 'nsnp': '10894596',\n", " 'population': 'European',\n", " 'priority': '1',\n", " 'sample_size': '337159',\n", " 'sex': 'Males and Females',\n", " 'subcategory': 'NA',\n", " 'trait': 'Non-cancer illness code '\n", " 'self-reported: eczema/dermatitis',\n", " 'unit': 'SD',\n", " 'year': '2017'}},\n", " {'rel': {'b': -3.053751788684167e-05,\n", " 'ci_low': -0.00041146361036226153,\n", " 'ci_upp': 0.00035038855276070535,\n", " 'log10pval': 0.0,\n", " 'method': 'RE IVW',\n", " 'moescore': 0.800000011920929,\n", " 'nsnp': 79,\n", " 'pval'\n" ] } ], "source": [ "payload = {\n", " \"source_meta_node\": \"Gwas\",\n", " \"target_meta_node\": \"Gwas\",\n", " \"meta_rel\": \"MR\",\n", " \"where\": [\"source_node.trait = 'Body mass index'\"],\n", " \"limit\": 2,\n", "}\n", "\n", "r = requests.post(f\"{API_URL}/cypher/builder/plain\", json=payload)\n", "r.raise_for_status()\n", "\n", "print(pformat(r.json())[:3000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Again for the detailed documentation on the API endpoints please visit:\n", "\n", "- The Swagger interface: http://api.epigraphdb.org\n", "- The sections regarding API endpoints on the documentation site: http://docs.epigraphdb.org/api/api-endpoints/" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" }, "toc-autonumbering": false }, "nbformat": 4, "nbformat_minor": 4 }