{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploring data with SPARQL" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n", "\n", "---\n", "\n", "````{panels}\n", ":container: container-lg pb-3
\n", ":column: col-lg-3 col-md-4 col-sm-6 col-xs-12 p-1
\n", ":card: rounded
\n", "\n", "
\n", "^^^\n", "

Recipe metadata

\n", " identifier: FCB040\n", " version: v1.0\n", "\n", "---\n", "\n", "^^^\n", "

Difficulty level

\n", "\n", "\n", "\n", "\n", "\n", "\n", "---\n", "\n", "^^^\n", "

Reading Time

\n", " 30 minutes\n", "

Recipe Type

\n", " Hands-on\n", "

Executable Code

\n", " Yes\n", "\n", "---\n", "\n", "^^^\n", "

Intended Audience

\n", "\n", "

Data Manager

\n", "

Data Scientist

\n", "````\n", "\n", "---\n", "\n", "Background: \n", "\n", "Here, we show how to query the LinkedData graph using SPARQL to retrieve information about key study design design descriptors such as study group size and treatment groups." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import rdflib\n", "from rdflib import Graph, RDF\n", "from IPython.core.display import display, HTML\n", "import os\n", "import json\n", "import csv\n", "import uuid\n", "\n", "from SPARQLWrapper import SPARQLWrapper, SPARQLWrapper2, JSON, JSONLD, CSV, TSV, N3, RDF, RDFXML, TURTLE\n", "import pandas as pds\n", "import itertools\n", "\n", "import numpy as np\n", "from plotnine import *\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def queryResultToHTMLTable(queryResult):\n", " \n", " HTMLResult = ''\n", " # print variable names and build header:\n", " for varName in queryResult.vars:\n", " HTMLResult = HTMLResult + ''\n", " HTMLResult = HTMLResult + ''\n", "\n", " # print values from each row and build table of results\n", " for row in queryResult:\n", " HTMLResult = HTMLResult + '' \n", " for column in row:\n", " #print(\"COLUMN:\", column)\n", " if column is not \"\":\n", " HTMLResult = HTMLResult + ''\n", " else:\n", " HTMLResult = HTMLResult + ''\n", " HTMLResult = HTMLResult + ''\n", " HTMLResult = HTMLResult + '
' + varName + '
' + str(column) + '' + \"N/A\"+ '
'\n", " display(HTML(HTMLResult))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_sparql_variables(results, sparql_wrapper=\"SPARQLWrapper2\"):\n", "# return results.vars if (\"sparqlwrapper2\" == sparql_wrapper.lower()) else results['head']['vars']\n", " return results.vars if (\"sparqlwrapper2\" == sparql_wrapper.lower()) else results.vars\n", "\n", "\n", "def get_sparql_bindings(results, sparql_wrapper=\"SPARQLWrapper2\"):\n", " return results.bindings if (\"sparqlwrapper2\" == sparql_wrapper.lower()) else results['results']['bindings']\n", "\n", "\n", "def get_sparql_binding_variable_value(binding, variable, sparql_wrapper=\"SPARQLWrapper2\"):\n", " return binding[variable] if (\"sparqlwrapper2\" == sparql_wrapper.lower()) else binding[variable]['value']\n", "\n", "def make_sparql_dict_list(bindings, variables, sparql_wrapper=\"SPARQLWrapper2\"):\n", " def binding_value(binding, var): # helper function for returning values\n", " return get_sparql_binding_variable_value(binding, var, sparql_wrapper) if (var in binding) else None\n", "\n", " dict_list = [] # list to contain dictionaries\n", " for binding in itertools.chain(bindings):\n", " values = [binding_value(binding, var) for var in itertools.chain(variables)]\n", " dict_list.append(dict(zip(variables, values)))\n", "\n", " return dict_list\n", "\n", "\n", "def make_sparql_df(results, sparql_wrapper=\"SPARQLWrapper2\"):\n", " variables = get_sparql_variables(results, sparql_wrapper)\n", " # print(variables)\n", "\n", " cleaned_variables=[str(var.replace('\\\\n','')) for var in variables] \n", "\n", " #print(cleaned_variables)\n", " bindings = get_sparql_bindings(results, sparql_wrapper)\n", " # print(bindings)\n", " \n", " # create a list of dictionaries to use as data for dataframe\n", " data_list = make_sparql_dict_list(bindings, cleaned_variables, sparql_wrapper)\n", " \n", " # print(data_list)\n", "\n", " df = pds.DataFrame(data_list) # create dataframe from data list\n", " df[\"sample_mean\"] = df[\"sample_mean\"].astype(\"float\")\n", "\n", " # print(df[\"sample_mean\"])\n", " return df[cleaned_variables] # return dataframe with columns reordered" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "*credits to Bob du Charme for the following function [http://www.snee.com/bobdc.blog/2016/07/sparql-in-a-jupyter-aka-ipytho.html]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "g = Graph()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's read the RDF graph generated using the rose-dtpkg2rdf.py python script and saved to disk as a turtle file" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#g.parse(\"./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-subset.ttl\", format=\"n3\")\n", "g.parse(\"./../data/processed/rose-data-as-rdf/rose-aroma-ng-06-2018-full.ttl\", format=\"n3\")\n", "# g.parse(\"./../data/processed/denovo/rdf/rose-aroma-ng-06-2018-full.ttl\", format=\"n3\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's ask for the independent variables and their levels using the following SPARQL query" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_idv_and_levels = g.query(\"\"\"\n", "PREFIX stato: \n", "prefix rdfs: \n", "PREFIX ncbitax: \n", "prefix has_part: \n", " SELECT DISTINCT\n", " ?Predictor\n", " ?PredictorLevel\n", " WHERE { \n", " ?var a stato:0000087 ;\n", " rdfs:label ?Predictor ;\n", " has_part: ?value .\n", " ?value rdfs:label ?PredictorLevel . \n", " } \n", "\"\"\")\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can display the results of that query using the function declared earlier on" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "queryResultToHTMLTable(get_idv_and_levels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's now ask for the number of biological and technical replicates used to compute the mean concentration of the chemical compounds detected and forming the signature of the rose fragrance" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_replication_info = g.query(\"\"\"\n", "prefix rdfs: \n", "prefix chmo: \n", "prefix msio: \n", "prefix stato: \n", "prefix obi: \n", "prefix ro: \n", "prefix po: \n", "prefix has_member: \n", "prefix has_value: \n", "prefix computed_from: \n", "prefix has_specified_input: \n", "prefix is_specified_output_of: \n", "prefix is_about: \n", "\n", "SELECT \n", " ?TreatmentGroup \n", " ?ChemicalCompound\n", " ?MeanConcentration\n", " (count(distinct ?member) as ?NbTechnicalReplicate) \n", " (count(distinct ?input) as ?NbBiologicalReplicate) \n", " WHERE {\n", " ?population a stato:0000193 ;\n", " rdfs:label ?TreatmentGroup ;\n", " has_member: ?member . \n", " ?member has_specified_input: ?input . \n", " ?mean a stato:0000402 ;\n", " computed_from: ?population ;\n", " has_value: ?MeanConcentration ;\n", " is_about: ?ChemicalCompound .\n", " ?concentration a stato:0000072;\n", " is_about: ?ChemicalCompound .\n", "}\n", " GROUP BY ?population \n", "\"\"\")\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Once more, we invoke the pretty printing function" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "queryResultToHTMLTable(get_replication_info)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "get_all_data = g.query(\"\"\"\n", "prefix rdfs: \n", "prefix chmo: \n", "prefix msio: \n", "prefix stato: \n", "prefix obi: \n", "prefix ro: \n", "prefix po: \n", "prefix chebi: \n", "prefix has_value: \n", "prefix computed_from: \n", "prefix is_about: \n", "prefix is_denoted_by: \n", "prefix derives_from: \n", "prefix located_in: \n", "prefix denotes: \n", "prefix measured_in: \n", "\n", "SELECT REDUCED ?chemical_name ?chebi_identifier ?inchi ?sample_mean ?sem ?treatment ?genotype ?organism_part\n", "WHERE {\n", " ?pop_mean a stato:0000402 ;\n", " is_about: ?chebi_identifier ;\n", " computed_from: ?population ;\n", " has_value: ?sample_mean .\n", " ?chem a ?chebi_identifier ;\n", " rdfs:label ?chemical_name ;\n", " is_denoted_by: ?inchi .\n", " ?semv a stato:0000037 ; \n", " denotes: ?pop_mean ;\n", " has_value: ?sem.\n", " ?population a stato:0000193 ;\n", " rdfs:label ?treatment .\n", " ?sub_conc a stato:0000072 ;\n", " derives_from: ?genotype ;\n", " located_in: ?organism_part;\n", " measured_in: ?population . \n", "} \n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "queryResultToHTMLTable(get_all_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "data=make_sparql_df(get_all_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# width = figure_size[0]\n", "# height = figure_size[0] * aspect_ratio\n", "gray = '#666666'\n", "orange = '#FF8000'\n", "blue = '#3333FF'\n", "\n", "p1 = (ggplot(data)\n", " + aes('chemical_name','sample_mean',fill='factor(treatment)')\n", " + geom_col()\n", "+ facet_wrap('~treatment', dir='v',ncol=1)\n", " + scale_y_continuous(expand = (0,0))\n", " + theme(axis_text_x=element_text(rotation=90, hjust=1, fontsize=6, color=blue))\n", " + theme(axis_text_y=element_text(rotation=0, hjust=2, fontsize=6, color=orange))\n", " + theme(figure_size = (8, 16))\n", ")\n", "\n", "p1 + theme(panel_background=element_rect(fill=blue)\n", " )\n", "\n", "p1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "venv372", "language": "python", "name": "venv372" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" } }, "nbformat": 4, "nbformat_minor": 2 }