{ "cells": [ { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:19.036357Z", "start_time": "2019-09-23T18:50:19.031896Z" } }, "source": [ "# Mapping\n", "\n", "Mappings are pre-defined configuration files that encode the logic on how to transform a specific data source into Resources that follow a template of a targeted _Type_. \n", "\n", "This notebook demonstrates the `DictionaryMapping`, which is based on a JSON structure that represents the target structure and on Python code that applies the desired transformations to the data source." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:20.068658Z", "start_time": "2019-09-23T18:50:19.054054Z" } }, "outputs": [], "source": [ "from kgforge.core import KnowledgeGraphForge" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "forge = KnowledgeGraphForge(\"../../configurations/demo-forge.yml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from kgforge.core import Resource" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from kgforge.specializations.mappings import DictionaryMapping" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "scientists = [\n", " {\n", " \"id\": 123,\n", " \"name\": \"Marie Curie\",\n", " \"gender\": \"female\",\n", " \"middle_name\": \"Salomea\",\n", " },\n", " {\n", " \"id\": 456,\n", " \"name\": \"Albert Einstein\",\n", " \"gender\": \"male\",\n", " \"middle_name\": \"(missing)\",\n", " },\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mapping data to the Knowledge Graph Schema" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### basics" ] }, { 
"cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " DemoModel does not distinguish values and constraints in templates for now.\n", " DemoModel does not automatically include nested schemas for now.\n", "{\n", " type: Association\n", " agent:\n", " {\n", " type: Person\n", " name: hasattr\n", " }\n", "}\n" ] } ], "source": [ "forge.template(\"Association\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "mapping_simple = DictionaryMapping(\"\"\"\n", " type: Association\n", " agent:\n", " {\n", " type: Person\n", " name: x.name\n", " }\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "resources_simple = forge.map(scientists, mapping_simple)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Association\n", " agent:\n", " {\n", " type: Person\n", " name: Marie Curie\n", " }\n", "}\n" ] } ], "source": [ "print(resources_simple[0])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### missing values" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "mapping_na = DictionaryMapping(\"\"\"\n", " type: Association\n", " agent:\n", " {\n", " type: Person\n", " name: x.name\n", " additionalName: x.middle_name\n", " }\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Association\n", " agent:\n", " {\n", " type: Person\n", " additionalName: (missing)\n", " name: Albert Einstein\n", " }\n", "}\n" ] } ], "source": [ "print(forge.map(scientists[1], mapping_na))" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Association\n", " 
agent:\n", " {\n", " type: Person\n", " name: Albert Einstein\n", " }\n", "}\n" ] } ], "source": [ "print(forge.map(scientists[1], mapping_na, na=\"(missing)\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### multiple mappings" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "mapping_person = DictionaryMapping(\"\"\"\n", " id: forge.format(\"identifier\", \"persons\", x.id)\n", " type: Person\n", " name: x.name\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "mapping_association = DictionaryMapping(\"\"\"\n", " type: Association\n", " agent: forge.format(\"identifier\", \"persons\", x.id)\n", "\"\"\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "resources_graph = forge.map(scientists, [mapping_person, mapping_association])" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: https://kg.example.ch/persons/123\n", " type: Person\n", " name: Marie Curie\n", "}\n" ] } ], "source": [ "print(resources_graph[0])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Association\n", " agent: https://kg.example.ch/persons/123\n", "}\n" ] } ], "source": [ "print(resources_graph[1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### managed mappings" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Data sources with managed mappings:\n", " - allen-cell-types-database\n", " - scientists-database\n" ] } ], "source": [ "forge.sources()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Managed mappings for the data source per entity 
type and mapping type:\n", " - Association:\n", " * DictionaryMapping\n" ] } ], "source": [ "forge.mappings(\"scientists-database\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "mapping = forge.mapping(\"Association\", \"scientists-database\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "resources = forge.map(scientists, mapping, na=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(resources)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "kgforge.core.resource.Resource" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(resources[0])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Association\n", " agent:\n", " {\n", " id: forge.format(\"identifier\", \"persons\", x.id)\n", " type: Person\n", " additionalName: x.middle_name\n", " gender: forge.resolve(x.gender, scope=\"terms\")\n", " name: x.name\n", " }\n", " distribution: forge.attach(f\"../../data/scientists-database/{'_'.join(x.name.lower().split())}.txt\")\n", "}\n" ] } ], "source": [ "print(mapping)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Association\n", " agent:\n", " {\n", " id: https://kg.example.ch/persons/123\n", " type: Person\n", " additionalName: Salomea\n", " gender:\n", " {\n", " id: http://purl.obolibrary.org/obo/PATO_0000383\n", " label: female\n", " }\n", " name: Marie Curie\n", " }\n", " distribution: LazyAction(operation=Store.upload, args=['../../data/scientists-database/marie_curie.txt'])\n", 
"}\n" ] } ], "source": [ "print(resources[0])" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# forge.register(resources)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Managing mappings" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "filepath = \"mappings/scientists-database/DictionaryMapping/Association.hjson\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### saving" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "mapping.save(filepath)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### tracking & sharing changes" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# ! cd mappings" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# ! git add Association.hjson" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# ! git commit -m \"Add Association mapping\"" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# ! 
git push" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### loading" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "loaded = DictionaryMapping.load(filepath)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# loaded == mapping" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgforge(v2)", "language": "python", "name": "kgforge" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }