{ "cells": [ { "cell_type": "markdown", "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:19.036357Z", "start_time": "2019-09-23T18:50:19.031896Z" } }, "source": [ "# Datasets\n", "\n", "A `Dataset` is a specialization of `Resource` that aims to register (upload) files together with their metadata.\n", "\n", "Note: commented-out lines are not implemented on the Demo Store." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:20.068658Z", "start_time": "2019-09-23T18:50:19.054054Z" } }, "outputs": [], "source": [ "from kgforge.core import KnowledgeGraphForge" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "forge = KnowledgeGraphForge(\"../../configurations/demo-forge.yml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from kgforge.core import Resource" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from kgforge.specializations.resources import Dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creation with files" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "associations.tsv\n", "persons.csv\n" ] } ], "source": [ "! 
ls -p ../../data | egrep -v /$" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "jane = Resource(type=\"Person\", name=\"Jane Doe\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "persons = Dataset(forge, name=\"Interesting Persons\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "persons.add_files(\"../../data/persons.csv\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "persons.add_contribution(jane)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# forge.register(persons)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Dataset\n", " contribution:\n", " {\n", " type: Contribution\n", " agent:\n", " {\n", " id:\n", " {\n", " type: Person\n", " name: Jane Doe\n", " }\n", " type: Agent\n", " }\n", " }\n", " hasPart: LazyAction(operation=Store.upload, args=['../../data/persons.csv'])\n", " name: Interesting Persons\n", "}\n" ] } ], "source": [ "print(persons)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "associations = Dataset(forge, name=\"Associations data\")" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "associations.add_files(\"../../data/associations.tsv\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# associations.add_derivation(persons)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "associations.add_contribution(jane)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# forge.register(associations)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { 
"name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Dataset\n", " contribution:\n", " {\n", " type: Contribution\n", " agent:\n", " {\n", " id:\n", " {\n", " type: Person\n", " name: Jane Doe\n", " }\n", " type: Agent\n", " }\n", " }\n", " hasPart: LazyAction(operation=Store.upload, args=['../../data/associations.tsv'])\n", " name: Associations data\n", "}\n" ] } ], "source": [ "print(associations)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# associations.download(\"files\", \"./downloaded/\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# ! ls ./downloaded" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# ! rm -R ./downloaded" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creation with resources" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "distribution_1 = forge.attach(\"../../data/associations.tsv\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "distribution_2 = forge.attach(\"../../data/persons.csv\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "jane = Resource(type=\"Person\", name=\"Jane Doe\", distribution=distribution_1)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "john = Resource(type=\"Person\", name=\"John Smith\", distribution=distribution_2)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "persons = [jane, john]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# forge.register(persons)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "dataset = Dataset(forge, name=\"Interesting people\")" ] }, { "cell_type": "code", "execution_count": 
29, "metadata": {}, "outputs": [], "source": [ "# dataset.add_parts(persons)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# print(dataset)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# forge.register(dataset)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# dataset.download(\"parts\", \"./downloaded/\")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# ! ls ./downloaded" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Creation from a dataframe" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "See the notebook `DataFrame IO.ipynb` for details on converting a pandas DataFrame into instances of `Resource`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### basics" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "dataframe = pd.read_csv(\"../../data/persons.csv\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typenamedistribution
0PersonMarie Curie../../data/scientists-database/marie_curie.txt
1PersonAlbert Einstein../../data/scientists-database/albert_einstein...
\n", "
" ], "text/plain": [ " type name distribution\n", "0 Person Marie Curie ../../data/scientists-database/marie_curie.txt\n", "1 Person Albert Einstein ../../data/scientists-database/albert_einstein..." ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataframe" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "persons = forge.from_dataframe(dataframe)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2\n", " _register_one\n", " True\n" ] } ], "source": [ "forge.register(persons)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "dataset = Dataset(forge, name=\"Interesting people\")" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "dataset.add_parts(persons)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Dataset\n", " hasPart:\n", " [\n", " {\n", " id: edbc5599-cd7c-4825-b97e-97da45bc0b8c_version=1\n", " type: Person\n", " distribution: ../../data/scientists-database/marie_curie.txt\n", " name: Marie Curie\n", " }\n", " {\n", " id: 6571c60b-e0a2-4d80-846e-0f116a44921a_version=1\n", " type: Person\n", " distribution: ../../data/scientists-database/albert_einstein.txt\n", " name: Albert Einstein\n", " }\n", " ]\n", " name: Interesting people\n", "}\n" ] } ], "source": [ "print(dataset)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " _register_one\n", " True\n" ] } ], "source": [ "forge.register(dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### advanced" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "dataframe = 
pd.read_csv(\"../../data/associations.tsv\", sep=\"\\t\")" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idnametypeagent__typeagent__nameagent__gender__idagent__gender__typeagent__gender__labeldistribution
0https://kg.example.ch/associations/123Curie AssociationAssociationPersonMarie Curiehttp://purl.obolibrary.org/obo/PATO_0000383LabeledOntologyEntityfemale../../data/scientists-database/marie_curie.txt
1(missing)Einstein AssociationAssociationPersonAlbert Einsteinhttp://purl.obolibrary.org/obo/PATO_0000384LabeledOntologyEntitymale../../data/scientists-database/albert_einstein...
\n", "
" ], "text/plain": [ " id name type \\\n", "0 https://kg.example.ch/associations/123 Curie Association Association \n", "1 (missing) Einstein Association Association \n", "\n", " agent__type agent__name agent__gender__id \\\n", "0 Person Marie Curie http://purl.obolibrary.org/obo/PATO_0000383 \n", "1 Person Albert Einstein http://purl.obolibrary.org/obo/PATO_0000384 \n", "\n", " agent__gender__type agent__gender__label \\\n", "0 LabeledOntologyEntity female \n", "1 LabeledOntologyEntity male \n", "\n", " distribution \n", "0 ../../data/scientists-database/marie_curie.txt \n", "1 ../../data/scientists-database/albert_einstein... " ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataframe" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "dataframe[\"distribution\"] = dataframe[\"distribution\"].map(lambda x: forge.attach(x))" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "associations = forge.from_dataframe(dataframe, na=\"(missing)\", nesting=\"__\")" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "# forge.register(associations)" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# dataset = Dataset(forge, name=\"Interesting associations\")" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "# dataset.add_parts(associations)" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "# print(dataset)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# forge.register(dataset)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgforge(v2)", "language": "python", "name": "kgforge" }, "language_info": { "codemirror_mode": 
{ "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }