{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-23T18:50:19.036357Z",
"start_time": "2019-09-23T18:50:19.031896Z"
}
},
"source": [
"# Datasets\n",
"\n",
"A Dataset is a specialization of a `Resource` that aims to register (upload) files with its metadata.\n",
"\n",
"Note: commented lines are not implemented on the Demo Store"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"ExecuteTime": {
"end_time": "2019-09-23T18:50:20.068658Z",
"start_time": "2019-09-23T18:50:19.054054Z"
}
},
"outputs": [],
"source": [
"from kgforge.core import KnowledgeGraphForge"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"forge = KnowledgeGraphForge(\"../../configurations/demo-forge.yml\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"from kgforge.core import Resource"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from kgforge.specializations.resources import Dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creation with files"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"associations.tsv\n",
"persons.csv\n"
]
}
],
"source": [
"! ls -p ../../data | egrep -v /$"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"jane = Resource(type=\"Person\", name=\"Jane Doe\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"persons = Dataset(forge, name=\"Interesting Persons\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"persons.add_files(\"../../data/persons.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"persons.add_contribution(jane)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# forge.register(persons)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" type: Dataset\n",
" contribution:\n",
" {\n",
" type: Contribution\n",
" agent:\n",
" {\n",
" id:\n",
" {\n",
" type: Person\n",
" name: Jane Doe\n",
" }\n",
" type: Agent\n",
" }\n",
" }\n",
" hasPart: LazyAction(operation=Store.upload, args=['../../data/persons.csv'])\n",
" name: Interesting Persons\n",
"}\n"
]
}
],
"source": [
"print(persons)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"associations = Dataset(forge, name=\"Associations data\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"associations.add_files(\"../../data/associations.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# associations.add_derivation(persons)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"associations.add_contribution(jane)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# forge.register(associations)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" type: Dataset\n",
" contribution:\n",
" {\n",
" type: Contribution\n",
" agent:\n",
" {\n",
" id:\n",
" {\n",
" type: Person\n",
" name: Jane Doe\n",
" }\n",
" type: Agent\n",
" }\n",
" }\n",
" hasPart: LazyAction(operation=Store.upload, args=['../../data/associations.tsv'])\n",
" name: Associations data\n",
"}\n"
]
}
],
"source": [
"print(associations)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# associations.download(\"files\", \"./downloaded/\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# ! ls ./downloaded"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"# ! rm -R ./downloaded"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creation with resources"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"distribution_1 = forge.attach(\"../../data/associations.tsv\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"distribution_2 = forge.attach(\"../../data/persons.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"jane = Resource(type=\"Person\", name=\"Jane Doe\", distribution=distribution_1)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"john = Resource(type=\"Person\", name=\"John Smith\", distribution=distribution_2)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"persons = [jane, john]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# forge.register(persons)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"dataset = Dataset(forge, name=\"Interesting people\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# dataset.add_parts(persons)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# forge.register(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"# dataset.download(\"parts\", \"./downloaded/\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# ! ls ./downloaded"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creation from a dataframe"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"See notebook `DataFrame IO.ipynb` for details on conversions of instances of Resource from a Pandas DataFrame."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### basics"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"dataframe = pd.read_csv(\"../../data/persons.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" type | \n",
" name | \n",
" distribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Person | \n",
" Marie Curie | \n",
" ../../data/scientists-database/marie_curie.txt | \n",
"
\n",
" \n",
" 1 | \n",
" Person | \n",
" Albert Einstein | \n",
" ../../data/scientists-database/albert_einstein... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" type name distribution\n",
"0 Person Marie Curie ../../data/scientists-database/marie_curie.txt\n",
"1 Person Albert Einstein ../../data/scientists-database/albert_einstein..."
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataframe"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"persons = forge.from_dataframe(dataframe)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2\n",
" _register_one\n",
" True\n"
]
}
],
"source": [
"forge.register(persons)"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"dataset = Dataset(forge, name=\"Interesting people\")"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"dataset.add_parts(persons)"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\n",
" type: Dataset\n",
" hasPart:\n",
" [\n",
" {\n",
" id: edbc5599-cd7c-4825-b97e-97da45bc0b8c_version=1\n",
" type: Person\n",
" distribution: ../../data/scientists-database/marie_curie.txt\n",
" name: Marie Curie\n",
" }\n",
" {\n",
" id: 6571c60b-e0a2-4d80-846e-0f116a44921a_version=1\n",
" type: Person\n",
" distribution: ../../data/scientists-database/albert_einstein.txt\n",
" name: Albert Einstein\n",
" }\n",
" ]\n",
" name: Interesting people\n",
"}\n"
]
}
],
"source": [
"print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" _register_one\n",
" True\n"
]
}
],
"source": [
"forge.register(dataset)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### advanced"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"dataframe = pd.read_csv(\"../../data/associations.tsv\", sep=\"\\t\")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id | \n",
" name | \n",
" type | \n",
" agent__type | \n",
" agent__name | \n",
" agent__gender__id | \n",
" agent__gender__type | \n",
" agent__gender__label | \n",
" distribution | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" https://kg.example.ch/associations/123 | \n",
" Curie Association | \n",
" Association | \n",
" Person | \n",
" Marie Curie | \n",
" http://purl.obolibrary.org/obo/PATO_0000383 | \n",
" LabeledOntologyEntity | \n",
" female | \n",
" ../../data/scientists-database/marie_curie.txt | \n",
"
\n",
" \n",
" 1 | \n",
" (missing) | \n",
" Einstein Association | \n",
" Association | \n",
" Person | \n",
" Albert Einstein | \n",
" http://purl.obolibrary.org/obo/PATO_0000384 | \n",
" LabeledOntologyEntity | \n",
" male | \n",
" ../../data/scientists-database/albert_einstein... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id name type \\\n",
"0 https://kg.example.ch/associations/123 Curie Association Association \n",
"1 (missing) Einstein Association Association \n",
"\n",
" agent__type agent__name agent__gender__id \\\n",
"0 Person Marie Curie http://purl.obolibrary.org/obo/PATO_0000383 \n",
"1 Person Albert Einstein http://purl.obolibrary.org/obo/PATO_0000384 \n",
"\n",
" agent__gender__type agent__gender__label \\\n",
"0 LabeledOntologyEntity female \n",
"1 LabeledOntologyEntity male \n",
"\n",
" distribution \n",
"0 ../../data/scientists-database/marie_curie.txt \n",
"1 ../../data/scientists-database/albert_einstein... "
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dataframe"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"dataframe[\"distribution\"] = dataframe[\"distribution\"].map(lambda x: forge.attach(x))"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"associations = forge.from_dataframe(dataframe, na=\"(missing)\", nesting=\"__\")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"# forge.register(associations)"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"# dataset = Dataset(forge, name=\"Interesting associations\")"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"# dataset.add_parts(associations)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# print(dataset)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# forge.register(dataset)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "kgforge(v2)",
"language": "python",
"name": "kgforge"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}