{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-23T18:50:19.036357Z",
     "start_time": "2019-09-23T18:50:19.031896Z"
    }
   },
   "source": [
    "# Datasets\n",
    "\n",
    "A Dataset is a specialization of a `Resource` that aims to register (upload) files with its metadata.\n",
    "\n",
    "Note: commented lines are not implemented on the Demo Store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-09-23T18:50:20.068658Z",
     "start_time": "2019-09-23T18:50:19.054054Z"
    }
   },
   "outputs": [],
   "source": [
    "from kgforge.core import KnowledgeGraphForge"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "forge = KnowledgeGraphForge(\"../../configurations/demo-forge.yml\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kgforge.core import Resource"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kgforge.specializations.resources import Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creation with files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "associations.tsv\n",
      "persons.csv\n"
     ]
    }
   ],
   "source": [
    "! ls -p ../../data | egrep -v /$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "jane = Resource(type=\"Person\", name=\"Jane Doe\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "persons = Dataset(forge, name=\"Interesting Persons\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "persons.add_files(\"../../data/persons.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "persons.add_contribution(jane)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forge.register(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    type: Dataset\n",
      "    contribution:\n",
      "    {\n",
      "        type: Contribution\n",
      "        agent:\n",
      "        {\n",
      "            id:\n",
      "            {\n",
      "                type: Person\n",
      "                name: Jane Doe\n",
      "            }\n",
      "            type: Agent\n",
      "        }\n",
      "    }\n",
      "    hasPart: LazyAction(operation=Store.upload, args=['../../data/persons.csv'])\n",
      "    name: Interesting Persons\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "associations = Dataset(forge, name=\"Associations data\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "associations.add_files(\"../../data/associations.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# associations.add_derivation(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "associations.add_contribution(jane)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forge.register(associations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    type: Dataset\n",
      "    contribution:\n",
      "    {\n",
      "        type: Contribution\n",
      "        agent:\n",
      "        {\n",
      "            id:\n",
      "            {\n",
      "                type: Person\n",
      "                name: Jane Doe\n",
      "            }\n",
      "            type: Agent\n",
      "        }\n",
      "    }\n",
      "    hasPart: LazyAction(operation=Store.upload, args=['../../data/associations.tsv'])\n",
      "    name: Associations data\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(associations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# associations.download(\"files\", \"./downloaded/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ! ls ./downloaded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ! rm -R ./downloaded"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creation with resources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "distribution_1 = forge.attach(\"../../data/associations.tsv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "distribution_2 = forge.attach(\"../../data/persons.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "jane = Resource(type=\"Person\", name=\"Jane Doe\", distribution=distribution_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "john = Resource(type=\"Person\", name=\"John Smith\", distribution=distribution_2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "persons = [jane, john]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forge.register(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = Dataset(forge, name=\"Interesting people\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset.add_parts(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forge.register(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset.download(\"parts\", \"./downloaded/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ! ls ./downloaded"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Creation from a dataframe"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "See notebook `DataFrame IO.ipynb` for details on conversions of instances of Resource from a Pandas DataFrame."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### basics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataframe = pd.read_csv(\"../../data/persons.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "      <th>name</th>\n",
       "      <th>distribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Person</td>\n",
       "      <td>Marie Curie</td>\n",
       "      <td>../../data/scientists-database/marie_curie.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Person</td>\n",
       "      <td>Albert Einstein</td>\n",
       "      <td>../../data/scientists-database/albert_einstein...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     type             name                                       distribution\n",
       "0  Person      Marie Curie     ../../data/scientists-database/marie_curie.txt\n",
       "1  Person  Albert Einstein  ../../data/scientists-database/albert_einstein..."
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "persons = forge.from_dataframe(dataframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<count> 2\n",
      "<action> _register_one\n",
      "<succeeded> True\n"
     ]
    }
   ],
   "source": [
    "forge.register(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = Dataset(forge, name=\"Interesting people\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset.add_parts(persons)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{\n",
      "    type: Dataset\n",
      "    hasPart:\n",
      "    [\n",
      "        {\n",
      "            id: edbc5599-cd7c-4825-b97e-97da45bc0b8c_version=1\n",
      "            type: Person\n",
      "            distribution: ../../data/scientists-database/marie_curie.txt\n",
      "            name: Marie Curie\n",
      "        }\n",
      "        {\n",
      "            id: 6571c60b-e0a2-4d80-846e-0f116a44921a_version=1\n",
      "            type: Person\n",
      "            distribution: ../../data/scientists-database/albert_einstein.txt\n",
      "            name: Albert Einstein\n",
      "        }\n",
      "    ]\n",
      "    name: Interesting people\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<action> _register_one\n",
      "<succeeded> True\n"
     ]
    }
   ],
   "source": [
    "forge.register(dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### advanced"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataframe = pd.read_csv(\"../../data/associations.tsv\", sep=\"\\t\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "      <th>type</th>\n",
       "      <th>agent__type</th>\n",
       "      <th>agent__name</th>\n",
       "      <th>agent__gender__id</th>\n",
       "      <th>agent__gender__type</th>\n",
       "      <th>agent__gender__label</th>\n",
       "      <th>distribution</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>https://kg.example.ch/associations/123</td>\n",
       "      <td>Curie Association</td>\n",
       "      <td>Association</td>\n",
       "      <td>Person</td>\n",
       "      <td>Marie Curie</td>\n",
       "      <td>http://purl.obolibrary.org/obo/PATO_0000383</td>\n",
       "      <td>LabeledOntologyEntity</td>\n",
       "      <td>female</td>\n",
       "      <td>../../data/scientists-database/marie_curie.txt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>(missing)</td>\n",
       "      <td>Einstein Association</td>\n",
       "      <td>Association</td>\n",
       "      <td>Person</td>\n",
       "      <td>Albert Einstein</td>\n",
       "      <td>http://purl.obolibrary.org/obo/PATO_0000384</td>\n",
       "      <td>LabeledOntologyEntity</td>\n",
       "      <td>male</td>\n",
       "      <td>../../data/scientists-database/albert_einstein...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                       id                  name         type  \\\n",
       "0  https://kg.example.ch/associations/123     Curie Association  Association   \n",
       "1                               (missing)  Einstein Association  Association   \n",
       "\n",
       "  agent__type      agent__name                            agent__gender__id  \\\n",
       "0      Person      Marie Curie  http://purl.obolibrary.org/obo/PATO_0000383   \n",
       "1      Person  Albert Einstein  http://purl.obolibrary.org/obo/PATO_0000384   \n",
       "\n",
       "     agent__gender__type agent__gender__label  \\\n",
       "0  LabeledOntologyEntity               female   \n",
       "1  LabeledOntologyEntity                 male   \n",
       "\n",
       "                                        distribution  \n",
       "0     ../../data/scientists-database/marie_curie.txt  \n",
       "1  ../../data/scientists-database/albert_einstein...  "
      ]
     },
     "execution_count": 43,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataframe[\"distribution\"] = dataframe[\"distribution\"].map(lambda x: forge.attach(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "associations = forge.from_dataframe(dataframe, na=\"(missing)\", nesting=\"__\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forge.register(associations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset = Dataset(forge, name=\"Interesting associations\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataset.add_parts(associations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "# print(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# forge.register(dataset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "kgforge(v2)",
   "language": "python",
   "name": "kgforge"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}