{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DataFrame Conversions\n", "\n", "This notebook demonstrates how to convert Resources to a pandas DataFrame and vice versa." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:20.068658Z", "start_time": "2019-09-23T18:50:19.054054Z" } }, "outputs": [], "source": [ "from kgforge.core import KnowledgeGraphForge" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "forge = KnowledgeGraphForge(\"../../configurations/demo-forge.yml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from kgforge.core import Resource" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## List of Resources to DataFrame" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "address = Resource(type=\"PostalAddress\", country=\"Switzerland\", locality=\"Geneva\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "jane = Resource(id=\"33532569-70eb-4648-a7f1-f7ea22b0ce38\", type=\"Person\", name=\"Jane Doe\", address=address, email=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "john = Resource(id=\"45e018f4-9ade-4ad0-bdcf-63902bf51cc1\", type=\"Person\", name=\"John Smith\", email=\"john.smith@epfl.ch\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "persons = [jane, john]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2\n", " _register_one\n", " True\n" ] } ], "source": [
"forge.register(persons)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: 33532569-70eb-4648-a7f1-f7ea22b0ce38\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(jane)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: 45e018f4-9ade-4ad0-bdcf-63902bf51cc1\n", " type: Person\n", " email: john.smith@epfl.ch\n", " name: John Smith\n", "}\n" ] } ], "source": [ "print(john)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'version': 1, 'deprecated': False}\n" ] } ], "source": [ "print(john._store_metadata)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailname
033532569-70eb-4648-a7f1-f7ea22b0ce38PersonPostalAddressSwitzerlandGeneva(missing)Jane Doe
145e018f4-9ade-4ad0-bdcf-63902bf51cc1PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 33532569-70eb-4648-a7f1-f7ea22b0ce38 Person PostalAddress \n", "1 45e018f4-9ade-4ad0-bdcf-63902bf51cc1 Person NaN \n", "\n", " address.country address.locality email name \n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailname
033532569-70eb-4648-a7f1-f7ea22b0ce38PersonPostalAddressSwitzerlandGenevaNaNJane Doe
145e018f4-9ade-4ad0-bdcf-63902bf51cc1PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 33532569-70eb-4648-a7f1-f7ea22b0ce38 Person PostalAddress \n", "1 45e018f4-9ade-4ad0-bdcf-63902bf51cc1 Person NaN \n", "\n", " address.country address.locality email name \n", "0 Switzerland Geneva NaN Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, na=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress__typeaddress__countryaddress__localityemailname
033532569-70eb-4648-a7f1-f7ea22b0ce38PersonPostalAddressSwitzerlandGeneva(missing)Jane Doe
145e018f4-9ade-4ad0-bdcf-63902bf51cc1PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address__type \\\n", "0 33532569-70eb-4648-a7f1-f7ea22b0ce38 Person PostalAddress \n", "1 45e018f4-9ade-4ad0-bdcf-63902bf51cc1 Person NaN \n", "\n", " address__country address__locality email name \n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, nesting=\"__\")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
@id@typeschema:name
0file:///Users/agarcia/Developments/kgforge/exa...schema:PersonJane Doe
1file:///Users/agarcia/Developments/kgforge/exa...schema:PersonJohn Smith
\n", "
" ], "text/plain": [ " @id @type \\\n", "0 file:///Users/agarcia/Developments/kgforge/exa... schema:Person \n", "1 file:///Users/agarcia/Developments/kgforge/exa... schema:Person \n", "\n", " schema:name \n", "0 Jane Doe \n", "1 John Smith " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, expanded=True)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailnamedeprecatedversion
033532569-70eb-4648-a7f1-f7ea22b0ce38PersonPostalAddressSwitzerlandGeneva(missing)Jane DoeFalse1
145e018f4-9ade-4ad0-bdcf-63902bf51cc1PersonNaNNaNNaNjohn.smith@epfl.chJohn SmithFalse1
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 33532569-70eb-4648-a7f1-f7ea22b0ce38 Person PostalAddress \n", "1 45e018f4-9ade-4ad0-bdcf-63902bf51cc1 Person NaN \n", "\n", " address.country address.locality email name \\\n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith \n", "\n", " deprecated version \n", "0 False 1 \n", "1 False 1 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, store_metadata=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame to list of Resources" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame([\n", " {\n", " \"id\": \"33532569-70eb-4648-a7f1-f7ea22b0ce38\",\n", " \"type\": \"Person\",\n", " \"address.type\": \"PostalAddress\",\n", " \"address.country\": \"Switzerland\",\n", " \"address.locality\": \"Geneva\",\n", " \"email\": \"(missing)\",\n", " \"name\": \"Jane Doe\",\n", " },\n", " {\n", " \"id\": \"45e018f4-9ade-4ad0-bdcf-63902bf51cc1\",\n", " \"type\": \"Person\",\n", " \"address.type\": np.nan,\n", " \"address.country\": np.nan,\n", " \"address.locality\": np.nan,\n", " \"email\": \"john.smith@epfl.ch\",\n", " \"name\": \"John Smith\",\n", " }\n", "])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailname
033532569-70eb-4648-a7f1-f7ea22b0ce38PersonPostalAddressSwitzerlandGeneva(missing)Jane Doe
145e018f4-9ade-4ad0-bdcf-63902bf51cc1PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 33532569-70eb-4648-a7f1-f7ea22b0ce38 Person PostalAddress \n", "1 45e018f4-9ade-4ad0-bdcf-63902bf51cc1 Person NaN \n", "\n", " address.country address.locality email name \n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "resources = forge.from_dataframe(data)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "address = Resource(type=\"PostalAddress\", country=\"Switzerland\", locality=\"Geneva\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "jane = Resource(id=\"33532569-70eb-4648-a7f1-f7ea22b0ce38\", type=\"Person\", name=\"Jane Doe\", address=address, email=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "john = Resource(id=\"45e018f4-9ade-4ad0-bdcf-63902bf51cc1\", type=\"Person\", name=\"John Smith\", email=\"john.smith@epfl.ch\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "persons = [jane, john]" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resources == persons" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "resources_na = forge.from_dataframe(data, na=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: 33532569-70eb-4648-a7f1-f7ea22b0ce38\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: 
Geneva\n", " }\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources[0])" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: 33532569-70eb-4648-a7f1-f7ea22b0ce38\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources_na[0])" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "resources_nesting = forge.from_dataframe(data, nesting=\"__\")" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: 33532569-70eb-4648-a7f1-f7ea22b0ce38\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources[0])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " id: 33532569-70eb-4648-a7f1-f7ea22b0ce38\n", " type: Person\n", " address.country: Switzerland\n", " address.locality: Geneva\n", " address.type: PostalAddress\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources_nesting[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "kgforge(v2)", "language": "python", "name": "kgforge" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.5" } }, "nbformat": 4, "nbformat_minor": 4 }