{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# DataFrame Conversions\n", "\n", "This notebook demonstrates how to [convert](https://nexus-forge.readthedocs.io/en/latest/interaction.html#converting) a [Resource](https://nexus-forge.readthedocs.io/en/latest/interaction.html#resource) to a pandas DataFrame and vice versa." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:20.068658Z", "start_time": "2019-09-23T18:50:19.054054Z" } }, "outputs": [], "source": [ "from kgforge.core import KnowledgeGraphForge" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A configuration file is needed in order to create a KnowledgeGraphForge session. A configuration can be generated using the notebook [00 - Initialization.ipynb](00%20-%20Initialization.ipynb)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "forge = KnowledgeGraphForge(\"../../configurations/forge.yml\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Imports" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-09-23T18:50:20.127987Z", "start_time": "2019-09-23T18:50:20.119390Z" } }, "outputs": [], "source": [ "from kgforge.core import Resource" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## List of Resources to DataFrame" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "address = Resource(type=\"PostalAddress\", country=\"Switzerland\", locality=\"Geneva\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "jane = Resource(type=\"Person\", name=\"Jane Doe\", address=address, email=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], 
"source": [ "john = Resource(type=\"Person\", name=\"John Smith\", email=\"john.smith@epfl.ch\")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "persons = [jane, john]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 2\n", " _register_many\n", " True\n" ] } ], "source": [ "forge.register(persons)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/99105664-6d99-45a5-90e8-82f58e45f36a',\n", " 'type': 'Person',\n", " 'address': {'type': 'PostalAddress',\n", " 'country': 'Switzerland',\n", " 'locality': 'Geneva'},\n", " 'email': '(missing)',\n", " 'name': 'Jane Doe'}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_json(jane)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/9b4976dc-6fb8-49fb-892d-a634d27eac3b',\n", " 'type': 'Person',\n", " 'email': 'john.smith@epfl.ch',\n", " 'name': 'John Smith'}" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_json(john)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'id': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/9b4976dc-6fb8-49fb-892d-a634d27eac3b',\n", " '_constrainedBy': 'https://bluebrain.github.io/nexus/schemas/unconstrained.json',\n", " '_createdAt': '2022-04-12T22:24:14.009Z',\n", " '_createdBy': 'https://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy',\n", " '_deprecated': False,\n", " '_incoming': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/9b4976dc-6fb8-49fb-892d-a634d27eac3b/incoming',\n", " '_outgoing': 
'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/9b4976dc-6fb8-49fb-892d-a634d27eac3b/outgoing',\n", " '_project': 'https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge',\n", " '_rev': 1,\n", " '_schemaProject': 'https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge',\n", " '_self': 'https://bbp.epfl.ch/nexus/v1/resources/dke/kgforge/_/9b4976dc-6fb8-49fb-892d-a634d27eac3b',\n", " '_updatedAt': '2022-04-12T22:24:14.009Z',\n", " '_updatedBy': 'https://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy'}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "john._store_metadata" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailname
0https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonPostalAddressSwitzerlandGeneva(missing)Jane Doe
1https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person PostalAddress \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person NaN \n", "\n", " address.country address.locality email name \n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It is possible to specify what values (here '(missing)') should be replaced by `NaN` using the `na` parameter." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailname
0https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonPostalAddressSwitzerlandGenevaNaNJane Doe
1https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person PostalAddress \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person NaN \n", "\n", " address.country address.locality email name \n", "0 Switzerland Geneva NaN Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, na=\"(missing)\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `nesting` parameter sets the string used in column names to separate nested properties; the default is a dot (`.`)." ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress__typeaddress__countryaddress__localityemailname
0https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonPostalAddressSwitzerlandGeneva(missing)Jane Doe
1https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " id type address__type \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person PostalAddress \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person NaN \n", "\n", " address__country address__locality email name \n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, nesting=\"__\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The `expanded` parameter will show fields and values according to the JSON-LD context." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
@id@typehttp://schema.org/addresshttp://schema.org/emailhttp://schema.org/name
0https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...[http://schema.org/Person][{'@type': ['https://bbp.epfl.ch/nexus/v1/reso...[{'@value': '(missing)'}][{'@value': 'Jane Doe'}]
1https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...[http://schema.org/Person]NaN[{'@value': 'john.smith@epfl.ch'}][{'@value': 'John Smith'}]
\n", "
" ], "text/plain": [ " @id \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "\n", " @type \\\n", "0 [http://schema.org/Person] \n", "1 [http://schema.org/Person] \n", "\n", " http://schema.org/address \\\n", "0 [{'@type': ['https://bbp.epfl.ch/nexus/v1/reso... \n", "1 NaN \n", "\n", " http://schema.org/email http://schema.org/name \n", "0 [{'@value': '(missing)'}] [{'@value': 'Jane Doe'}] \n", "1 [{'@value': 'john.smith@epfl.ch'}] [{'@value': 'John Smith'}] " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, expanded=True)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idtypeaddress.typeaddress.countryaddress.localityemailname_constrainedBy_createdAt_createdBy_deprecated_incoming_outgoing_project_rev_schemaProject_self_updatedAt_updatedBy
0https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonPostalAddressSwitzerlandGeneva(missing)Jane Doehttps://bluebrain.github.io/nexus/schemas/unco...2022-04-12T22:24:14.013Zhttps://bbp.epfl.ch/nexus/v1/realms/bbp/users/syFalsehttps://bbp.epfl.ch/nexus/v1/resources/dke/kgf...https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge1https://bbp.epfl.ch/nexus/v1/projects/dke/kgforgehttps://bbp.epfl.ch/nexus/v1/resources/dke/kgf...2022-04-12T22:24:14.013Zhttps://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy
1https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...PersonNaNNaNNaNjohn.smith@epfl.chJohn Smithhttps://bluebrain.github.io/nexus/schemas/unco...2022-04-12T22:24:14.009Zhttps://bbp.epfl.ch/nexus/v1/realms/bbp/users/syFalsehttps://bbp.epfl.ch/nexus/v1/resources/dke/kgf...https://bbp.epfl.ch/nexus/v1/resources/dke/kgf...https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge1https://bbp.epfl.ch/nexus/v1/projects/dke/kgforgehttps://bbp.epfl.ch/nexus/v1/resources/dke/kgf...2022-04-12T22:24:14.009Zhttps://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy
\n", "
" ], "text/plain": [ " id type address.type \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person PostalAddress \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... Person NaN \n", "\n", " address.country address.locality email name \\\n", "0 Switzerland Geneva (missing) Jane Doe \n", "1 NaN NaN john.smith@epfl.ch John Smith \n", "\n", " _constrainedBy \\\n", "0 https://bluebrain.github.io/nexus/schemas/unco... \n", "1 https://bluebrain.github.io/nexus/schemas/unco... \n", "\n", " _createdAt _createdBy \\\n", "0 2022-04-12T22:24:14.013Z https://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy \n", "1 2022-04-12T22:24:14.009Z https://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy \n", "\n", " _deprecated _incoming \\\n", "0 False https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "1 False https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "\n", " _outgoing \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "\n", " _project _rev \\\n", "0 https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge 1 \n", "1 https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge 1 \n", "\n", " _schemaProject \\\n", "0 https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge \n", "1 https://bbp.epfl.ch/nexus/v1/projects/dke/kgforge \n", "\n", " _self \\\n", "0 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... \n", "1 https://bbp.epfl.ch/nexus/v1/resources/dke/kgf... 
\n", "\n", " _updatedAt _updatedBy \n", "0 2022-04-12T22:24:14.013Z https://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy \n", "1 2022-04-12T22:24:14.009Z https://bbp.epfl.ch/nexus/v1/realms/bbp/users/sy " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "forge.as_dataframe(persons, store_metadata=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFrame to list of Resources" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame([\n", " {\n", " \"type\": \"Person\",\n", " \"address.type\": \"PostalAddress\",\n", " \"address.country\": \"Switzerland\",\n", " \"address.locality\": \"Geneva\",\n", " \"email\": \"(missing)\",\n", " \"name\": \"Jane Doe\",\n", " },\n", " {\n", " \"type\": \"Person\",\n", " \"address.type\": np.nan,\n", " \"address.country\": np.nan,\n", " \"address.locality\": np.nan,\n", " \"email\": \"john.smith@epfl.ch\",\n", " \"name\": \"John Smith\",\n", " }\n", "])" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
typeaddress.typeaddress.countryaddress.localityemailname
0PersonPostalAddressSwitzerlandGeneva(missing)Jane Doe
1PersonNaNNaNNaNjohn.smith@epfl.chJohn Smith
\n", "
" ], "text/plain": [ " type address.type address.country address.locality email \\\n", "0 Person PostalAddress Switzerland Geneva (missing) \n", "1 Person NaN NaN NaN john.smith@epfl.ch \n", "\n", " name \n", "0 Jane Doe \n", "1 John Smith " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "resources = forge.from_dataframe(data)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "address = Resource(type=\"PostalAddress\", country=\"Switzerland\", locality=\"Geneva\")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "jane = Resource(type=\"Person\", name=\"Jane Doe\", address=address, email=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "john = Resource(type=\"Person\", name=\"John Smith\", email=\"john.smith@epfl.ch\")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "persons = [jane, john]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "resources == persons" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "resources_na = forge.from_dataframe(data, na=\"(missing)\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources[0])" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "{\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources_na[0])" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "resources_nesting = forge.from_dataframe(data, nesting=\".\")" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources_nesting[0])" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "data = pd.DataFrame([\n", " {\n", " \"type\": \"Person\",\n", " \"address_type\": \"PostalAddress\",\n", " \"address_country\": \"Switzerland\",\n", " \"address_locality\": \"Geneva\",\n", " \"email\": \"(missing)\",\n", " \"name\": \"Jane Doe\",\n", " },\n", " {\n", " \"type\": \"Person\",\n", " \"address_type\": np.nan,\n", " \"address_country\": np.nan,\n", " \"address_locality\": np.nan,\n", " \"email\": \"john.smith@epfl.ch\",\n", " \"name\": \"John Smith\",\n", " }\n", "])" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "resources_nesting = forge.from_dataframe(data, nesting=\"_\")" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{\n", " type: Person\n", " address:\n", " {\n", " type: PostalAddress\n", " country: Switzerland\n", " locality: Geneva\n", " }\n", " email: (missing)\n", " name: Jane Doe\n", "}\n" ] } ], "source": [ "print(resources_nesting[0])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.7 (nexusforgelatest)", "language": "python", "name": 
"nexusforgelatest" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.10" } }, "nbformat": 4, "nbformat_minor": 4 }