{
"cells": [
{
"cell_type": "markdown",
"id": "812c32ec-a4c7-4e62-b185-ef729d8e909f",
"metadata": {},
"source": [
"\n",
"# Data Reading examples\n",
"\n",
"pyJedAI needs as input a pandas.DataFrame. In this notebook we provide some examples of data reading and transformation to DataFrame.\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "86fd9820-c878-415f-a33c-735c22991f99",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "markdown",
"id": "b7ceb258-c9f5-4b10-a631-3bb6cebcbf06",
"metadata": {
"tags": []
},
"source": [
"# CSV Reader\n",
"\n",
"Example Dataset: CORA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "46ecf65d-6dd0-4c90-aafd-944155d4ec06",
"metadata": {},
"outputs": [],
"source": [
"d1 = pd.read_csv(\"../data/der/cora/cora.csv\", sep='|')\n",
"gt = pd.read_csv(\"../data/der/cora/cora_gt.csv\", sep='|', header=None)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "068b9de0-4847-464c-8d9d-ebeaedc51c1c",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Entity Id | \n",
" address | \n",
" author | \n",
" editor | \n",
" institution | \n",
" month | \n",
" note | \n",
" pages | \n",
" publisher | \n",
" title | \n",
" venue | \n",
" volume | \n",
" year | \n",
" Unnamed: 13 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" los alamitos, ca: | \n",
" p. auer, n. cesa-bianchi, y. freund, and r. e.... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" pp. 322-331. | \n",
" ieee computer society press, | \n",
" 'gambling in a rigged casino: the adversarial ... | \n",
" in proc. 36th annual symposium on foundations ... | \n",
" NaN | \n",
" 1995, | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Entity Id address \\\n",
"0 0 los alamitos, ca: \n",
"\n",
" author editor institution month \\\n",
"0 p. auer, n. cesa-bianchi, y. freund, and r. e.... NaN NaN NaN \n",
"\n",
" note pages publisher \\\n",
"0 NaN pp. 322-331. ieee computer society press, \n",
"\n",
" title \\\n",
"0 'gambling in a rigged casino: the adversarial ... \n",
"\n",
" venue volume year \\\n",
"0 in proc. 36th annual symposium on foundations ... NaN 1995, \n",
"\n",
" Unnamed: 13 \n",
"0 NaN "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d1.head(1)"
]
},
{
"cell_type": "markdown",
"id": "21f538d8-c6be-4ac5-b8f2-e6af10e9708c",
"metadata": {
"tags": []
},
"source": [
"# JSON Reader"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9b686608-b220-43ac-acf1-f3eb5046935e",
"metadata": {},
"outputs": [],
"source": [
"d1 = pd.read_json(\"../data/der/cora/cora.json\")\n",
"gt = pd.read_json(\"../data/der/cora/cora_gt.json\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "7c19d6c0-6dcc-4dc8-8a22-1e1a9bbfeb7d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Entity Id | \n",
" address | \n",
" author | \n",
" editor | \n",
" institution | \n",
" month | \n",
" note | \n",
" pages | \n",
" publisher | \n",
" title | \n",
" venue | \n",
" volume | \n",
" year | \n",
" Unnamed: 13 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" los alamitos, ca: | \n",
" p. auer, n. cesa-bianchi, y. freund, and r. e.... | \n",
" None | \n",
" None | \n",
" None | \n",
" None | \n",
" pp. 322-331. | \n",
" ieee computer society press, | \n",
" 'gambling in a rigged casino: the adversarial ... | \n",
" in proc. 36th annual symposium on foundations ... | \n",
" None | \n",
" 1995, | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Entity Id address \\\n",
"0 0 los alamitos, ca: \n",
"\n",
" author editor institution month \\\n",
"0 p. auer, n. cesa-bianchi, y. freund, and r. e.... None None None \n",
"\n",
" note pages publisher \\\n",
"0 None pp. 322-331. ieee computer society press, \n",
"\n",
" title \\\n",
"0 'gambling in a rigged casino: the adversarial ... \n",
"\n",
" venue volume year \\\n",
"0 in proc. 36th annual symposium on foundations ... None 1995, \n",
"\n",
" Unnamed: 13 \n",
"0 NaN "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d1.head(1)"
]
},
{
"cell_type": "markdown",
"id": "5b0e88a7-da53-402d-9033-9409508c4657",
"metadata": {
"tags": []
},
"source": [
"# Excel Reader"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ee000ec0-e6d1-470b-9bd2-63849ac7526d",
"metadata": {},
"outputs": [],
"source": [
"d1 = pd.read_excel(\"../data/der/cora/cora.xlsx\")\n",
"gt = pd.read_excel(\"../data/der/cora/cora_gt.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fccdc4f6-1a6a-410c-bcac-1c48909633ea",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" Entity Id | \n",
" address | \n",
" author | \n",
" editor | \n",
" institution | \n",
" month | \n",
" note | \n",
" pages | \n",
" publisher | \n",
" title | \n",
" venue | \n",
" volume | \n",
" year | \n",
" Unnamed: 13 | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" 0 | \n",
" los alamitos, ca: | \n",
" p. auer, n. cesa-bianchi, y. freund, and r. e.... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" pp. 322-331. | \n",
" ieee computer society press, | \n",
" 'gambling in a rigged casino: the adversarial ... | \n",
" in proc. 36th annual symposium on foundations ... | \n",
" NaN | \n",
" 1995, | \n",
" NaN | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 Entity Id address \\\n",
"0 0 0 los alamitos, ca: \n",
"\n",
" author editor institution month \\\n",
"0 p. auer, n. cesa-bianchi, y. freund, and r. e.... NaN NaN NaN \n",
"\n",
" note pages publisher \\\n",
"0 NaN pp. 322-331. ieee computer society press, \n",
"\n",
" title \\\n",
"0 'gambling in a rigged casino: the adversarial ... \n",
"\n",
" venue volume year \\\n",
"0 in proc. 36th annual symposium on foundations ... NaN 1995, \n",
"\n",
" Unnamed: 13 \n",
"0 NaN "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"d1.head(1)"
]
},
{
"cell_type": "markdown",
"id": "b50e04fa-4985-41dd-a941-4fbd48aaacf5",
"metadata": {
"tags": []
},
"source": [
"# RDF/OWL Reader"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "f696607e-9c41-419b-8267-6ee2a5d93736",
"metadata": {},
"outputs": [],
"source": [
"import rdfpandas as rfd\n",
"import rdflib\n",
"\n",
"rdfd1 = rdflib.Graph().parse('../data/rdf/restaurants/restaurant1.nt')\n",
"rdfd2 = rdflib.Graph().parse('../data/rdf/restaurants/restaurant2.nt')\n",
"\n",
"def rdf_to_df(graph_parsed) -> pd.DataFrame:\n",
" subject = []\n",
" predicate = []\n",
" rdfobject = []\n",
" df = pd.DataFrame(columns=['subject', 'predicate', 'object'])\n",
" for s, p, o in graph_parsed:\n",
" subject.append(s)\n",
" predicate.append(p)\n",
" rdfobject.append(o)\n",
" df['predicate'] = predicate\n",
" df['subject'] = subject\n",
" df['object'] = rdfobject\n",
"\n",
" return df\n",
" \n",
"d1 = rdf_to_df(rdfd1)\n",
"d2 = rdf_to_df(rdfd2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67a021eb-f0ba-4812-8cc6-33436b86c01b",
"metadata": {},
"outputs": [],
"source": [
"d1.head(2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "38fcab96-62b4-4727-a15e-8b6db66735f5",
"metadata": {},
"outputs": [],
"source": [
"d2.head(2)"
]
},
{
"cell_type": "markdown",
"id": "f6c904d6-2c48-40f1-8315-7f16f3a34997",
"metadata": {
"tags": []
},
"source": [
"# Relational DBs Reader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "42a442a9-18fe-480c-a184-31a52583e630",
"metadata": {},
"outputs": [],
"source": [
"from sqlite3 import connect\n",
"conn = connect(':memory:')\n",
"d1.to_sql('d1', conn)\n",
"d2.to_sql('d2', conn)\n",
"gt.to_sql('gt', conn)\n",
"sql_d1 = pd.read_sql('SELECT * FROM d1', conn)\n",
"sql_d2 = pd.read_sql('SELECT * FROM d2', conn)\n",
"sql_gt = pd.read_sql('SELECT * FROM gt', conn)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f4656e17-c1d1-4d86-abd4-2c97b1e20743",
"metadata": {},
"outputs": [],
"source": [
"sql_d1.head(1)"
]
},
{
"cell_type": "markdown",
"id": "6913af6c-d636-48f7-8eb0-66151b0961d7",
"metadata": {},
"source": [
"### PostgreSQL"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c26c8c4b-dab4-447b-a475-0a0898966d34",
"metadata": {},
"outputs": [],
"source": [
"from sqlalchemy import create_engine\n",
"\n",
"POSTGRES_ADDRESS = 'db' ## INSERT YOUR DB ADDRESS\n",
"POSTGRES_PORT = '5439'\n",
"POSTGRES_USERNAME = 'username' ## CHANGE THIS TO YOUR POSTGRES USERNAME\n",
"POSTGRES_PASSWORD = 'root' ## CHANGE THIS TO YOUR POSTGRES PASSWORD\n",
"POSTGRES_DBNAME = 'database' ## CHANGE THIS TO YOUR DATABASE NAME\n",
"postgres_str = ('postgresql://{username}:{password}@{ipaddress}:{port}/{dbname}'.format(\n",
" username=POSTGRES_USERNAME,\n",
" password=POSTGRES_PASSWORD,\n",
" ipaddress=POSTGRES_ADDRESS,\n",
" port=POSTGRES_PORT,\n",
" dbname=POSTGRES_DBNAME\n",
"))\n",
"\n",
"# Create the connection\n",
"cnx = create_engine(postgres_str)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "262610a8-0ebc-449c-8697-3aae43757242",
"metadata": {},
"outputs": [],
"source": [
"pd.read_sql('SELECT * FROM d1', cnx)"
]
},
{
"cell_type": "markdown",
"id": "3baad4a6-5ceb-491a-8802-1df1bbb0853e",
"metadata": {
"tags": []
},
"source": [
"# SPARKQL Reader\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0b2d3ba-431f-4c9e-a3d7-eaace1e9207b",
"metadata": {},
"outputs": [],
"source": [
"from pandas import json_normalize\n",
"from SPARQLWrapper import SPARQLWrapper, JSON\n",
"\n",
"sparql = SPARQLWrapper(\"http://dbpedia.org/sparql\")\n",
"sparql.setQuery(\"\"\"\n",
" SELECT *\n",
" WHERE\n",
" {\n",
" ?athlete rdfs:label \"Cristiano Ronaldo\"@en ;\n",
" dbo:birthPlace ?place .\n",
" ?place a dbo:City ;\n",
" rdfs:label ?cityName .\n",
" }\n",
"\"\"\")\n",
"sparql.setReturnFormat(JSON)\n",
"results = sparql.query().convert()\n",
"d1 = json_normalize(results[\"results\"][\"bindings\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c63b73c-8ddd-449f-a88e-9026bd6d6070",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"d1"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}