# Datasets

A `Dataset` is a specialization of `Resource` for registering (uploading) files together with their metadata.

Note: the commented-out lines use operations that are not implemented by the Demo Store.

In [1]:
from kgforge.core import KnowledgeGraphForge

In [2]:
# Create the KnowledgeGraphForge instance from the demo configuration file;
# later cells pass this `forge` object to Dataset(...) and call its methods.
forge = KnowledgeGraphForge("../../configurations/demo-forge.yml")

## Imports

In [3]:
from kgforge.core import Resource

In [4]:
from kgforge.specializations.resources import Dataset

In [5]:
import pandas as pd

## Creation with files

In [6]:
! ls -p ../../data | egrep -v /$

associations.tsv
persons.csv


In [7]:
# A plain Resource describing a person; it is added below as the contributor
# of both the `persons` and the `associations` datasets.
jane = Resource(type="Person", name="Jane Doe")

In [8]:
# Create a Dataset named "Interesting Persons"; the forge instance is passed
# as its first argument.
persons = Dataset(forge, name="Interesting Persons")

In [9]:
# Add the CSV file to the dataset. The printed dataset further down shows it
# under `hasPart` as LazyAction(operation=Store.upload, args=[<path>]).
persons.add_files("../../data/persons.csv")

In [10]:
# Record `jane` as a contributor. The printed dataset further down shows a
# `contribution` of type Contribution whose `agent` (type Agent) wraps jane.
persons.add_contribution(jane)

In [11]:
# forge.register(persons)

In [12]:
# Print the dataset; the indented text below is its printed representation.
print(persons)

{
    type: Dataset
    contribution:
    {
        type: Contribution
        agent:
        {
            id:
            {
                type: Person
                name: Jane Doe
            }
            type: Agent
        }
    }
    hasPart: LazyAction(operation=Store.upload, args=['../../data/persons.csv'])
    name: Interesting Persons
}


In [13]:
# Second Dataset, named "Associations data", created with the same forge instance.
associations = Dataset(forge, name="Associations data")

In [14]:
# Add the TSV file to the dataset. The printed dataset further down shows it
# under `hasPart` as LazyAction(operation=Store.upload, args=[<path>]).
associations.add_files("../../data/associations.tsv")

In [15]:
# associations.add_derivation(persons)

In [16]:
# Reuse the same `jane` resource as the contributor of this dataset as well.
associations.add_contribution(jane)

In [17]:
# forge.register(associations)

In [18]:
# Print the dataset; the indented text below is its printed representation.
print(associations)

{
    type: Dataset
    contribution:
    {
        type: Contribution
        agent:
        {
            id:
            {
                type: Person
                name: Jane Doe
            }
            type: Agent
        }
    }
    hasPart: LazyAction(operation=Store.upload, args=['../../data/associations.tsv'])
    name: Associations data
}


In [19]:
# associations.download("files", "./downloaded/")

In [20]:
# ! ls ./downloaded

In [21]:
# ! rm -R ./downloaded

## Creation with resources

In [22]:
# Attach the associations TSV; the result is used below as the `distribution`
# of the `jane` resource.
# NOTE(review): presumably a lazy upload like the LazyAction printed for
# add_files above — confirm against forge.attach.
distribution_1 = forge.attach("../../data/associations.tsv")

In [23]:
# Attach the persons CSV; the result is used below as the `distribution`
# of the `john` resource.
distribution_2 = forge.attach("../../data/persons.csv")

In [24]:
# Rebinds `jane`: the earlier resource had no distribution, this new Person
# resource carries distribution_1 (the attached associations.tsv).
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

In [25]:
# Second Person resource, carrying distribution_2 (the attached persons.csv).
john = Resource(type="Person", name="John Smith", distribution=distribution_2)

In [26]:
# Rebinds `persons`: from here on it is a list of two Resources, no longer the
# Dataset created in the "Creation with files" section.
persons = [jane, john]

In [27]:
# forge.register(persons)

In [28]:
# New Dataset for this section; the cells that would add the resources as
# parts and register it are commented out below.
dataset = Dataset(forge, name="Interesting people")

In [29]:
# dataset.add_parts(persons)

In [30]:
# print(dataset)

In [31]:
# forge.register(dataset)

In [32]:
# dataset.download("parts", "./downloaded/")

In [33]:
# ! ls ./downloaded

## Creation from a dataframe

See the notebook `DataFrame IO.ipynb` for details on converting a pandas DataFrame into `Resource` instances.

### basics

In [34]:
# Load the persons CSV into a DataFrame; its content is displayed by the next cell.
dataframe = pd.read_csv("../../data/persons.csv")

In [35]:
# Bare expression so the notebook displays the frame: two rows with the
# columns type, name and distribution.
dataframe

Unnamed: 0,type,name,distribution
0,Person,Marie Curie,../../data/scientists-database/marie_curie.txt
1,Person,Albert Einstein,../../data/scientists-database/albert_einstein...


In [36]:
# Convert the DataFrame into resources. Rebinds `persons` once more (it was the
# [jane, john] list); the register output below reports a count of 2.
persons = forge.from_dataframe(dataframe)

In [37]:
# Register the converted resources; the output below reports count 2 and
# succeeded True for the `_register_one` action.
forge.register(persons)

<count> 2
<action> _register_one
<succeeded> True


In [38]:
# Fresh Dataset for the DataFrame example; rebinds `dataset` from the
# "Creation with resources" section.
dataset = Dataset(forge, name="Interesting people")

In [39]:
# Add the registered resources as parts; the printed dataset below lists them
# under `hasPart`, each with an id ending in `_version=1`.
dataset.add_parts(persons)

In [40]:
# Print the dataset; the indented text below is its printed representation.
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: edbc5599-cd7c-4825-b97e-97da45bc0b8c_version=1
            type: Person
            distribution: ../../data/scientists-database/marie_curie.txt
            name: Marie Curie
        }
        {
            id: 6571c60b-e0a2-4d80-846e-0f116a44921a_version=1
            type: Person
            distribution: ../../data/scientists-database/albert_einstein.txt
            name: Albert Einstein
        }
    ]
    name: Interesting people
}


In [41]:
# Register the dataset itself; the output below reports succeeded True.
forge.register(dataset)

<action> _register_one
<succeeded> True


### advanced

In [42]:
# Rebinds `dataframe` to the associations file, which is tab-separated (sep="\t").
dataframe = pd.read_csv("../../data/associations.tsv", sep="\t")

In [43]:
# Bare expression so the notebook displays the frame. Note the "(missing)"
# placeholder in the id column and the "__"-separated column names
# (e.g. agent__gender__id); both are referenced by the from_dataframe call below.
dataframe

Unnamed: 0,id,name,type,agent__type,agent__name,agent__gender__id,agent__gender__type,agent__gender__label,distribution
0,https://kg.example.ch/associations/123,Curie Association,Association,Person,Marie Curie,http://purl.obolibrary.org/obo/PATO_0000383,LabeledOntologyEntity,female,../../data/scientists-database/marie_curie.txt
1,(missing),Einstein Association,Association,Person,Albert Einstein,http://purl.obolibrary.org/obo/PATO_0000384,LabeledOntologyEntity,male,../../data/scientists-database/albert_einstein...


In [44]:
# Replace every path in the "distribution" column with forge.attach(path).
# The bound method is handed to map() directly; wrapping it in a lambda that
# only forwards its argument added nothing.
# NOTE(review): the column is overwritten in place, so re-running this cell
# alone would feed the already-attached values back into forge.attach —
# re-run the read_csv cell above first.
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)

In [45]:
# Convert the DataFrame into resources; rebinds `associations` (previously the
# Dataset from the "Creation with files" section).
# `na` is given the "(missing)" placeholder string seen in the id column, and
# `nesting` the "__" separator seen in column names such as agent__gender__id.
# NOTE(review): presumably the placeholder marks absent values and the
# separator turns those columns into nested properties — confirm in
# `DataFrame IO.ipynb`.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

In [46]:
# forge.register(associations)

In [47]:
# dataset = Dataset(forge, name="Interesting associations")

In [48]:
# dataset.add_parts(associations)

In [49]:
# print(dataset)

In [50]:
# forge.register(dataset)