# 4-CreateDatasets
This tutorial shows a basic template to create a dataset computationally.

In [12]:
from pyspark.sql import Row, SparkSession
from mmtfPyspark.ml import pythonRDDToDataset
from mmtfPyspark.io import mmtfReader

#### Configure Spark Session and Spark Context

In [13]:
# Obtain the SparkSession for this notebook (getOrCreate reuses an active one).
spark = (
    SparkSession.builder
    .appName("4-CreateDatasets")
    .getOrCreate()
)

## Read a 10% fraction of the sample file
Reading a random fraction of the input file is a good way to test new functionality quickly.

In [14]:
# Relative path to the MMTF sample data.
path = "../resources/mmtf_full_sample"
# fraction=0.1 requests a random ~10% subset of the entries instead of the full file.
pdb = mmtfReader.read_sequence_file(path, fraction=0.1)

# Creating a dataset in 3 simple steps

## Step 1: calculate properties for a structure and add them to a Row object

In [15]:
def calcProperties(s):
    """Summarize one structure as a Row of its ID and size counts.

    Parameters
    ----------
    s : indexable pair
        s[0] is the PDB ID and s[1] is the MMTF structure record.

    Returns
    -------
    Row
        Positional (unnamed) Row holding, in order: PDB ID, number of
        models, chains, groups, atoms and bonds.
    """
    pdb_id = s[0]
    structure = s[1]
    counts = (
        structure.num_models,
        structure.num_chains,
        structure.num_groups,
        structure.num_atoms,
        structure.num_bonds,
    )
    return Row(pdb_id, *counts)

## Step 2: map structures to rows
Here we use a lambda expression to calculate properties.

In [16]:
# Turn every (pdbId, structure) record into a Row of calculated properties.
rows = pdb.map(lambda record: calcProperties(record))

## Step 3: convert RDD of Rows to a dataset

In [17]:
# Column labels, listed in the same order as the values calcProperties puts in each Row.
col_names = [
    "pdbId",
    "models",
    "chains",
    "groups",
    "atoms",
    "bonds",
]
summary = pythonRDDToDataset.get_dataset(rows, col_names)
# Alternative method: spark.createDataFrame(rows, col_names) also builds the
# dataset, but it converts int to long.

## Done: Show some details about this dataset

In [18]:
# List the column names of the new dataset.
summary.columns

['pdbId', 'models', 'chains', 'groups', 'atoms', 'bonds']

In [19]:
# Print each column's data type and nullability.
summary.printSchema()

root
 |-- pdbId: string (nullable = false)
 |-- models: integer (nullable = false)
 |-- chains: integer (nullable = false)
 |-- groups: integer (nullable = false)
 |-- atoms: integer (nullable = false)
 |-- bonds: integer (nullable = false)



In [20]:
# Display the first rows of the dataset as a text table.
summary.show()

+-----+------+------+------+-----+-----+
|pdbId|models|chains|groups|atoms|bonds|
+-----+------+------+------+-----+-----+
| 1LBU|     1|     3|   443| 1793| 1602|
| 1LC0|     1|     5|   700| 2731| 2358|
| 1LC5|     1|     4|   628| 3056| 2848|
| 1LFP|     1|     2|   593| 2275| 1958|
| 1LFW|     1|     5|  1041| 4238| 3750|
| 1LGH|     1|    68|   512| 5436| 5526|
| 1LH0|     1|     8|   701| 3596| 3375|
| 1LJ8|     1|     3|   930| 4310| 3965|
| 1LKI|     1|     2|   222| 1386| 1364|
| 1LMI|     1|     2|   303| 1139|  989|
| 1LML|     1|     3|   678| 3738| 3616|
| 1LO7|     1|     5|   316| 1375| 1229|
| 1LQ9|     1|     5|   483| 2006| 1794|
| 1LQV|     1|    30|   862| 4048| 3695|
| 1LR0|     1|     5|   251| 1100|  992|
| 1LR5|     1|    16|  1379| 6071| 5531|
| 1LRI|     1|     4|   199|  861|  777|
| 1LRZ|     1|     2|   718| 3631| 3399|
| 1LS1|     1|     6|   577| 5396| 5073|
| 1LTS|     1|    14|  1034| 6271| 6091|
+-----+------+------+------+-----+-----+
only showing top 20 rows

#### Print statistics for the numerical columns

In [21]:
# Summary statistics for every column except the leading pdbId column.
summary.describe(*col_names[1:]).toPandas()

Unnamed: 0,summary,models,chains,groups,atoms,bonds
0,count,9756.0,9756.0,9756.0,9756.0,9756.0
1,mean,1.0003075030750308,8.567343173431734,699.720377203772,3510.4634071340715,3252.99651496515
2,stddev,0.017533967885444,7.177280313219018,437.7390040805013,2140.150369170067,2015.1217534374905
3,min,1.0,1.0,21.0,154.0,144.0
4,max,2.0,91.0,3026.0,9995.0,10077.0


In [22]:
# Stop the Spark session now that the tutorial is finished.
spark.stop()