## Example code demonstrating use of gpseer API

This notebook implements model training and cross validation as run by the `gpseer` command-line tool, and generates the same plots and CSV output. The best way to understand what is going on in this notebook is to follow the command-line [tutorial](https://gpseer.readthedocs.io/en/latest/tutorial.html).

In [None]:
# Set up the environment
%matplotlib inline
from gpseer import utils, maximum_likelihood, cross_validate, plot

In [None]:
# Settings used to build the models in this notebook. They mirror the
# options seen on the gpseer command line.

# best model for pfcrt-data: set to 5
threshold = None

# best model for pfcrt-data: set to 2
spline_order = None

# best model for pfcrt-data: set to 100000
spline_smoothness = None

# usually left unchanged
epistasis_order = 1

# usually left unchanged
alpha = 1

# prefix of every output file written by this notebook
output_root = "linear"

In [None]:
# Read the example data set into a genotype-phenotype map. The file is read
# from the URL below; to obtain a local copy of pfcrt-raw-data.csv instead,
# run `gpseer fetch-example` on the command line.
gpm = utils.read_file_to_gpmap(
    "https://github.com/harmslab/gpseer/raw/master/examples/pfcrt-raw-data.csv"
)

# Leave the map as the last expression so the notebook displays it.
gpm

In [None]:
# Build the fitting model from the parameters defined earlier in the notebook.
ml_model = utils.construct_model(
    threshold=threshold,
    spline_order=spline_order,
    spline_smoothness=spline_smoothness,
    epistasis_order=epistasis_order,
    alpha=alpha,
)

# Attach the genotype-phenotype map to the model, then run the fit.
ml_model.add_gpm(gpm)
ml_model.fit()

In [None]:
# Generate predictions from the fitted model as a dataframe, write them to
# "<output_root>_predictions.csv", and leave the dataframe as the last
# expression so the notebook displays it.
prediction_df = maximum_likelihood.predict_to_dataframe(ml_model)
prediction_df.to_csv(f"{output_root}_predictions.csv")
prediction_df

In [None]:
# Summarize the fit. The call returns two objects: one holding fit statistics
# (stats_df) and one holding convergence information (convergence_df). Both
# are written to CSV and displayed in the cells that follow.
stats_df, convergence_df = maximum_likelihood.create_stats_output(ml_model)

In [None]:
# Write the fit statistics to "<output_root>_fit-information.csv", then
# display them as the cell output.
stats_df.to_csv(f"{output_root}_fit-information.csv")
stats_df

In [None]:
# Write the convergence information to "<output_root>_convergence.csv", then
# display it as the cell output.
convergence_df.to_csv(f"{output_root}_convergence.csv")
convergence_df

In [None]:
# Draw the spline fit for the model and its predictions.
fig, ax = plot.plot_spline(ml_model, prediction_df)

# Only save when a figure was actually returned; fig may be None
# (presumably when the model has no spline -- confirm against gpseer.plot).
if fig is not None:
    fig.savefig(f"{output_root}_spline-fit.pdf")

# End on None so the cell has no echoed return value.
None

In [None]:
# Compare measured and predicted values in a correlation plot and save it
# as "<output_root>_correlation-plot.pdf".
fig, ax = plot.plot_correlation(ml_model, prediction_df)
fig.savefig(f"{output_root}_correlation-plot.pdf")

# End on None so the cell has no echoed return value.
None

In [None]:
# Draw the phenotype histograms and save them as
# "<output_root>_phenotype-histograms.pdf".
fig, ax = plot.plot_histograms(ml_model, prediction_df)
fig.savefig(f"{output_root}_phenotype-histograms.pdf")

# End on None so the cell has no echoed return value.
None

In [None]:
# Build a separate model for cross validation, using the same parameters as
# the model fitted earlier in the notebook. No fit is run on it here; it is
# handed straight to the cross-validation routine.
cv_model = utils.construct_model(
    threshold=threshold,
    spline_order=spline_order,
    spline_smoothness=spline_smoothness,
    epistasis_order=epistasis_order,
    alpha=alpha,
)

# Run cross validation on the genotype-phenotype map and collect the results.
# NOTE(review): presumably 1000 resampled splits with 80% of the data used for
# training in each -- confirm against gpseer.cross_validate.
cv_df = cross_validate.cross_validate_to_dataframe(
    cv_model,
    gpm,
    n_samples=1000,
    train_fraction=0.8,
)

In [None]:
# Write the cross-validation results to
# "<output_root>_cross-validation-scores.csv", then display them as the
# cell output.
cv_df.to_csv(f"{output_root}_cross-validation-scores.csv")
cv_df

In [None]:
# Plot the cross-validation results and save the figure as
# "<output_root>_cross-validation-plot.pdf". The trailing None leaves the
# cell without an echoed return value.
fig, ax = plot.plot_test_train(cv_df)
fig.savefig(f"{output_root}_cross-validation-plot.pdf")
None