{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example code demonstrating use of gpseer API"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook implements model training and cross-validation as run by the gpseer command line. It will generate all of the same plots and CSV output. The best way to understand what is going on in this notebook is to follow the command-line [tutorial](https://gpseer.readthedocs.io/en/latest/tutorial.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up the environment\n",
    "%matplotlib inline\n",
    "from gpseer import utils, maximum_likelihood, cross_validate, plot"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model parameters (mirroring those seen in the command line)\n",
    "threshold = None          # best model, for pfcrt-data, set to 5\n",
    "spline_order = None       # best model, for pfcrt-data, set to 2\n",
    "spline_smoothness = None  # best model, for pfcrt-data, set to 100000\n",
    "epistasis_order = 1       # usually don't change\n",
    "alpha = 1                 # usually don't change\n",
    "\n",
    "# Cross-validation settings, passed to cross_validate_to_dataframe below\n",
    "n_samples = 1000\n",
    "train_fraction = 0.8\n",
    "\n",
    "# Prefix for every CSV and PDF file written by this notebook\n",
    "output_root = \"linear\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Load data into a genotype-phenotype map. To obtain a local copy of\n",
    "# pfcrt-raw-data.csv, run gpseer fetch-example on the command line.\n",
    "gpm = utils.read_file_to_gpmap(\"https://github.com/harmslab/gpseer/raw/master/examples/pfcrt-raw-data.csv\")\n",
    "gpm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Fit the model and predict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Construct fitting model.\n",
    "ml_model = utils.construct_model(\n",
    "    threshold=threshold,\n",
    "    spline_order=spline_order,\n",
    "    spline_smoothness=spline_smoothness,\n",
    "    epistasis_order=epistasis_order,\n",
    "    alpha=alpha,\n",
    ")\n",
    "\n",
    "# Add genotype-phenotype map to the model, then fit\n",
    "ml_model.add_gpm(gpm)\n",
    "ml_model.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Make predictions, save them to CSV, and show them\n",
    "prediction_df = maximum_likelihood.predict_to_dataframe(ml_model)\n",
    "prediction_df.to_csv(f\"{output_root}_predictions.csv\")\n",
    "prediction_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create output summarizing various fit statistics\n",
    "stats_df, convergence_df = maximum_likelihood.create_stats_output(ml_model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save and show the fit information spreadsheet\n",
    "stats_df.to_csv(f\"{output_root}_fit-information.csv\")\n",
    "stats_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save and show the convergence spreadsheet\n",
    "convergence_df.to_csv(f\"{output_root}_convergence.csv\")\n",
    "convergence_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Plot the fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the spline. Only save the figure when one was returned.\n",
    "fig, ax = plot.plot_spline(ml_model, prediction_df)\n",
    "if fig is not None:\n",
    "    fig.savefig(f\"{output_root}_spline-fit.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot correlation between measured and predicted values\n",
    "fig, ax = plot.plot_correlation(ml_model, prediction_df)\n",
    "fig.savefig(f\"{output_root}_correlation-plot.pdf\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot phenotype histograms\n",
    "fig, ax = plot.plot_histograms(ml_model, prediction_df)\n",
    "fig.savefig(f\"{output_root}_phenotype-histograms.pdf\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Construct a separate, unfitted model for cross validation\n",
    "cv_model = utils.construct_model(\n",
    "    threshold=threshold,\n",
    "    spline_order=spline_order,\n",
    "    spline_smoothness=spline_smoothness,\n",
    "    epistasis_order=epistasis_order,\n",
    "    alpha=alpha,\n",
    ")\n",
    "\n",
    "# Do the cross-validation run\n",
    "# NOTE(review): the train/test splits are presumably drawn at random, so the\n",
    "# scores may differ between runs -- confirm which RNG gpseer uses and seed it\n",
    "# if exact reproducibility matters.\n",
    "cv_df = cross_validate.cross_validate_to_dataframe(\n",
    "    cv_model,\n",
    "    gpm,\n",
    "    n_samples=n_samples,\n",
    "    train_fraction=train_fraction,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Save and show the cross-validation spreadsheet\n",
    "cv_df.to_csv(f\"{output_root}_cross-validation-scores.csv\")\n",
    "cv_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot cross-validation results\n",
    "fig, ax = plot.plot_test_train(cv_df)\n",
    "fig.savefig(f\"{output_root}_cross-validation-plot.pdf\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}