{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# If installed from pip, import lostruct as ls will work\n", "import lostruct.lostruct as ls\n", "\n", "# PCoA from skbio.stats is the best implementation of R's MDS algorithm\n", "from skbio.stats.ordination import pcoa\n", "\n", "# Much of the output from CyVCF2 and lostruct are numpy arrays\n", "import numpy as np\n", "\n", "import pandas as pd\n", "import plotly.express as px\n", "from sklearn.manifold import MDS\n", "import umap\n", "import hdbscan\n", "import plotly.io as pio\n", "pio.renderers.default = \"notebook_connected\"" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['HM017-I', 'HM018', 'HM022-I', 'HM029', 'HM030']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Two VCF utility functions are proivded. get_samples() and get_landmarks()\n", "\n", "# This will be the same order of the resulting data\n", "samples = ls.get_samples(\"test_data/chr1-filtered.vcf.gz\")\n", "samples[0:5]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['chl_Mt', 'chr1', 'chr2', 'chr3', 'chr4']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Utility function: Get list of landmarks (chromosome, scaffolds, contigs, etc..)\n", "landmarks = ls.get_landmarks(\"test_data/chr1-filtered.vcf.gz\")\n", "landmarks[0:5]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function get_samples in module lostruct.lostruct:\n", "\n", "get_samples(vcf_file)\n", " Get the samples from a VCF/BCF file. This is the order the data will remain in as well.\n", "\n" ] } ], "source": [ "# Docstrings are also provided\n", "help(ls.get_samples)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/josephguhlin/anaconda3/envs/lostruct/lib/python3.8/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning:\n", "\n", "Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", "\n" ] } ], "source": [ "# Parse VCF to get windows and positions of each SNP within each window\n", "windows, positions = ls.parse_vcf(\"test_data/chr1-filtered.vcf.gz\", \"chr1\", 95, ls.Window.SNP)\n", "# ls.Window.SNP specifies window sizes are by SNP count. ls.Window.BP specifies windows are in base pair lengths.\n", "\n", "# *** ls.Window.BP is not yet implemented, however. ***\n", "# Please see: https://github.com/jguhlin/lostruct-py/issues/8\n", "\n", "# Accumulate output of eigen_windows\n", "result = list()\n", "for x in windows:\n", " result.append(ls.eigen_windows(x, 10, 1))\n", "\n", "# Convert to numpy array\n", "result = np.vstack(result)\n", "\n", "# Get PCA distances comparison matrix\n", "pc_dists = ls.get_pc_dists(result)\n", "# An additional mode, fastmath, is available. Trading some accuracy for a slight speed boost (~8%)\n", "pc_dists = ls.get_pc_dists(result, fastmath=True)\n", "\n", "# Get PCoA value of pc_dists matrix (this is equivalent to R's MDS)\n", "# PLEASE NOTE: See section below: Working with Large Datasets\n", "# For recommended ways to run pcoa\n", "mds = pcoa(pc_dists)\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "