{ "cells": [ { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [], "source": [ "# If installed from pip, import lostruct as ls will work\n", "import lostruct.lostruct as ls\n", "\n", "# PCoA from skbio.stats is the best implementation of R's MDS algorithm\n", "from skbio.stats.ordination import pcoa\n", "\n", "# Much of the output from CyVCF2 and lostruct are numpy arrays\n", "import numpy as np\n", "\n", "import pandas as pd\n", "import plotly.express as px\n", "from sklearn.manifold import MDS\n", "import umap\n", "import hdbscan\n", "import plotly.io as pio\n", "pio.renderers.default = \"notebook_connected\"" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['HM017-I', 'HM018', 'HM022-I', 'HM029', 'HM030']" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Two VCF utility functions are proivded. get_samples() and get_landmarks()\n", "\n", "# This will be the same order of the resulting data\n", "samples = ls.get_samples(\"test_data/chr1-filtered.vcf.gz\")\n", "samples[0:5]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['chl_Mt', 'chr1', 'chr2', 'chr3', 'chr4']" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Utility function: Get list of landmarks (chromosome, scaffolds, contigs, etc..)\n", "landmarks = ls.get_landmarks(\"test_data/chr1-filtered.vcf.gz\")\n", "landmarks[0:5]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Help on function get_samples in module lostruct.lostruct:\n", "\n", "get_samples(vcf_file)\n", " Get the samples from a VCF/BCF file. This is the order the data will remain in as well.\n", "\n" ] } ], "source": [ "# Docstrings are also provided\n", "help(ls.get_samples)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "scrolled": false }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/josephguhlin/anaconda3/envs/lostruct/lib/python3.8/site-packages/numpy/core/_asarray.py:136: VisibleDeprecationWarning:\n", "\n", "Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray\n", "\n" ] } ], "source": [ "# Parse VCF to get windows and positions of each SNP within each window\n", "windows, positions = ls.parse_vcf(\"test_data/chr1-filtered.vcf.gz\", \"chr1\", 95, ls.Window.SNP)\n", "# ls.Window.SNP specifies window sizes are by SNP count. ls.Window.BP specifies windows are in base pair lengths.\n", "\n", "# *** ls.Window.BP is not yet implemented, however. ***\n", "# Please see: https://github.com/jguhlin/lostruct-py/issues/8\n", "\n", "# Accumulate output of eigen_windows\n", "result = list()\n", "for x in windows:\n", " result.append(ls.eigen_windows(x, 10, 1))\n", "\n", "# Convert to numpy array\n", "result = np.vstack(result)\n", "\n", "# Get PCA distances comparison matrix\n", "pc_dists = ls.get_pc_dists(result)\n", "# An additional mode, fastmath, is available. 
Trading some accuracy for a slight speed boost (~8%)\n", "pc_dists = ls.get_pc_dists(result, fastmath=True)\n", "\n", "# Get PCoA value of pc_dists matrix (this is equivalent to R's MDS)\n", "# PLEASE NOTE: See section below: Working with Large Datasets\n", "# For recommended ways to run pcoa\n", "mds = pcoa(pc_dists)\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
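{ "cell_type": "markdown", "metadata": {}, "source": [ "Quick sanity check (an added sketch, not part of the original walkthrough): `result` stacks one row per window, and `get_pc_dists` returns a symmetric window-by-window distance matrix, which is what `pcoa` was run on above." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Added sketch: one row per window, and a square window-by-window distance matrix\n", "print(len(windows), result.shape, pc_dists.shape)\n", "assert pc_dists.shape == (len(windows), len(windows))" ] },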
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.scatter(y=mds.samples[\"PC1\"], title=\"MDS Coordinate 1 (y-axis) compared to Window (x-axis)\")" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.scatter(y=mds.samples[\"PC2\"], title=\"MDS Coordinate 2 (y-axis) compared to Window (x-axis)\")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.scatter(x=mds.samples[\"PC1\"], y=mds.samples[\"PC2\"], title=\"MDS Coordinate 1 (x-axis) and MDS Coordinate 2 (y-axis)\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Performing Analaysis Genome-Wide" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "landmarks = ls.get_landmarks(\"test_data/complete_file.vcf.gz\")\n", "\n", "results = list()\n", "snp_positions = list()\n", "\n", "for landmark in landmarks:\n", " windows, positions = ls.parse_vcf(\"test_data/complete_file.vcf.gz\", landmark, 95)\n", " for i, window in enumerate(windows):\n", " results.append(ls.eigen_windows(window, 10, 1))\n", " snp_positions.append(positions[i])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "While the above will not work due to a missing file, it is the appropriate way to get the results for each window for all landmarks (chromosomes, scaffolds, contigs, etc...). Here, we keep track of snp_positions as well, and len(snp_positions) == len(results) so they can be further investigated.\n", "\n", "The code will then remain the same:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Convert to numpy array\n", "results = np.vstack(results)\n", "\n", "# Get PCA distances comparison matrix\n", "pc_dists = ls.get_pc_dists(results)\n", "\n", "# Get PCoA value of pc_dists matrix (this is equivalent to R's MDS)\n", "mds = pcoa(pc_dists)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Comparison to R Version" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.9971509982243155" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "mds_coords = pd.read_csv(\"lostruct-results/mds_coords.csv\")\n", "np.corrcoef(mds.samples['PC1'], mds_coords['MDS1'].to_numpy())[0][1]\n", "# R-value is:" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.scatter(x=mds.samples[\"PC1\"], y=mds_coords['MDS1'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Working with Large Datasets" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "-0.9972686515756828" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# PCOA for reduced memory consumption and faster clustering\n", "mds = pcoa(pc_dists, method=\"fsvd\", inplace=True, number_of_dimensions=10)\n", "np.corrcoef(mds.samples[\"PC1\"], mds_coords['MDS1'].to_numpy())[0][1]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.scatter(y=[mds.samples[\"PC1\"], mds_coords['MDS1']], title=\"\")" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "px.scatter(x=mds.samples[\"PC1\"], y=mds_coords['MDS1'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Some looks at other methods of clustering / comparing" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "embedding = MDS(n_components=10, dissimilarity=\"precomputed\", n_jobs=-1, n_init=32)\n", "mds = embedding.fit_transform(pc_dists)\n", "px.scatter(y=[mds[:,0], mds_coords['MDS1']], title=\"Blue is using Python MDS, Red is PCoA method\")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Calculating PHATE...\n", " Running PHATE on precomputed distance matrix with 124 observations.\n", " Calculating graph and diffusion operator...\n", " Calculating affinities...\n", " Calculated affinities in 0.03 seconds.\n", " Calculated graph and diffusion operator in 0.03 seconds.\n", " Calculating optimal t...\n", " Automatically selected t = 12\n", " Calculated optimal t in 0.08 seconds.\n", " Calculating diffusion potential...\n", " Calculated diffusion potential in 0.15 seconds.\n", " Calculating metric MDS...\n", " Calculated metric MDS in 3.48 seconds.\n", "Calculated PHATE in 3.77 seconds.\n" ] } ], "source": [ "import phate\n", "phater = phate.PHATE(n_components=10, knn_dist='precomputed', mds_solver='smacof', mds='metric')\n", "comparison_phate = phater.fit_transform(pc_dists)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "mds = pcoa(pc_dists)\n", "px.scatter(y=[mds_coords['MDS1'], mds.samples[\"PC1\"], comparison_phate[:,0]], title=\"Green is PHATE\")\n", "# https://github.com/KrishnaswamyLab/PHATE\n", "# Moon, van Dijk, Wang, Gigante et al. Visualizing Transitions and Structure for Biological Data Exploration. 2019. Nature Biotechnology." ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "reducer = umap.UMAP()\n", "embedding = reducer.fit_transform(pc_dists)\n", "px.scatter(x=embedding[:, 0], y=embedding[:, 1])\n", "# UMAP: https://umap-learn.readthedocs.io/en/latest/index.html\n", "# McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "hdbscan_labels = hdbscan.HDBSCAN().fit_predict(embedding)\n", "px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=hdbscan_labels)\n", "# hdbscan: https://hdbscan.readthedocs.io/en/latest/index.html" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "reducer = umap.UMAP(n_components=3)\n", "embedding = reducer.fit_transform(pc_dists)\n", "hdbscan_labels = hdbscan.HDBSCAN().fit_predict(embedding)\n", "fig = px.scatter_3d(x=embedding[:, 0], y=embedding[:, 1], z=embedding[:, 2], color=hdbscan_labels, width=800, height=600)\n", "fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# The following is for future features, please disregard\n", "\n", "### Code used to generate the original random weights\n", "**If weights are urgently needed, please add a comment to the existing github issue**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Code originally used to generate and save random weights. Saved and static now to feed into R and compare with lostruct R\n", "weights = np.random.random_sample(len(samples))\n", "weights = weights*10\n", "\n", "outfh = open(\"test_data/random_weights.txt\", \"w\")\n", "\n", "outfh.write(\"{}\\t{}\\n\".format(\"ID\", \"weight\"))\n", "for id,w in zip(samples, weights):\n", " outfh.write(\"{}\\t{:.06f}\\n\".format(id, w))\n", "\n", "del(outfh)\n", "np.save(\"test_data/random_weights.npy\", weights)\n", "weights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "weights = np.load(\"test_data/random_weights.npy\")\n", "weights" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_pc_dists(windows, fastmath=False, w=1):\n", " \"\"\"\n", " Calculate distances between window matrices.\n", "\n", " Works on only the upper triangle of the matrix, but clones the data into the lower half as well.\n", " \"\"\"\n", " n = len(windows)\n", "\n", " vals = ls.l1_norm(np.asarray([x[2] for x in windows]))\n", " vals = vals.real.astype(np.float64)\n", " \n", " vecs = np.asarray([x[3] for x in windows])\n", " weights = w[:, np.newaxis]\n", " #sqrt_w = np.squeeze(np.repeat(np.sqrt(weights), result[0][3].shape[0], axis=1))\n", " sqrt_w = np.squeeze(np.sqrt(weights))\n", " print(sqrt_w.shape)\n", " vecs = np.multiply(vecs, sqrt_w)\n", " #vecs = np.multiply(vecs, sqrt_w.T)\n", " #print(vecs)\n", "\n", " if fastmath:\n", " comparison = ls.calc_dists_fastmath(n, vals, vecs)\n", " else:\n", " comparison = ls.calc_dists(n, vals, vecs)\n", "\n", " # Remove negatives... 
Can't be placed within Numba code\n", " comparison[comparison < 0] = 0\n", "\n", " # Get square root\n", " return np.sqrt(comparison)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vecs.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#result = list()\n", "#for x in windows:\n", "#    result.append(ls.eigen_windows(x, 10, weights))\n", "#result = np.vstack(result)\n", "\n", "pc_dists = get_pc_dists(result, fastmath=True, w=weights)\n", "mds = pcoa(pc_dists)\n", "mds_coords = pd.read_csv(\"lostruct-results/weights_mds_coords.csv\")\n", "print(\"Weights compared to Lostruct R:\")\n", "print(np.corrcoef(mds.samples['PC1'], mds_coords['MDS1'].to_numpy()))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pc_dists" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Experimental alternative draft of get_pc_dists that applies the sample weights to the eigenvalues (x[2]) rather than the eigenvectors (compare with the version above)\n", "def get_pc_dists(windows, fastmath=False, w=1):\n", "    \"\"\"\n", "    Calculate distances between window matrices.\n", "\n", "    Works on only the upper triangle of the matrix, but clones the data into the lower half as well.\n", "    \"\"\"\n", "    n = len(windows)\n", "\n", "    sqrt_w = np.sqrt(w)\n", "    vals = ls.l1_norm(np.asarray([np.multiply(x[2], sqrt_w.T) for x in windows]))\n", "    vals = vals.real.astype(np.float64)\n", "\n", "    if fastmath:\n", "        comparison = ls.calc_dists_fastmath(n, vals, np.asarray([x[3] for x in windows]))\n", "    else:\n", "        comparison = ls.calc_dists(n, vals, np.asarray([x[3] for x in windows]))\n", "\n", "    # Remove negatives... Can't be placed within Numba code\n", "    comparison[comparison < 0] = 0\n", "\n", "    # Get square root\n", "    return np.sqrt(comparison)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 4 }