{ "cells": [ { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "# Data pre-processing\n", "## 1. Download dataset\n", "8,640 cells from the melanoma WM989 cell line were sequenced using Drop-seq,\n", "where 32,287 genes were detected ([MELANOMA](https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE99330&format=file&file=GSE99330%5FdropseqHumanDge%2Etxt%2Egz)).\n", "In addition, RNA FISH experiment\n", "of across 7,000-88,000 cells from the same cell line was conducted and 26\n", "genes were detected ([MELANOMA_FISH](https://www.dropbox.com/s/ia9x0iom6dwueix/fishSubset.txt?dl=0))." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "import h5py" ] }, { "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "Download melanoma RNA-seq data for imputation." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "melanoma_rnaseq_path = \"E:/DISC/reproducibility/data/MELANOMA/original_data/GSE99330_dropseqHumanDge.txt.gz\"\n", "melanoma_rnaseq_pd = pd.read_csv(melanoma_rnaseq_path, sep=\" \", compression='gzip', index_col=0, skiprows=1)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "pycharm": { "is_executing": false }, "scrolled": true }, "outputs": [ { "data": { "text/plain": " CTCGCGAGTAGC CGGAGGCACTCG GCAAGTCGATAT \\\nA1BG 0 0 0 \nA1BG-AS1 0 0 0 \nA1CF 0 0 0 \nA2M 0 0 0 \nA2M-AS1 0 0 0 \nA2ML1 0 0 0 \nA2ML1-AS1 0 0 0 \nA2MP1 0 0 0 \nA3GALT2 0 0 0 \nA4GALT 0 0 0 \nAAAS 0 0 0 \nAACS 0 0 0 \nAADACL3 0 0 0 \nAADAT 0 0 0 \nAAED1 0 0 0 \nAAGAB 0 0 0 \nAAK1 0 0 0 \nAAMDC 0 0 0 \nAAMP 0 0 0 \nAANAT 0 0 0 \nAAR2 0 0 0 \nAARD 0 0 0 \nAARS 0 0 0 \nAARS2 0 0 0 \nAARSD1 0 0 0 \nAASDH 0 0 0 \nAASDHPPT 0 0 1 \nAASS 0 0 1 \nAATF 0 0 0 \nAATK 0 0 0 \n... ... ... 
... \nZSCAN5A 0 0 0 \nZSCAN9 0 0 0 \nZSWIM1 0 0 0 \nZSWIM3 0 0 0 \nZSWIM4 0 0 0 \nZSWIM5 0 0 0 \nZSWIM6 0 0 0 \nZSWIM7 0 0 0 \nZSWIM8 0 0 0 \nZUFSP 0 0 0 \nZW10 0 0 0 \nZWILCH 0 0 0 \nZWINT 0 0 0 \nZXDA 0 0 0 \nZXDB 0 0 0 \nZXDC 0 0 0 \nZYG11AP1 0 0 0 \nZYG11B 0 0 0 \nZYX 0 0 0 \nZZEF1 0 0 0 \nZZZ3 0 0 0 \nbP-21201H5.1 0 0 0 \nbP-21264C1.2 0 0 0 \nbP-2171C21.3 0 0 0 \nbP-2171C21.4 0 0 0 \nbP-2171C21.6 0 0 0 \nbP-2189O9.2 0 0 0 \nbP-2189O9.3 0 0 0 \nchr22-38_28785274-29006793.1 0 0 0 \nyR211F11.2 0 0 0 \n\n GGACAATTTGTA TGACAATTGACC TAAGACTTCCCT \\\nA1BG 0 0 0 \nA1BG-AS1 0 0 0 \nA1CF 0 0 0 \nA2M 0 0 0 \nA2M-AS1 0 0 0 \nA2ML1 0 0 0 \nA2ML1-AS1 0 0 0 \nA2MP1 0 0 0 \nA3GALT2 0 0 0 \nA4GALT 0 0 0 \nAAAS 0 0 0 \nAACS 0 0 0 \nAADACL3 0 0 0 \nAADAT 0 0 0 \nAAED1 0 0 0 \nAAGAB 1 0 0 \nAAK1 0 0 1 \nAAMDC 0 0 0 \nAAMP 0 0 1 \nAANAT 0 0 0 \nAAR2 0 0 0 \nAARD 0 0 0 \nAARS 0 0 0 \nAARS2 0 0 0 \nAARSD1 0 0 0 \nAASDH 0 0 0 \nAASDHPPT 0 0 3 \nAASS 0 0 1 \nAATF 0 0 0 \nAATK 0 0 0 \n... ... ... ... \nZSCAN5A 0 0 0 \nZSCAN9 0 0 0 \nZSWIM1 0 0 0 \nZSWIM3 0 0 0 \nZSWIM4 0 0 0 \nZSWIM5 0 0 0 \nZSWIM6 0 0 0 \nZSWIM7 0 0 1 \nZSWIM8 0 0 0 \nZUFSP 0 0 0 \nZW10 0 0 0 \nZWILCH 0 0 2 \nZWINT 0 0 1 \nZXDA 0 0 0 \nZXDB 0 0 0 \nZXDC 0 0 1 \nZYG11AP1 0 0 0 \nZYG11B 0 0 0 \nZYX 0 0 0 \nZZEF1 0 0 0 \nZZZ3 0 0 1 \nbP-21201H5.1 0 0 0 \nbP-21264C1.2 0 0 0 \nbP-2171C21.3 0 0 0 \nbP-2171C21.4 0 0 0 \nbP-2171C21.6 0 0 0 \nbP-2189O9.2 0 0 0 \nbP-2189O9.3 0 0 0 \nchr22-38_28785274-29006793.1 0 0 0 \nyR211F11.2 0 0 0 \n\n GAGGAAGGACTC GAAACGGACAGA TCGATTGGAGAA \\\nA1BG 0 0 0 \nA1BG-AS1 0 0 0 \nA1CF 0 0 0 \nA2M 0 0 0 \nA2M-AS1 0 0 0 \nA2ML1 0 0 0 \nA2ML1-AS1 0 0 0 \nA2MP1 0 0 0 \nA3GALT2 0 0 0 \nA4GALT 0 0 0 \nAAAS 0 0 0 \nAACS 0 0 0 \nAADACL3 0 0 0 \nAADAT 0 0 0 \nAAED1 0 0 0 \nAAGAB 0 0 0 \nAAK1 0 0 0 \nAAMDC 0 0 0 \nAAMP 1 1 0 \nAANAT 0 0 0 \nAAR2 0 0 0 \nAARD 0 0 0 \nAARS 0 0 0 \nAARS2 0 0 0 \nAARSD1 0 0 0 \nAASDH 0 0 0 \nAASDHPPT 0 0 0 \nAASS 0 0 0 \nAATF 0 0 0 \nAATK 0 0 0 \n... ... ... ... 
\nZSCAN5A 0 0 0 \nZSCAN9 0 0 0 \nZSWIM1 0 0 0 \nZSWIM3 0 0 0 \nZSWIM4 0 0 0 \nZSWIM5 0 0 0 \nZSWIM6 0 0 0 \nZSWIM7 0 0 2 \nZSWIM8 0 0 0 \nZUFSP 0 0 0 \nZW10 0 0 0 \nZWILCH 0 0 0 \nZWINT 0 0 0 \nZXDA 0 0 0 \nZXDB 0 0 0 \nZXDC 0 0 0 \nZYG11AP1 0 0 0 \nZYG11B 0 0 0 \nZYX 0 0 0 \nZZEF1 0 0 0 \nZZZ3 0 0 0 \nbP-21201H5.1 0 0 0 \nbP-21264C1.2 0 0 0 \nbP-2171C21.3 0 0 0 \nbP-2171C21.4 0 0 0 \nbP-2171C21.6 0 0 0 \nbP-2189O9.2 0 0 0 \nbP-2189O9.3 0 0 0 \nchr22-38_28785274-29006793.1 0 0 0 \nyR211F11.2 0 0 0 \n\n ATCTAGTCCCCA ... AGCCCTGACAAC \\\nA1BG 0 ... 0 \nA1BG-AS1 0 ... 0 \nA1CF 0 ... 0 \nA2M 0 ... 0 \nA2M-AS1 0 ... 0 \nA2ML1 0 ... 0 \nA2ML1-AS1 0 ... 0 \nA2MP1 0 ... 0 \nA3GALT2 0 ... 0 \nA4GALT 0 ... 0 \nAAAS 0 ... 0 \nAACS 0 ... 0 \nAADACL3 0 ... 0 \nAADAT 0 ... 0 \nAAED1 0 ... 0 \nAAGAB 0 ... 0 \nAAK1 0 ... 0 \nAAMDC 0 ... 0 \nAAMP 0 ... 0 \nAANAT 0 ... 0 \nAAR2 0 ... 0 \nAARD 0 ... 0 \nAARS 0 ... 0 \nAARS2 0 ... 0 \nAARSD1 0 ... 0 \nAASDH 0 ... 0 \nAASDHPPT 0 ... 2 \nAASS 0 ... 0 \nAATF 0 ... 0 \nAATK 0 ... 0 \n... ... ... ... \nZSCAN5A 0 ... 0 \nZSCAN9 0 ... 0 \nZSWIM1 0 ... 0 \nZSWIM3 0 ... 0 \nZSWIM4 0 ... 0 \nZSWIM5 0 ... 0 \nZSWIM6 0 ... 0 \nZSWIM7 0 ... 0 \nZSWIM8 0 ... 0 \nZUFSP 0 ... 0 \nZW10 0 ... 0 \nZWILCH 0 ... 0 \nZWINT 0 ... 0 \nZXDA 0 ... 0 \nZXDB 0 ... 0 \nZXDC 0 ... 0 \nZYG11AP1 0 ... 0 \nZYG11B 0 ... 0 \nZYX 0 ... 1 \nZZEF1 0 ... 0 \nZZZ3 0 ... 0 \nbP-21201H5.1 0 ... 0 \nbP-21264C1.2 0 ... 0 \nbP-2171C21.3 0 ... 0 \nbP-2171C21.4 0 ... 0 \nbP-2171C21.6 0 ... 0 \nbP-2189O9.2 0 ... 0 \nbP-2189O9.3 0 ... 0 \nchr22-38_28785274-29006793.1 0 ... 0 \nyR211F11.2 0 ... 
0 \n\n ACTCTCGATTCC GGTCAAATAAGA ACCTCCCCTATA \\\nA1BG 0 0 0 \nA1BG-AS1 0 0 0 \nA1CF 0 0 0 \nA2M 0 0 0 \nA2M-AS1 0 0 0 \nA2ML1 0 0 0 \nA2ML1-AS1 0 0 0 \nA2MP1 0 0 0 \nA3GALT2 0 0 0 \nA4GALT 0 0 0 \nAAAS 0 0 0 \nAACS 0 0 0 \nAADACL3 0 0 0 \nAADAT 0 0 0 \nAAED1 0 0 0 \nAAGAB 0 0 0 \nAAK1 0 0 0 \nAAMDC 0 0 0 \nAAMP 0 0 0 \nAANAT 0 0 0 \nAAR2 0 0 0 \nAARD 0 0 0 \nAARS 1 0 0 \nAARS2 0 0 0 \nAARSD1 0 0 0 \nAASDH 0 0 0 \nAASDHPPT 0 0 0 \nAASS 0 0 0 \nAATF 0 0 0 \nAATK 0 0 0 \n... ... ... ... \nZSCAN5A 0 0 0 \nZSCAN9 0 0 0 \nZSWIM1 0 0 0 \nZSWIM3 0 0 0 \nZSWIM4 0 0 0 \nZSWIM5 0 0 0 \nZSWIM6 0 0 0 \nZSWIM7 0 0 0 \nZSWIM8 0 0 0 \nZUFSP 0 0 0 \nZW10 0 0 0 \nZWILCH 0 0 0 \nZWINT 0 0 0 \nZXDA 0 0 0 \nZXDB 0 0 0 \nZXDC 0 0 0 \nZYG11AP1 0 0 0 \nZYG11B 0 0 0 \nZYX 0 0 0 \nZZEF1 0 0 0 \nZZZ3 0 0 0 \nbP-21201H5.1 0 0 0 \nbP-21264C1.2 0 0 0 \nbP-2171C21.3 0 0 0 \nbP-2171C21.4 0 0 0 \nbP-2171C21.6 0 0 0 \nbP-2189O9.2 0 0 0 \nbP-2189O9.3 0 0 0 \nchr22-38_28785274-29006793.1 0 0 0 \nyR211F11.2 0 0 0 \n\n ACCTCCCCTACC CCATTTTTTCCT TAAAGCGTGTAC \\\nA1BG 0 0 0 \nA1BG-AS1 0 0 0 \nA1CF 0 0 0 \nA2M 0 0 0 \nA2M-AS1 0 0 0 \nA2ML1 0 0 0 \nA2ML1-AS1 0 0 0 \nA2MP1 0 0 0 \nA3GALT2 0 0 0 \nA4GALT 0 0 0 \nAAAS 0 0 0 \nAACS 0 0 0 \nAADACL3 0 0 0 \nAADAT 0 0 0 \nAAED1 0 0 0 \nAAGAB 0 0 0 \nAAK1 0 1 0 \nAAMDC 0 0 0 \nAAMP 0 0 0 \nAANAT 0 0 0 \nAAR2 0 0 1 \nAARD 0 0 0 \nAARS 0 0 1 \nAARS2 0 0 0 \nAARSD1 0 0 0 \nAASDH 0 0 0 \nAASDHPPT 0 0 0 \nAASS 0 0 0 \nAATF 0 0 0 \nAATK 0 0 0 \n... ... ... ... 
\nZSCAN5A 0 0 0 \nZSCAN9 0 0 0 \nZSWIM1 0 0 0 \nZSWIM3 0 0 0 \nZSWIM4 0 0 0 \nZSWIM5 0 0 0 \nZSWIM6 0 0 0 \nZSWIM7 0 0 0 \nZSWIM8 0 0 0 \nZUFSP 0 0 0 \nZW10 0 0 0 \nZWILCH 0 0 0 \nZWINT 0 0 0 \nZXDA 0 0 0 \nZXDB 0 0 0 \nZXDC 0 0 0 \nZYG11AP1 0 0 0 \nZYG11B 0 0 0 \nZYX 0 0 0 \nZZEF1 0 0 1 \nZZZ3 0 0 3 \nbP-21201H5.1 0 0 0 \nbP-21264C1.2 0 0 0 \nbP-2171C21.3 0 0 0 \nbP-2171C21.4 0 0 0 \nbP-2171C21.6 0 0 0 \nbP-2189O9.2 0 0 0 \nbP-2189O9.3 0 0 0 \nchr22-38_28785274-29006793.1 0 0 0 \nyR211F11.2 0 0 0 \n\n GATCAGAAGGTA AGCGAGACGATG ATTCTTGTGTAC \nA1BG 0 0 0 \nA1BG-AS1 0 0 0 \nA1CF 0 0 0 \nA2M 0 0 0 \nA2M-AS1 0 0 0 \nA2ML1 0 0 0 \nA2ML1-AS1 0 0 0 \nA2MP1 0 0 0 \nA3GALT2 0 0 0 \nA4GALT 0 0 0 \nAAAS 0 0 0 \nAACS 0 0 0 \nAADACL3 0 0 0 \nAADAT 0 0 0 \nAAED1 0 0 0 \nAAGAB 0 0 0 \nAAK1 0 0 0 \nAAMDC 0 0 2 \nAAMP 0 0 0 \nAANAT 0 0 0 \nAAR2 0 0 0 \nAARD 0 0 0 \nAARS 0 0 0 \nAARS2 0 0 2 \nAARSD1 0 0 0 \nAASDH 0 0 0 \nAASDHPPT 0 0 0 \nAASS 0 0 0 \nAATF 0 0 0 \nAATK 0 0 0 \n... ... ... ... \nZSCAN5A 0 0 0 \nZSCAN9 0 0 0 \nZSWIM1 0 0 0 \nZSWIM3 0 0 0 \nZSWIM4 0 0 0 \nZSWIM5 0 0 0 \nZSWIM6 0 0 0 \nZSWIM7 0 0 0 \nZSWIM8 0 0 0 \nZUFSP 0 0 0 \nZW10 0 0 0 \nZWILCH 0 1 0 \nZWINT 0 0 0 \nZXDA 0 0 0 \nZXDB 0 0 0 \nZXDC 0 0 0 \nZYG11AP1 0 0 0 \nZYG11B 0 0 0 \nZYX 0 0 0 \nZZEF1 0 0 0 \nZZZ3 0 0 1 \nbP-21201H5.1 0 0 0 \nbP-21264C1.2 0 0 0 \nbP-2171C21.3 0 0 0 \nbP-2171C21.4 0 0 0 \nbP-2171C21.6 0 0 0 \nbP-2189O9.2 0 0 0 \nbP-2189O9.3 0 0 0 \nchr22-38_28785274-29006793.1 0 0 0 \nyR211F11.2 0 0 0 \n\n[32287 rows x 8640 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n 
\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
CTCGCGAGTAGCCGGAGGCACTCGGCAAGTCGATATGGACAATTTGTATGACAATTGACCTAAGACTTCCCTGAGGAAGGACTCGAAACGGACAGATCGATTGGAGAAATCTAGTCCCCA...AGCCCTGACAACACTCTCGATTCCGGTCAAATAAGAACCTCCCCTATAACCTCCCCTACCCCATTTTTTCCTTAAAGCGTGTACGATCAGAAGGTAAGCGAGACGATGATTCTTGTGTAC
A1BG0000000000...0000000000
A1BG-AS10000000000...0000000000
A1CF0000000000...0000000000
A2M0000000000...0000000000
A2M-AS10000000000...0000000000
A2ML10000000000...0000000000
A2ML1-AS10000000000...0000000000
A2MP10000000000...0000000000
A3GALT20000000000...0000000000
A4GALT0000000000...0000000000
AAAS0000000000...0000000000
AACS0000000000...0000000000
AADACL30000000000...0000000000
AADAT0000000000...0000000000
AAED10000000000...0000000000
AAGAB0001000000...0000000000
AAK10000010000...0000010000
AAMDC0000000000...0000000002
AAMP0000011100...0000000000
AANAT0000000000...0000000000
AAR20000000000...0000001000
AARD0000000000...0000000000
AARS0000000000...0100001000
AARS20000000000...0000000002
AARSD10000000000...0000000000
AASDH0000000000...0000000000
AASDHPPT0010030000...2000000000
AASS0010010000...0000000000
AATF0000000000...0000000000
AATK0000000000...0000000000
..................................................................
ZSCAN5A0000000000...0000000000
ZSCAN90000000000...0000000000
ZSWIM10000000000...0000000000
ZSWIM30000000000...0000000000
ZSWIM40000000000...0000000000
ZSWIM50000000000...0000000000
ZSWIM60000000000...0000000000
ZSWIM70000010020...0000000000
ZSWIM80000000000...0000000000
ZUFSP0000000000...0000000000
ZW100000000000...0000000000
ZWILCH0000020000...0000000010
ZWINT0000010000...0000000000
ZXDA0000000000...0000000000
ZXDB0000000000...0000000000
ZXDC0000010000...0000000000
ZYG11AP10000000000...0000000000
ZYG11B0000000000...0000000000
ZYX0000000000...1000000000
ZZEF10000000000...0000001000
ZZZ30000010000...0000003001
bP-21201H5.10000000000...0000000000
bP-21264C1.20000000000...0000000000
bP-2171C21.30000000000...0000000000
bP-2171C21.40000000000...0000000000
bP-2171C21.60000000000...0000000000
bP-2189O9.20000000000...0000000000
bP-2189O9.30000000000...0000000000
chr22-38_28785274-29006793.10000000000...0000000000
yR211F11.20000000000...0000000000
\n

32287 rows × 8640 columns

\n
" }, "metadata": {}, "output_type": "execute_result", "execution_count": 3 } ], "source": [ "melanoma_rnaseq_pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We download melanoma FISH data for validation." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [ "melanoma_fish_path = \"E:/DISC/reproducibility/data/MELANOMA/original_data/fishSubset.txt\"\n", "melanoma_fish_pd = pd.read_csv(melanoma_fish_path, sep=\" \", index_col=0).T" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "pycharm": { "is_executing": false }, "scrolled": true }, "outputs": [ { "data": { "text/plain": " fish1.1 fish1.2 fish1.3 fish1.4 fish1.5 fish1.6 fish1.8 \\\nEGFR 0.0 1.0 0.0 0.0 1.0 0.0 1.0 \nSOX10 191.0 115.0 86.0 40.0 74.0 97.0 151.0 \nCCNA2 2.0 2.0 7.0 4.0 4.0 5.0 10.0 \nGAPDH 270.0 241.0 192.0 87.0 149.0 165.0 267.0 \nWNT5A 0.0 0.0 0.0 0.0 1.0 0.0 0.0 \nPDGFRB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nPDGFC 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nSERPINE1 21.0 7.0 4.0 0.0 2.0 6.0 16.0 \nNGFR 4.0 8.0 4.0 2.0 1.0 4.0 5.0 \nNRG1 1.0 5.0 0.0 1.0 2.0 1.0 11.0 \nFOSL1 3.0 3.0 3.0 0.0 0.0 1.0 0.0 \nVEGFC 11.0 19.0 2.0 3.0 4.0 6.0 4.0 \nAXL 3.0 1.0 1.0 0.0 0.0 0.0 1.0 \nMITF 34.0 20.0 10.0 8.0 12.0 14.0 40.0 \nLOXL2 4.0 43.0 8.0 0.0 1.0 7.0 28.0 \nRUNX2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nFGFR1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 \nJUN 0.0 4.0 1.0 0.0 0.0 0.0 1.0 \nVGF 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nBABAM1 NaN NaN NaN NaN NaN NaN NaN \nKDM5A NaN NaN NaN NaN NaN NaN NaN \nLMNA NaN NaN NaN NaN NaN NaN NaN \nKDM5B NaN NaN NaN NaN NaN NaN NaN \nC1S NaN NaN NaN NaN NaN NaN NaN \nVCL NaN NaN NaN NaN NaN NaN NaN \nTXNRD1 NaN NaN NaN NaN NaN NaN NaN \n\n fish1.9 fish1.10 fish1.11 ... fish7.21305 fish7.21306 \\\nEGFR 0.0 2.0 0.0 ... 2.0 0.0 \nSOX10 98.0 97.0 167.0 ... NaN NaN \nCCNA2 7.0 14.0 15.0 ... NaN NaN \nGAPDH 199.0 292.0 225.0 ... 183.0 306.0 \nWNT5A 0.0 0.0 0.0 ... NaN NaN \nPDGFRB 0.0 0.0 0.0 ... 
NaN NaN \nPDGFC 0.0 0.0 0.0 ... NaN NaN \nSERPINE1 2.0 7.0 12.0 ... NaN NaN \nNGFR 1.0 2.0 3.0 ... NaN NaN \nNRG1 3.0 2.0 6.0 ... NaN NaN \nFOSL1 3.0 0.0 0.0 ... NaN NaN \nVEGFC 4.0 2.0 2.0 ... NaN NaN \nAXL 1.0 0.0 1.0 ... NaN NaN \nMITF 31.0 30.0 14.0 ... NaN NaN \nLOXL2 7.0 8.0 17.0 ... 38.0 0.0 \nRUNX2 0.0 0.0 0.0 ... NaN NaN \nFGFR1 1.0 0.0 0.0 ... NaN NaN \nJUN 1.0 0.0 0.0 ... NaN NaN \nVGF 0.0 0.0 0.0 ... NaN NaN \nBABAM1 NaN NaN NaN ... NaN NaN \nKDM5A NaN NaN NaN ... NaN NaN \nLMNA NaN NaN NaN ... NaN NaN \nKDM5B NaN NaN NaN ... NaN NaN \nC1S NaN NaN NaN ... NaN NaN \nVCL NaN NaN NaN ... NaN NaN \nTXNRD1 NaN NaN NaN ... 40.0 68.0 \n\n fish7.21307 fish7.21308 fish7.21309 fish7.21310 fish7.21311 \\\nEGFR 0.0 0.0 1.0 4.0 0.0 \nSOX10 NaN NaN NaN NaN NaN \nCCNA2 NaN NaN NaN NaN NaN \nGAPDH 142.0 114.0 462.0 1031.0 60.0 \nWNT5A NaN NaN NaN NaN NaN \nPDGFRB NaN NaN NaN NaN NaN \nPDGFC NaN NaN NaN NaN NaN \nSERPINE1 NaN NaN NaN NaN NaN \nNGFR NaN NaN NaN NaN NaN \nNRG1 NaN NaN NaN NaN NaN \nFOSL1 NaN NaN NaN NaN NaN \nVEGFC NaN NaN NaN NaN NaN \nAXL NaN NaN NaN NaN NaN \nMITF NaN NaN NaN NaN NaN \nLOXL2 1.0 2.0 11.0 14.0 1.0 \nRUNX2 NaN NaN NaN NaN NaN \nFGFR1 NaN NaN NaN NaN NaN \nJUN NaN NaN NaN NaN NaN \nVGF NaN NaN NaN NaN NaN \nBABAM1 NaN NaN NaN NaN NaN \nKDM5A NaN NaN NaN NaN NaN \nLMNA NaN NaN NaN NaN NaN \nKDM5B NaN NaN NaN NaN NaN \nC1S NaN NaN NaN NaN NaN \nVCL NaN NaN NaN NaN NaN \nTXNRD1 19.0 12.0 42.0 154.0 9.0 \n\n fish7.21312 fish7.21313 fish7.21314 \nEGFR 0.0 0.0 0.0 \nSOX10 NaN NaN NaN \nCCNA2 NaN NaN NaN \nGAPDH 51.0 105.0 41.0 \nWNT5A NaN NaN NaN \nPDGFRB NaN NaN NaN \nPDGFC NaN NaN NaN \nSERPINE1 NaN NaN NaN \nNGFR NaN NaN NaN \nNRG1 NaN NaN NaN \nFOSL1 NaN NaN NaN \nVEGFC NaN NaN NaN \nAXL NaN NaN NaN \nMITF NaN NaN NaN \nLOXL2 7.0 7.0 1.0 \nRUNX2 NaN NaN NaN \nFGFR1 NaN NaN NaN \nJUN NaN NaN NaN \nVGF NaN NaN NaN \nBABAM1 NaN NaN NaN \nKDM5A NaN NaN NaN \nLMNA NaN NaN NaN \nKDM5B NaN NaN NaN \nC1S NaN NaN NaN \nVCL NaN NaN NaN \nTXNRD1 4.0 
5.0 24.0 \n\n[26 rows x 88040 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
fish1.1fish1.2fish1.3fish1.4fish1.5fish1.6fish1.8fish1.9fish1.10fish1.11...fish7.21305fish7.21306fish7.21307fish7.21308fish7.21309fish7.21310fish7.21311fish7.21312fish7.21313fish7.21314
EGFR0.01.00.00.01.00.01.00.02.00.0...2.00.00.00.01.04.00.00.00.00.0
SOX10191.0115.086.040.074.097.0151.098.097.0167.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
CCNA22.02.07.04.04.05.010.07.014.015.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
GAPDH270.0241.0192.087.0149.0165.0267.0199.0292.0225.0...183.0306.0142.0114.0462.01031.060.051.0105.041.0
WNT5A0.00.00.00.01.00.00.00.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
PDGFRB0.00.00.00.00.00.00.00.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
PDGFC0.00.00.00.00.00.00.00.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
SERPINE121.07.04.00.02.06.016.02.07.012.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
NGFR4.08.04.02.01.04.05.01.02.03.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
NRG11.05.00.01.02.01.011.03.02.06.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
FOSL13.03.03.00.00.01.00.03.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
VEGFC11.019.02.03.04.06.04.04.02.02.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
AXL3.01.01.00.00.00.01.01.00.01.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
MITF34.020.010.08.012.014.040.031.030.014.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
LOXL24.043.08.00.01.07.028.07.08.017.0...38.00.01.02.011.014.01.07.07.01.0
RUNX20.00.00.00.00.00.00.00.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
FGFR10.01.00.00.00.00.00.01.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
JUN0.04.01.00.00.00.01.01.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
VGF0.00.00.00.00.00.00.00.00.00.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
BABAM1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
KDM5ANaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
LMNANaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
KDM5BNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
C1SNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
VCLNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
TXNRD1NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...40.068.019.012.042.0154.09.04.05.024.0
\n

26 rows × 88040 columns

\n
" }, "metadata": {}, "output_type": "execute_result", "execution_count": 5 } ], "source": [ "melanoma_fish_pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We fill missing values with -1 as [loom](http://loompy.org/) not support np.nan dtype." ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [ { "data": { "text/plain": " fish1.1 fish1.2 fish1.3 fish1.4 fish1.5 fish1.6 fish1.8 \\\nEGFR 0.0 1.0 0.0 0.0 1.0 0.0 1.0 \nSOX10 191.0 115.0 86.0 40.0 74.0 97.0 151.0 \nCCNA2 2.0 2.0 7.0 4.0 4.0 5.0 10.0 \nGAPDH 270.0 241.0 192.0 87.0 149.0 165.0 267.0 \nWNT5A 0.0 0.0 0.0 0.0 1.0 0.0 0.0 \nPDGFRB 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nPDGFC 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nSERPINE1 21.0 7.0 4.0 0.0 2.0 6.0 16.0 \nNGFR 4.0 8.0 4.0 2.0 1.0 4.0 5.0 \nNRG1 1.0 5.0 0.0 1.0 2.0 1.0 11.0 \nFOSL1 3.0 3.0 3.0 0.0 0.0 1.0 0.0 \nVEGFC 11.0 19.0 2.0 3.0 4.0 6.0 4.0 \nAXL 3.0 1.0 1.0 0.0 0.0 0.0 1.0 \nMITF 34.0 20.0 10.0 8.0 12.0 14.0 40.0 \nLOXL2 4.0 43.0 8.0 0.0 1.0 7.0 28.0 \nRUNX2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nFGFR1 0.0 1.0 0.0 0.0 0.0 0.0 0.0 \nJUN 0.0 4.0 1.0 0.0 0.0 0.0 1.0 \nVGF 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \nBABAM1 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \nKDM5A -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \nLMNA -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \nKDM5B -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \nC1S -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \nVCL -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \nTXNRD1 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 -1.0 \n\n fish1.9 fish1.10 fish1.11 ... fish7.21305 fish7.21306 \\\nEGFR 0.0 2.0 0.0 ... 2.0 0.0 \nSOX10 98.0 97.0 167.0 ... -1.0 -1.0 \nCCNA2 7.0 14.0 15.0 ... -1.0 -1.0 \nGAPDH 199.0 292.0 225.0 ... 183.0 306.0 \nWNT5A 0.0 0.0 0.0 ... -1.0 -1.0 \nPDGFRB 0.0 0.0 0.0 ... -1.0 -1.0 \nPDGFC 0.0 0.0 0.0 ... -1.0 -1.0 \nSERPINE1 2.0 7.0 12.0 ... -1.0 -1.0 \nNGFR 1.0 2.0 3.0 ... -1.0 -1.0 \nNRG1 3.0 2.0 6.0 ... -1.0 -1.0 \nFOSL1 3.0 0.0 0.0 ... -1.0 -1.0 \nVEGFC 4.0 2.0 2.0 ... -1.0 -1.0 \nAXL 1.0 0.0 1.0 ... 
-1.0 -1.0 \nMITF 31.0 30.0 14.0 ... -1.0 -1.0 \nLOXL2 7.0 8.0 17.0 ... 38.0 0.0 \nRUNX2 0.0 0.0 0.0 ... -1.0 -1.0 \nFGFR1 1.0 0.0 0.0 ... -1.0 -1.0 \nJUN 1.0 0.0 0.0 ... -1.0 -1.0 \nVGF 0.0 0.0 0.0 ... -1.0 -1.0 \nBABAM1 -1.0 -1.0 -1.0 ... -1.0 -1.0 \nKDM5A -1.0 -1.0 -1.0 ... -1.0 -1.0 \nLMNA -1.0 -1.0 -1.0 ... -1.0 -1.0 \nKDM5B -1.0 -1.0 -1.0 ... -1.0 -1.0 \nC1S -1.0 -1.0 -1.0 ... -1.0 -1.0 \nVCL -1.0 -1.0 -1.0 ... -1.0 -1.0 \nTXNRD1 -1.0 -1.0 -1.0 ... 40.0 68.0 \n\n fish7.21307 fish7.21308 fish7.21309 fish7.21310 fish7.21311 \\\nEGFR 0.0 0.0 1.0 4.0 0.0 \nSOX10 -1.0 -1.0 -1.0 -1.0 -1.0 \nCCNA2 -1.0 -1.0 -1.0 -1.0 -1.0 \nGAPDH 142.0 114.0 462.0 1031.0 60.0 \nWNT5A -1.0 -1.0 -1.0 -1.0 -1.0 \nPDGFRB -1.0 -1.0 -1.0 -1.0 -1.0 \nPDGFC -1.0 -1.0 -1.0 -1.0 -1.0 \nSERPINE1 -1.0 -1.0 -1.0 -1.0 -1.0 \nNGFR -1.0 -1.0 -1.0 -1.0 -1.0 \nNRG1 -1.0 -1.0 -1.0 -1.0 -1.0 \nFOSL1 -1.0 -1.0 -1.0 -1.0 -1.0 \nVEGFC -1.0 -1.0 -1.0 -1.0 -1.0 \nAXL -1.0 -1.0 -1.0 -1.0 -1.0 \nMITF -1.0 -1.0 -1.0 -1.0 -1.0 \nLOXL2 1.0 2.0 11.0 14.0 1.0 \nRUNX2 -1.0 -1.0 -1.0 -1.0 -1.0 \nFGFR1 -1.0 -1.0 -1.0 -1.0 -1.0 \nJUN -1.0 -1.0 -1.0 -1.0 -1.0 \nVGF -1.0 -1.0 -1.0 -1.0 -1.0 \nBABAM1 -1.0 -1.0 -1.0 -1.0 -1.0 \nKDM5A -1.0 -1.0 -1.0 -1.0 -1.0 \nLMNA -1.0 -1.0 -1.0 -1.0 -1.0 \nKDM5B -1.0 -1.0 -1.0 -1.0 -1.0 \nC1S -1.0 -1.0 -1.0 -1.0 -1.0 \nVCL -1.0 -1.0 -1.0 -1.0 -1.0 \nTXNRD1 19.0 12.0 42.0 154.0 9.0 \n\n fish7.21312 fish7.21313 fish7.21314 \nEGFR 0.0 0.0 0.0 \nSOX10 -1.0 -1.0 -1.0 \nCCNA2 -1.0 -1.0 -1.0 \nGAPDH 51.0 105.0 41.0 \nWNT5A -1.0 -1.0 -1.0 \nPDGFRB -1.0 -1.0 -1.0 \nPDGFC -1.0 -1.0 -1.0 \nSERPINE1 -1.0 -1.0 -1.0 \nNGFR -1.0 -1.0 -1.0 \nNRG1 -1.0 -1.0 -1.0 \nFOSL1 -1.0 -1.0 -1.0 \nVEGFC -1.0 -1.0 -1.0 \nAXL -1.0 -1.0 -1.0 \nMITF -1.0 -1.0 -1.0 \nLOXL2 7.0 7.0 1.0 \nRUNX2 -1.0 -1.0 -1.0 \nFGFR1 -1.0 -1.0 -1.0 \nJUN -1.0 -1.0 -1.0 \nVGF -1.0 -1.0 -1.0 \nBABAM1 -1.0 -1.0 -1.0 \nKDM5A -1.0 -1.0 -1.0 \nLMNA -1.0 -1.0 -1.0 \nKDM5B -1.0 -1.0 -1.0 \nC1S -1.0 -1.0 -1.0 \nVCL -1.0 -1.0 -1.0 \nTXNRD1 4.0 5.0 
24.0 \n\n[26 rows x 88040 columns]", "text/html": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
fish1.1fish1.2fish1.3fish1.4fish1.5fish1.6fish1.8fish1.9fish1.10fish1.11...fish7.21305fish7.21306fish7.21307fish7.21308fish7.21309fish7.21310fish7.21311fish7.21312fish7.21313fish7.21314
EGFR0.01.00.00.01.00.01.00.02.00.0...2.00.00.00.01.04.00.00.00.00.0
SOX10191.0115.086.040.074.097.0151.098.097.0167.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
CCNA22.02.07.04.04.05.010.07.014.015.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
GAPDH270.0241.0192.087.0149.0165.0267.0199.0292.0225.0...183.0306.0142.0114.0462.01031.060.051.0105.041.0
WNT5A0.00.00.00.01.00.00.00.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
PDGFRB0.00.00.00.00.00.00.00.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
PDGFC0.00.00.00.00.00.00.00.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
SERPINE121.07.04.00.02.06.016.02.07.012.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
NGFR4.08.04.02.01.04.05.01.02.03.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
NRG11.05.00.01.02.01.011.03.02.06.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
FOSL13.03.03.00.00.01.00.03.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
VEGFC11.019.02.03.04.06.04.04.02.02.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
AXL3.01.01.00.00.00.01.01.00.01.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
MITF34.020.010.08.012.014.040.031.030.014.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
LOXL24.043.08.00.01.07.028.07.08.017.0...38.00.01.02.011.014.01.07.07.01.0
RUNX20.00.00.00.00.00.00.00.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
FGFR10.01.00.00.00.00.00.01.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
JUN0.04.01.00.00.00.01.01.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
VGF0.00.00.00.00.00.00.00.00.00.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
BABAM1-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
KDM5A-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
LMNA-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
KDM5B-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
C1S-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
VCL-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0
TXNRD1-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0-1.0...40.068.019.012.042.0154.09.04.05.024.0
\n

26 rows × 88040 columns

\n
" }, "metadata": {}, "output_type": "execute_result", "execution_count": 6 } ], "source": [ "melanoma_fish_pd = melanoma_fish_pd.fillna(-1)\n", "melanoma_fish_pd" ] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "## 2. Format transformation and cell filtering" ] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "`DISC` uses [loom](http://loompy.org/) as its I/O format so we save these data as loom-formatted files." ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [
"# Write the RNA-seq count matrix (genes x cells) to original.loom.\n",
"with h5py.File(\"E:/DISC/reproducibility/data/MELANOMA/original.loom\", \"w\") as out_f:\n",
"    # These groups are created empty; nothing is stored in them here.\n",
"    out_f.create_group(\"row_graphs\")\n",
"    out_f.create_group(\"col_graphs\")\n",
"    out_f.create_group(\"layers\")\n",
"    # np.bytes_ is the same type as the old np.string_ alias, which NumPy 2.0 removed.\n",
"    out_f[\"row_attrs/Gene\"] = melanoma_rnaseq_pd.index.values.astype(np.bytes_)\n",
"    out_f[\"col_attrs/CellID\"] = melanoma_rnaseq_pd.columns.values.astype(np.bytes_)\n",
"    # One HDF5 chunk per column (cell); values are stored as float32.\n",
"    out_f.create_dataset(\"matrix\", shape=melanoma_rnaseq_pd.shape,\n",
"                         chunks=(melanoma_rnaseq_pd.shape[0], 1), dtype=np.float32, fletcher32=False,\n",
"                         compression=\"gzip\", shuffle=False, compression_opts=2)\n",
"    out_f[\"matrix\"][...] = melanoma_rnaseq_pd.values" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": { "pycharm": { "is_executing": false, "name": "#%%\n" } }, "outputs": [], "source": [
"# Write the FISH matrix (genes x cells) to fish.loom; missing values were replaced by -1 above.\n",
"with h5py.File(\"E:/DISC/reproducibility/data/MELANOMA/fish.loom\", \"w\") as out_f:\n",
"    # These groups are created empty; nothing is stored in them here.\n",
"    out_f.create_group(\"row_graphs\")\n",
"    out_f.create_group(\"col_graphs\")\n",
"    out_f.create_group(\"layers\")\n",
"    # np.bytes_ is the same type as the old np.string_ alias, which NumPy 2.0 removed.\n",
"    out_f[\"row_attrs/Gene\"] = melanoma_fish_pd.index.values.astype(np.bytes_)\n",
"    out_f[\"col_attrs/CellID\"] = melanoma_fish_pd.columns.values.astype(np.bytes_)\n",
"    # One HDF5 chunk per column (cell); values are stored as float32.\n",
"    out_f.create_dataset(\"matrix\", shape=melanoma_fish_pd.shape,\n",
"                         chunks=(melanoma_fish_pd.shape[0], 1), dtype=np.float32, fletcher32=False,\n",
"                         compression=\"gzip\", shuffle=False, compression_opts=2)\n",
"    out_f[\"matrix\"][...] = melanoma_fish_pd.values" ] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [ "We remove cells with library size less than 500 or greater than 20,000 for RNA-seq data, as [SAVER](https://www.nature.com/articles/s41592-018-0033-z) does."
] },
{ "cell_type": "code", "execution_count": 9, "metadata": { "pycharm": { "is_executing": false } }, "outputs": [], "source": [
 "# Cells whose library size (total counts) falls outside these inclusive bounds are dropped.\n",
 "MIN_LIBRARY_SIZE = 500\n",
 "MAX_LIBRARY_SIZE = 20000\n",
 "\n",
 "# Read the unfiltered data first, so raw.loom is not created or truncated if original.loom cannot be read.\n",
 "with h5py.File(\"E:/DISC/reproducibility/data/MELANOMA/original.loom\", \"r\", libver='latest', swmr=True) as f:\n",
 "    gene_bc_mat = f[\"matrix\"][...]\n",
 "    gene_name = f[\"row_attrs/Gene\"][...]\n",
 "    cell_id = f[\"col_attrs/CellID\"][...]\n",
 "\n",
 "# Library size is the per-column (per-cell) sum; compute it once and reuse it for both bounds.\n",
 "library_size = gene_bc_mat.sum(0)\n",
 "cell_filter = (library_size >= MIN_LIBRARY_SIZE) & (library_size <= MAX_LIBRARY_SIZE)\n",
 "gene_bc_filt = gene_bc_mat[:, cell_filter]\n",
 "\n",
 "# Write the filtered matrix with the same loom layout as original.loom; the gene list is unchanged.\n",
 "with h5py.File(\"E:/DISC/reproducibility/data/MELANOMA/raw.loom\", \"w\") as out_f:\n",
 "    out_f.create_group(\"row_graphs\")\n",
 "    out_f.create_group(\"col_graphs\")\n",
 "    out_f.create_group(\"layers\")\n",
 "    out_f[\"row_attrs/Gene\"] = gene_name\n",
 "    out_f[\"col_attrs/CellID\"] = cell_id[cell_filter]\n",
 "    # chunks=(n_rows, 1): every retained column (cell) is stored and gzip-compressed as its own chunk.\n",
 "    out_f.create_dataset(\"matrix\", shape=gene_bc_filt.shape,\n",
 "                         chunks=(gene_bc_filt.shape[0], 1), dtype=np.float32, fletcher32=False,\n",
 "                         compression=\"gzip\", shuffle=False, compression_opts=2)\n",
 "    out_f[\"matrix\"][...] = gene_bc_filt"
] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [
 "We will use `raw.loom`(RNA-seq) for imputation and `fish.loom`(FISH) for evaluation."
] },
{ "cell_type": "markdown", "metadata": { "pycharm": { "name": "#%% md\n" } }, "source": [
 "Reference: \n",
 "\n",
 "1. Huang, M. et al. SAVER: gene expression recovery for single-cell RNA sequencing. Nature methods 15, 539–542 (2018)."
] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.2" }, "pycharm": { "stem_cell": { "cell_type": "raw", "source": [], "metadata": { "collapsed": false } } } },
"nbformat": 4, "nbformat_minor": 1 }