{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Identify the file names to pull from the NCBI FTP SRA metadata dump"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import subprocess\n",
    "from ftplib import FTP\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# local directory the NCBI files are downloaded into\n",
    "bdir='/cellar/users/btsui/Data/SRA/DUMP/'\n",
    "# local directory the metadata tar is extracted into\n",
    "untaredDir='/nrnb/users/btsui/tmp/SRA_META/'\n",
    "\n",
    "ftpLink='ftp.ncbi.nlm.nih.gov'\n",
    "myRemoteDir='sra/reports/Metadata/'\n",
    "\n",
    "# list the remote directory; the context manager closes the connection afterwards\n",
    "with FTP(ftpLink) as ftp:\n",
    "    ftp.login()\n",
    "    ftp.cwd(myRemoteDir)\n",
    "    fnames=pd.Series(ftp.nlst())\n",
    "\n",
    "# the file names carry a date stamp (e.g. ..._20181005.tar.gz), so after sorting\n",
    "# the last entry is the most recent full dump\n",
    "fullSraMetaCandidates=fnames[fnames.str.contains('NCBI_SRA_Metadata_Full')].sort_values()\n",
    "if fullSraMetaCandidates.empty:\n",
    "    raise RuntimeError('no NCBI_SRA_Metadata_Full archive found in '+myRemoteDir)\n",
    "myFullSraMeta=fullSraMetaCandidates.iloc[-1]\n",
    "myDownloadFnames=['SRA_Accessions.tab',myFullSraMeta,\n",
    "                  'SRA_Run_Members.tab']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!ls -lah /cellar/users/btsui/Data/SRA/DUMP/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Files to be downloaded from NCBI: ['SRA_Accessions.tab', 'NCBI_SRA_Metadata_Full_20181005.tar.gz', 'SRA_Run_Members.tab']\n"
     ]
    }
   ],
   "source": [
    "print ('Files to be downloaded from NCBI:',myDownloadFnames)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### remove the old NCBI tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dtype=object keeps the .str accessor usable even when the directory is empty\n",
    "fnames_in_base_dir=pd.Series(os.listdir(bdir),dtype=object)\n",
    "# raw string so the regex escapes are not parsed as (invalid) Python string escapes\n",
    "existing_sra_tars=fnames_in_base_dir[fnames_in_base_dir.str.contains(r'NCBI_SRA_Metadata.*\\.tar\\.gz$')]\n",
    "# None when there is no previous dump (e.g. on a first run) instead of raising IndexError\n",
    "existing_sra_tar=existing_sra_tars.iloc[0] if len(existing_sra_tars) else None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#existing_sra_tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# remove the existing NCBI_SRA_Metadata tar file, if there is one;\n",
    "# os.remove raises on failure instead of returning an exit status that is easy to overlook\n",
    "if existing_sra_tar is not None:\n",
    "    os.remove(os.path.join(bdir,existing_sra_tar))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### pull all the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# last recorded run of this cell: 5min 36s wall time\n",
    "for f in myDownloadFnames:\n",
    "    fileDir = os.path.join(bdir,f)\n",
    "    # download into a temporary name so an interrupted transfer never leaves a\n",
    "    # truncated file under the final name\n",
    "    partDir = fileDir+'.part'\n",
    "    # reopen the FTP connection for every file to avoid idling;\n",
    "    # the context managers close the connection and the file even on errors\n",
    "    with FTP(ftpLink) as ftp:\n",
    "        ftp.login()\n",
    "        ftp.cwd(myRemoteDir)\n",
    "        with open(partDir,'wb') as File:\n",
    "            ftp.retrbinary('RETR %s' % f, File.write)\n",
    "    os.replace(partDir,fileDir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### extract the files from the tar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!ls /nrnb/users/btsui/tmp/SRA_META/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# a previous run of os.system('mkdir ...') returned the non-zero status 256,\n",
    "# most likely because the directory already existed; exist_ok makes this cell re-runnable\n",
    "os.makedirs(untaredDir,exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# last untarring time: 28m57.656s\n",
    "# argument list (no shell) and check=True: a failing tar raises CalledProcessError\n",
    "# instead of silently returning a non-zero status\n",
    "tarCmd=['tar','--skip-old-files','-xvf',os.path.join(bdir,myFullSraMeta),'-C',untaredDir]\n",
    "subprocess.run(tarCmd,check=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "exit(0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# scratch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 15G\r\n",
      "drwxr-xr-x 6 btsui users 512 Jan 4 2018 .\r\n",
      "drwxr-xr-x 6 btsui CarterGeneral 512 Aug 10 2016 ..\r\n",
      "drwxr-xr-x 11 btsui users 128K Oct 2 2015 METAMAP\r\n",
      "-rw-r--r-- 1 btsui users 15G Jul 4 2016 NCBI_SRA_Metadata_Full_20160702_Run_2.tar\r\n",
      "drwxr-xr-x 1046455 btsui users 62M Oct 12 16:17 SRA_META\r\n",
      "drwxr-xr-x 3 btsui users 512 Aug 31 2015 TSCC\r\n",
      "drwxr-xr-x 2 btsui users 128K Aug 12 2015 bioSample\r\n"
     ]
    }
   ],
   "source": [
    "!ls -lah /nrnb/users/btsui/tmp/"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}