{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TODAY'S DATE:\n", "Tue Dec 3 20:42:13 PST 2019\n", "------------\n", "\n", "Distributor ID:\tUbuntu\n", "Description:\tUbuntu 16.04.6 LTS\n", "Release:\t16.04\n", "Codename:\txenial\n", "\n", "------------\n", "HOSTNAME: \n", "swoose\n", "\n", "------------\n", "Computer Specs:\n", "\n", "Architecture: x86_64\n", "CPU op-mode(s): 32-bit, 64-bit\n", "Byte Order: Little Endian\n", "CPU(s): 24\n", "On-line CPU(s) list: 0-23\n", "Thread(s) per core: 2\n", "Core(s) per socket: 6\n", "Socket(s): 2\n", "NUMA node(s): 1\n", "Vendor ID: GenuineIntel\n", "CPU family: 6\n", "Model: 44\n", "Model name: Intel(R) Xeon(R) CPU X5670 @ 2.93GHz\n", "Stepping: 2\n", "CPU MHz: 2926.194\n", "BogoMIPS: 5852.50\n", "Virtualization: VT-x\n", "L1d cache: 32K\n", "L1i cache: 32K\n", "L2 cache: 256K\n", "L3 cache: 12288K\n", "NUMA node0 CPU(s): 0-23\n", "Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16 xtpr pdcm pcid dca sse4_1 sse4_2 popcnt aes lahf_lm epb ssbd ibrs ibpb stibp kaiser tpr_shadow vnmi flexpriority ept vpid dtherm ida arat flush_l1d\n", "\n", "------------\n", "\n", "Memory Specs\n", "\n", " total used free shared buff/cache available\n", "Mem: 70G 4.4G 50G 559M 15G 65G\n", "Swap: 4.7G 0B 4.7G\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "No LSB modules are available.\n" ] } ], "source": [ "%%bash\n", "echo \"TODAY'S DATE:\"\n", "date\n", "echo \"------------\"\n", "echo \"\"\n", "#Display operating system info\n", "lsb_release -a\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"HOSTNAME: \"; hostname \n", "echo \"\"\n", "echo \"------------\"\n", "echo \"Computer Specs:\"\n", "echo \"\"\n", "lscpu\n", "echo \"\"\n", "echo \"------------\"\n", "echo \"\"\n", "echo \"Memory Specs\"\n", "echo \"\"\n", "free -mh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Set variables\n", "\n", "`%env` are best for bash" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "env: wd=/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts\n", "env: downloads_list=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3 Pgenerosa_v074.fa.fai\n", "env: old_fa_index=Pgenerosa_v074.fa.fai\n", "env: org_merged_gff=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3\n", "env: new_merged_gff=Panopea-generosa-v1.0.a4.gff3\n", "env: new_file=Panopea-generosa-v1.0.a4\n", "env: new_repeats_file=Panopea-generosa-v1.0.a4.repeats\n", "env: repeats_gff=Panopea-generosa-v1.0.a4.repeat_region.gff3\n" ] } ], "source": [ "# Set workding directory\n", "%env wd=/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts\n", "wd=\"/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts\"\n", "\n", "# File download\n", "%env downloads_list=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3 Pgenerosa_v074.fa.fai\n", "\n", "\n", "# Input/output files\n", "%env old_fa_index=Pgenerosa_v074.fa.fai\n", "%env org_merged_gff=Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3\n", "%env new_merged_gff=Panopea-generosa-v1.0.a4.gff3\n", "%env new_file=Panopea-generosa-v1.0.a4\n", "%env new_repeats_file=Panopea-generosa-v1.0.a4.repeats\n", "%env repeats_gff=Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "\n", "\n", "# Set lists of column header names\n", "gff_header = ['seqid','source','type','start','end','score','strand','phase','attributes']\n", "\n", "# Set genome size to 942Mbp\n", "GENOME_SIZE = 942000000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import Python modules" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import fnmatch\n", "import os\n", "import pandas" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Function to calculate percentage of genome comprised of a given feature\n", "def ind_repeats_percent(feature_length_sum): \n", " return round(float(feature_length_sum / GENOME_SIZE * 100), 2)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "%%bash\n", "mkdir --parents ${wd}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/sam/analyses/20191203_pgen_v074.a4_genome_feature_counts\n" ] } ], "source": [ "cd {wd}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Download files" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "receiving incremental file list\n", "Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3\n", "\n", "sent 30 bytes received 580,544,162 bytes 20,369,971.65 bytes/sec\n", "total size is 580,473,160 speedup is 1.00\n", "receiving incremental file list\n", "\n", "sent 11 bytes received 65 bytes 50.67 bytes/sec\n", "total size is 1,230 speedup is 16.18\n", "total 554M\n", "-rw-rw-r-- 1 sam users 554M Dec 3 14:06 Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3\n", "-rw-rw-rw- 1 sam users 1.3K Jun 26 08:54 Pgenerosa_v074.fa.fai\n" ] } ], "source": [ "%%bash\n", "mapfile -t downloads_array < <(echo ${downloads_list} | tr \" \" \"\\n\")\n", "\n", "for file in \"${downloads_array[@]}\"\n", "do\n", " rsync --archive --verbose \\\n", " owl:/volume1/web/halfshell/genomic-databank/${file} .\n", "done\n", "\n", "ls -lh\n", "\n", "## If need to download via wget, uncomment and run lines below:\n", "#for file in \"${downloads_array[@]}\"\n", "#do\n", "# wget --quiet --no-directories --no-check-certificate https://owl.fish.washington.edu/halfshell/genomic-databank/\"${file}\"\n", "#done" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Rename scaffolds" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "List of scaffold names in Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3:\n", "--------------------\n", "\n", " 1 ##Generated\n", " 1 ##gff-version\n", " 130305 PGA_scaffold10__49_contigs__length_53961475\n", " 118051 PGA_scaffold11__79_contigs__length_51449921\n", " 117911 PGA_scaffold12__71_contigs__length_50438331\n", " 109339 PGA_scaffold13__52_contigs__length_44396874\n", " 115751 PGA_scaffold14__91_contigs__length_45393038\n", " 111166 PGA_scaffold15__101_contigs__length_47938513\n", " 74528 PGA_scaffold16__33_contigs__length_31980953\n", " 84154 PGA_scaffold17__51_contigs__length_34923512\n", " 207214 PGA_scaffold1__77_contigs__length_89643857\n", " 115737 PGA_scaffold18__69_contigs__length_27737463\n", " 160589 PGA_scaffold2__36_contigs__length_69596280\n", " 136262 PGA_scaffold3__111_contigs__length_57743597\n", " 145359 PGA_scaffold4__129_contigs__length_65288255\n", " 150064 PGA_scaffold5__109_contigs__length_67248332\n", " 138842 PGA_scaffold6__104_contigs__length_61759565\n", " 101575 PGA_scaffold7__69_contigs__length_43120122\n", " 138958 PGA_scaffold8__63_contigs__length_61151155\n", " 84829 PGA_scaffold9__45_contigs__length_38581958\n", " 1 ##Project\n", "\n", "--------------------\n", "\n", "List of scaffold names in Panopea-generosa-v1.0.a4.gff3:\n", "--------------------\n", "\n", " 1 ##Generated\n", " 1 ##gff-version\n", " 1 ##Project\n", " 207214 Scaffold_01\n", " 160589 Scaffold_02\n", " 136262 Scaffold_03\n", " 145359 Scaffold_04\n", " 150064 Scaffold_05\n", " 138842 Scaffold_06\n", " 101575 Scaffold_07\n", " 138958 Scaffold_08\n", " 84829 Scaffold_09\n", " 130305 Scaffold_10\n", " 118051 Scaffold_11\n", " 117911 Scaffold_12\n", " 109339 Scaffold_13\n", " 115751 Scaffold_14\n", " 111166 Scaffold_15\n", " 74528 Scaffold_16\n", " 84154 Scaffold_17\n", " 115737 Scaffold_18\n", "\n" ] } ], "source": [ "%%bash\n", "# Array of old scaffold names\n", "# Uses first column of FastA index file to get scaffold names\n", "mapfile -t orig_scaffold_names < <(awk '{print $1}' \"${old_fa_index}\")\n", "\n", "# Array of new scaffold names\n", "mapfile -t new_scaffold_names < <(for number in {01..18}; do echo \"Scaffold_${number}\"; done)\n", "\n", "# sed substitution\n", "# creates sed script to find original scaffold names and replace them with new scafold names\n", "# and passes to sed via stdin\n", "for index in \"${!orig_scaffold_names[@]}\"\n", "do\n", " printf \"s/%s/%s/\\n\" \"${orig_scaffold_names[index]}\" \"${new_scaffold_names[index]}\"\n", "done | sed --file - \"${org_merged_gff}\" \\\n", ">> \"${new_merged_gff}\"\n", "\n", "# Check that substituion worked\n", "echo \"\"\n", "echo \"List of scaffold names in ${org_merged_gff}:\"\n", "echo \"--------------------\"\n", "echo \"\"\n", "awk '{print $1}' \"${org_merged_gff}\" | sort | uniq -c\n", "echo \"\"\n", "echo \"--------------------\"\n", "echo \"\"\n", "echo \"List of scaffold names in ${new_merged_gff}:\"\n", "echo \"--------------------\"\n", "echo \"\"\n", "awk '{print $1}' \"${new_merged_gff}\" | sort | uniq -c\n", "echo \"\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Separate features" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Here are the features and their counts in Panopea-generosa-v1.0.a4.gff3:\n", " 236960 CDS\n", " 236960 exon\n", " 34947 gene\n", " 38326 mRNA\n", "1676544 repeat_region\n", " 8 rRNA\n", " 16889 tRNA\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.CDS.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t2\t125\t.\t+\t0\tID=PGEN_.00g000010.m01.CDS01;Name=PGEN_.00g000010.m01.CDS01;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tCDS\t1995\t2095\t.\t+\t1\tID=PGEN_.00g000010.m01.CDS02;Name=PGEN_.00g000010.m01.CDS02;Parent=PGEN_.00g000010.m01;original_ID=cds.21510-PGEN_.00g234140.m01;Alias=cds.21510-PGEN_.00g234140.m01\n", "\n", "\n", "Panopea-generosa-v1.0.a4.exon.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t2\t125\t.\t+\t.\tID=PGEN_.00g000010.m01.exon01;Name=PGEN_.00g000010.m01.exon01;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon1;Alias=21510-PGEN_.00g234140.m01.exon1\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\texon\t1995\t2095\t.\t+\t.\tID=PGEN_.00g000010.m01.exon02;Name=PGEN_.00g000010.m01.exon02;Parent=PGEN_.00g000010.m01;original_ID=21510-PGEN_.00g234140.m01.exon2;Alias=21510-PGEN_.00g234140.m01.exon2\n", "\n", "\n", "Panopea-generosa-v1.0.a4.gene.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t2\t4719\t.\t+\t.\tID=PGEN_.00g000010;Name=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140;Alias=21510-PGEN_.00g234140;original_name=21510-PGEN_.00g234140;Notes=sp|Q86IC9|CAMT1_DICDI [BLAST protein vs protein (blastp) 2.7.1],PF01596.12 [Pfam 1.6]\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tgene\t19808\t36739\t.\t-\t.\tID=PGEN_.00g000020;Name=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150;Alias=21510-PGEN_.00g234150;original_name=21510-PGEN_.00g234150;Notes=sp|P04177|TY3H_RAT [BLAST protein vs protein (blastp) 2.7.1],sp|P04177|TY3H_RAT [DIAMOND Functional 0.9.22],IPR036951 [InterProScan 5.29-68.0],PF00351.16 [Pfam 1.6]\n", "\n", "\n", "Panopea-generosa-v1.0.a4.mRNA.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tmRNA\t2\t4719\t.\t+\t.\tID=PGEN_.00g000010.m01;Name=PGEN_.00g000010.m01;Parent=PGEN_.00g000010;original_ID=21510-PGEN_.00g234140.m01;Alias=21510-PGEN_.00g234140.m01;original_name=21510-PGEN_.00g234140\n", "Scaffold_01\tGenSAS_5d9637f372b5d-publish\tmRNA\t19808\t36739\t.\t-\t.\tID=PGEN_.00g000020.m01;Name=PGEN_.00g000020.m01;Parent=PGEN_.00g000020;original_ID=21510-PGEN_.00g234150.m01;Alias=21510-PGEN_.00g234150.m01;original_name=21510-PGEN_.00g234150\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t265\t338\t334\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000002;Name=19535.GS22252505.PGEN_.repeat00000002;repeat_match=rnd-5_family-367;repeat_class=Unknown;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t855\t1050\t451\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000003;Name=19535.GS22252505.PGEN_.repeat00000003;repeat_match=rnd-5_family-1818;repeat_class=Unknown;\n", "\n", "\n", "Panopea-generosa-v1.0.a4.rRNA.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tRNAmmer-1.2\trRNA\t46743424\t46743531\t48.2\t-\t.\tID=21514.GS22252505.PGEN_.rRNA00000007;Name=21514.GS22252505.PGEN_.rRNA00000007;molecule_type=8s_rRNA;\n", "Scaffold_02\tRNAmmer-1.2\trRNA\t69261970\t69262107\t51.4\t+\t.\tID=21514.GS22252506.PGEN_.rRNA00000003;Name=21514.GS22252506.PGEN_.rRNA00000003;molecule_type=8s_rRNA;\n", "\n", "\n", "Panopea-generosa-v1.0.a4.tRNA.gff3\n", "----------------------------------------------\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d82b316cd298-trnascan\ttRNA\t8317\t8393\t34.9\t.\t.\tID=21513.GS22252505.PGEN_.tRNA00000001;Name=Ala;anti_codon=CGC;gene_no=1;\n", "Scaffold_01\tGenSAS_5d82b316cd298-trnascan\ttRNA\t87519\t87602\t34.9\t.\t.\tID=21513.GS22252505.PGEN_.tRNA00000002;Name=Thr;anti_codon=AGT;gene_no=2;\n" ] } ], "source": [ "%%bash\n", "\n", "echo \"Here are the features and their counts in ${new_merged_gff}:\"\n", "awk 'NR>3 {print $3}' \"${new_merged_gff}\" | sort | uniq -c\n", "echo \"\"\n", "\n", "# Create array for feature names\n", "mapfile -t features_array < <(awk 'NR>3 {print $3}' \"${new_merged_gff}\" | sort | uniq)\n", "\n", "\n", "# Save features in array\n", "for feature in \"${features_array[@]}\"\n", "do\n", " output=\"${new_file}.${feature}.gff3\"\n", " head -n 3 \"${new_merged_gff}\" >> \"${output}\"\n", " awk -v feature=\"$feature\" '$3 == feature {print}' \"${new_merged_gff}\" \\\n", " >> \"${output}\"\n", " echo \"\"\n", " echo \"\"\n", " echo \"${output}\"\n", " echo \"----------------------------------------------\"\n", " head -n 5 \"${output}\"\n", "done\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Feature stats" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.gene.gff3\n", "-------------------------\n", "mean 10811.04461\n", "min 166.00000\n", "median 4464.00000\n", "max 283066.00000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.mRNA.gff3\n", "-------------------------\n", "mean 12903.649559\n", "min 166.000000\n", "median 5453.000000\n", "max 283066.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.tRNA.gff3\n", "-------------------------\n", "mean 74.807745\n", "min 53.000000\n", "median 75.000000\n", "max 314.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.rRNA.gff3\n", "-------------------------\n", "mean 117.125\n", "min 108.000\n", "median 115.000\n", "max 138.000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "-------------------------\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/sam/programs/minicocnda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3049: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n", " interactivity=interactivity, compiler=compiler, result=result)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "mean 212.244974\n", "min 6.000000\n", "median 149.000000\n", "max 10981.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.exon.gff3\n", "-------------------------\n", "mean 201.476988\n", "min 3.000000\n", "median 133.000000\n", "max 13221.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.gff3\n", "-------------------------\n", "mean 591.326038\n", "min 3.000000\n", "median 148.000000\n", "max 283066.000000\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.CDS.gff3\n", "-------------------------\n", "mean 201.476988\n", "min 3.000000\n", "median 133.000000\n", "max 13221.000000\n", "Name: seqlength, dtype: float64\n" ] } ], "source": [ "for file in os.listdir('.'):\n", " if fnmatch.fnmatch(file, 'Panopea-generosa-v1.0.a4*.gff3'):\n", " print('\\n' * 2)\n", " print(file)\n", " print(\"-------------------------\")\n", " \n", " # Import GFF.\n", " # Skip first 3 rows (gff header lines) and indicate file is tab-separated\n", " gff=pandas.read_csv(file, header=None, skiprows=3, sep=\"\\t\")\n", " \n", " # Rename columns\n", " gff.columns = gff_header\n", " \n", " # Subtract start value from end value.\n", " # Have to add 1 so that sequence length can't equal zero (i.e. adjust for 1-based counting system)\n", " gff['seqlength'] = gff.apply(lambda position: position['end'] - position['start'] + 1, axis=1)\n", " \n", " # Apply functions in list to seqlength column\n", " gff_stats = gff['seqlength'].agg(['mean', 'min', 'median', 'max'])\n", " \n", " print (gff_stats)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Separate repeats" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique repeats features in Panopea-generosa-v1.0.a4.repeat_region.gff3:\n", "DNA\n", "LINE\n", "LTR\n", "RC\n", "Simple_repeat\n", "SINE\n", "Unknown\n", "\n", "----------------------------------------------\n", "\n", "Parsing DNA from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "Parsing matching feature lines for DNA feature...\n", "Done with parsing DNA feature.\n", "Identified 23195 DNA features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t11381\t11673\t2176\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000029;Name=19535.GS22252505.PGEN_.repeat00000029;repeat_match=rnd-1_family-791;repeat_class=DNA%2FTcMar-Mariner;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t29272\t29508\t341\t-\t.\tID=19535.GS22252505.PGEN_.repeat00000058;Name=19535.GS22252505.PGEN_.repeat00000058;repeat_match=rnd-1_family-443;repeat_class=DNA%2FTcMar-Mariner;\n", "\n", "----------------------------------------------\n", "Parsing LINE from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "Parsing matching feature lines for LINE feature...\n", "Done with parsing LINE feature.\n", "Identified 75939 LINE features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t5885\t6044\t316\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000014;Name=19535.GS22252505.PGEN_.repeat00000014;repeat_match=rnd-5_family-963;repeat_class=LINE%2FRTE-X;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t45583\t45756\t460\t-\t.\tID=19535.GS22252505.PGEN_.repeat00000074;Name=19535.GS22252505.PGEN_.repeat00000074;repeat_match=rnd-5_family-33;repeat_class=LINE%2FL2;\n", "\n", "----------------------------------------------\n", "Parsing LTR from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "Parsing matching feature lines for LTR feature...\n", "Done with parsing LTR feature.\n", "Identified 3255 LTR features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t1275\t1466\t920\t-\t.\tID=19535.GS22252505.PGEN_.repeat00000005;Name=19535.GS22252505.PGEN_.repeat00000005;repeat_match=rnd-5_family-1106;repeat_class=LTR%2FGypsy;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t670854\t671005\t266\t+\t.\tID=19535.GS22252505.PGEN_.repeat00001290;Name=19535.GS22252505.PGEN_.repeat00001290;repeat_match=rnd-5_family-1106;repeat_class=LTR%2FGypsy;\n", "\n", "----------------------------------------------\n", "Parsing RC from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "Parsing matching feature lines for RC feature...\n", "Done with parsing RC feature.\n", "Identified 603 RC features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t2890875\t2891508\t5041\t-\t.\tID=19535.GS22252505.PGEN_.repeat00005106;Name=19535.GS22252505.PGEN_.repeat00005106;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3120337\t3120414\t311\t+\t.\tID=19535.GS22252505.PGEN_.repeat00005530;Name=19535.GS22252505.PGEN_.repeat00005530;repeat_match=rnd-1_family-384;repeat_class=RC%2FHelitron;\n", "\n", "----------------------------------------------\n", "Parsing Simple_repeat from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n", "Parsing matching feature lines for Simple_repeat feature...\n", "Done with parsing Simple_repeat feature.\n", "Identified 19865 Simple_repeat features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t8890\t9134\t853\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000022;Name=19535.GS22252505.PGEN_.repeat00000022;repeat_match=rnd-1_family-158;repeat_class=Simple_repeat;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t99891\t100132\t1511\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000197;Name=19535.GS22252505.PGEN_.repeat00000197;repeat_match=rnd-6_family-10;repeat_class=Simple_repeat;\n", "\n", "----------------------------------------------\n", "Parsing SINE from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "Parsing matching feature lines for SINE feature...\n", "Done with parsing SINE feature.\n", "Identified 43129 SINE features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t2928\t3030\t668\t-\t.\tID=19535.GS22252505.PGEN_.repeat00000010;Name=19535.GS22252505.PGEN_.repeat00000010;repeat_match=rnd-3_family-517;repeat_class=SINE%2FtRNA-Core-L2;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t3050\t3146\t311\t-\t.\tID=19535.GS22252505.PGEN_.repeat00000011;Name=19535.GS22252505.PGEN_.repeat00000011;repeat_match=rnd-1_family-48;repeat_class=SINE%2FtRNA-Core-L2;\n", "\n", "----------------------------------------------\n", "Parsing Unknown from Panopea-generosa-v1.0.a4.repeat_region.gff3...\n", "Writing GFF3 header to Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "Parsing matching feature lines for Unknown feature...\n", "Done with parsing Unknown feature.\n", "Identified 1510558 Unknown features.\n", "Output file is: Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "##gff-version 3\n", "##Generated using GenSAS, Tuesday 26th of November 2019 07:12:25 PM\n", "##Project Name : Pgenerosa_v074\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t265\t338\t334\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000002;Name=19535.GS22252505.PGEN_.repeat00000002;repeat_match=rnd-5_family-367;repeat_class=Unknown;\n", "Scaffold_01\tGenSAS_5d25089d78791-repeatmodeler\trepeat_region\t855\t1050\t451\t+\t.\tID=19535.GS22252505.PGEN_.repeat00000003;Name=19535.GS22252505.PGEN_.repeat00000003;repeat_match=rnd-5_family-1818;repeat_class=Unknown;\n", "\n", "----------------------------------------------\n", "----------------------------------------------\n", "\n", "total 1.9G\n", "-rw-rw-r-- 1 sam sam 53M Dec 4 13:14 Panopea-generosa-v1.0.a4.CDS.gff3\n", "-rw-rw-r-- 1 sam sam 55M Dec 4 13:14 Panopea-generosa-v1.0.a4.exon.gff3\n", "-rw-rw-r-- 1 sam sam 9.5M Dec 4 13:14 Panopea-generosa-v1.0.a4.gene.gff3\n", "-rw-rw-r-- 1 sam sam 486M Dec 4 13:13 Panopea-generosa-v1.0.a4.gff3\n", "-rw-rw-r-- 1 sam sam 9.1M Dec 4 13:14 Panopea-generosa-v1.0.a4.mRNA.gff3\n", "-rw-rw-r-- 1 sam sam 358M Dec 4 13:14 Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "-rw-rw-r-- 1 sam sam 5.2M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam sam 17M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam sam 726K Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam sam 136K Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam sam 4.4M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam sam 9.7M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam sam 322M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "-rw-rw-r-- 1 sam sam 1.4K Dec 4 13:15 Panopea-generosa-v1.0.a4.rRNA.gff3\n", "-rw-rw-r-- 1 sam sam 2.5M Dec 4 13:15 Panopea-generosa-v1.0.a4.tRNA.gff3\n", "-rw-rw-r-- 1 sam users 554M Dec 3 14:06 Panopea-generosa-vv0.74.a4-merged-2019-11-26-15-30-34.gff3\n", "-rw-rw-rw- 1 sam users 1.3K Jun 26 08:54 Pgenerosa_v074.fa.fai\n" ] } ], "source": [ "%%bash\n", "\n", "# Initialize array\n", "features_array=()\n", "\n", "# Identify unique features in GFF\n", "## Store as an array\n", "## Skip first three header lines, and then cut on two delimiters that are present\n", "echo \"Unique repeats features in ${repeats_gff}:\"\n", "while IFS='' read -r line \n", "do\n", " features_array+=(\"$line\")\n", "done < <(awk -F\"class=\" 'NR >3 {print $2}' \"${repeats_gff}\" \\\n", " | sort -u \\\n", " | cut -d '%' -f 1 \\\n", " | cut -d ';' -f 1 \\\n", " | uniq)\n", "\n", "\n", "# Check array contents\n", "for feature in \"${features_array[@]}\"\n", "do\n", " echo \"${feature}\"\n", "done\n", "\n", "echo \"\"\n", "echo \"----------------------------------------------\"\n", "echo \"\"\n", "\n", "# Loop through array and create new GFFs from each feature\n", "for feature in \"${features_array[@]}\"\n", "do\n", " echo \"Parsing ${feature} from ${repeats_gff}...\"\n", " echo \"Writing GFF3 header to ${new_repeats_file}.${feature}.gff3\"\n", " # Write header to new file\n", " head -n 3 \"${repeats_gff}\" > \"${new_repeats_file}\".\"${feature}\".gff3\n", " echo \"Parsing matching feature lines for ${feature} feature...\"\n", " grep \"repeat_class=${feature}\" \"${repeats_gff}\" >> \"${new_repeats_file}\".\"${feature}\".gff3\n", " echo \"Done with parsing ${feature} feature.\"\n", " # Count number of lines, excluding three line header (oddly, need to run tail --lines n+1, as +1 prints entire file)\n", " feature_count=$(tail --lines +4 \"${new_repeats_file}\".\"${feature}\".gff3 | wc -l)\n", " echo \"Identified ${feature_count} ${feature} features.\"\n", " echo \"Output file is: ${new_repeats_file}.${feature}.gff3\"\n", " head -n 5 \"${new_repeats_file}.${feature}.gff3\"\n", " echo \"\"\n", " echo \"----------------------------------------------\"\n", "done\n", "\n", "echo \"----------------------------------------------\"\n", "echo \"\"\n", "ls -lh" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Sequence length repeats stats" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "-------------------------\n", "percent 0.25\n", "sum 2315583.00\n", "mean 711.83\n", "min 11.00\n", "median 323.00\n", "max 6541.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "-------------------------\n", "percent 0.03\n", "sum 258182.00\n", "mean 429.59\n", "min 13.00\n", "median 464.00\n", "max 674.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n", "-------------------------\n", "percent 0.55\n", "sum 5138701.00\n", "mean 258.71\n", "min 6.00\n", "median 124.00\n", "max 5981.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "-------------------------\n", "percent 1.01\n", "sum 9497156.00\n", "mean 409.48\n", "min 11.00\n", "median 248.00\n", "max 7012.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "-------------------------\n", "percent 0.72\n", "sum 6737909.00\n", "mean 156.23\n", "min 11.00\n", "median 165.00\n", "max 934.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "-------------------------\n", "percent 3.19\n", "sum 30035624.00\n", "mean 395.53\n", "min 11.00\n", "median 226.00\n", "max 6604.00\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "-------------------------\n", "percent 32.04\n", "sum 3.018520e+08\n", "mean 1.998300e+02\n", "min 1.100000e+01\n", "median 1.450000e+02\n", "max 1.098100e+04\n", "Name: seqlength, dtype: float64\n", "\n", "\n", "\n", "-------------------------\n", "Repeats composition of genome (percent): 37.79\n" ] } ], "source": [ "total_repeats_percent = 0\n", "\n", "for file in os.listdir('.'):\n", " if fnmatch.fnmatch(file, 'Panopea-generosa-v1.0.a4.repeats*.gff3'):\n", " print('\\n' * 2)\n", " print(file)\n", " print(\"-------------------------\")\n", " # Import GFF.\n", " # Skip first five rows and file is tab-separated\n", " gff=pandas.read_csv(file, header=None, skiprows=5, sep=\"\\t\")\n", " # Rename columns\n", " gff.columns = gff_header\n", " # Subtract start value from end value.\n", " # Have to add 1 so that sequence length can't equal zero\n", " gff['seqlength'] = gff.apply(lambda position: position['end'] - position['start'] + 1, axis=1)\n", " gff_sum = gff['seqlength'].sum()\n", " \n", " total_repeats_percent += ind_repeats_percent(gff_sum)\n", " print (\"percent\" , ind_repeats_percent(gff_sum))\n", " \n", " # Apply functions in list to seqlength column\n", " gff_stats = gff['seqlength'].agg(['sum', 'mean', 'min', 'median', 'max'])\n", " \n", " print (gff_stats.round(2))\n", "print('\\n' * 2) \n", "print(\"-------------------------\")\n", "print (\"Repeats composition of genome (percent):\" , total_repeats_percent)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "total 1.3G\n", "-rw-rw-r-- 1 sam sam 53M Dec 4 13:14 Panopea-generosa-v1.0.a4.CDS.gff3\n", "-rw-rw-r-- 1 sam sam 55M Dec 4 13:14 Panopea-generosa-v1.0.a4.exon.gff3\n", "-rw-rw-r-- 1 sam sam 9.5M Dec 4 13:14 Panopea-generosa-v1.0.a4.gene.gff3\n", "-rw-rw-r-- 1 sam sam 486M Dec 4 13:13 Panopea-generosa-v1.0.a4.gff3\n", "-rw-rw-r-- 1 sam sam 9.1M Dec 4 13:14 Panopea-generosa-v1.0.a4.mRNA.gff3\n", "-rw-rw-r-- 1 sam sam 358M Dec 4 13:14 Panopea-generosa-v1.0.a4.repeat_region.gff3\n", "-rw-rw-r-- 1 sam sam 5.2M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", "-rw-rw-r-- 1 sam sam 17M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", "-rw-rw-r-- 1 sam sam 726K Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", "-rw-rw-r-- 1 sam sam 136K Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", "-rw-rw-r-- 1 sam sam 4.4M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n", "-rw-rw-r-- 1 sam sam 9.7M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", "-rw-rw-r-- 1 sam sam 322M Dec 4 13:19 Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", "-rw-rw-r-- 1 sam sam 1.4K Dec 4 13:15 Panopea-generosa-v1.0.a4.rRNA.gff3\n", "-rw-rw-r-- 1 sam sam 2.5M Dec 4 13:15 Panopea-generosa-v1.0.a4.tRNA.gff3\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "bash: line 1: fg: no job control\n" ] } ], "source": [ "%%bash\n", "%%bash\n", "mapfile -t downloads_array < <(echo ${downloads_list} | tr \" \" \"\\n\")\n", "\n", "for file in \"${downloads_array[@]}\"\n", "do\n", " rm \"${file}\"\n", "done\n", "\n", "ls -lh" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sending incremental file list\n", "./\n", "20191203_pgen_v074.a4_genome_feature_counts/\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.CDS.gff3\n", " 55,346,545 100% 110.59MB/s 0:00:00 (xfr#1, to-chk=14/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.exon.gff3\n", " 57,148,143 100% 55.44MB/s 0:00:00 (xfr#2, to-chk=13/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.gene.gff3\n", " 9,888,462 100% 8.81MB/s 0:00:01 (xfr#3, to-chk=12/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.gff3\n", " 509,354,871 100% 105.46MB/s 0:00:04 (xfr#4, to-chk=11/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.mRNA.gff3\n", " 9,450,148 100% 12.99MB/s 0:00:00 (xfr#5, to-chk=10/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.rRNA.gff3\n", " 1,388 100% 1.95kB/s 0:00:00 (xfr#6, to-chk=9/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeat_region.gff3\n", " 374,972,067 100% 87.71MB/s 0:00:04 (xfr#7, to-chk=8/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.DNA.gff3\n", " 5,376,409 100% 39.75MB/s 0:00:00 (xfr#8, to-chk=7/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.LINE.gff3\n", " 17,343,693 100% 41.56MB/s 0:00:00 (xfr#9, to-chk=6/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.LTR.gff3\n", " 743,047 100% 1.75MB/s 0:00:00 (xfr#10, to-chk=5/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.RC.gff3\n", " 138,457 100% 333.03kB/s 0:00:00 (xfr#11, to-chk=4/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.SINE.gff3\n", " 10,113,860 100% 19.45MB/s 0:00:00 (xfr#12, to-chk=3/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.Simple_repeat.gff3\n", " 4,538,129 100% 8.07MB/s 0:00:00 (xfr#13, to-chk=2/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.repeats.Unknown.gff3\n", " 336,719,168 100% 75.65MB/s 0:00:04 (xfr#14, to-chk=1/17)\n", "20191203_pgen_v074.a4_genome_feature_counts/Panopea-generosa-v1.0.a4.tRNA.gff3\n", " 2,548,814 100% 10.48MB/s 0:00:00 (xfr#15, to-chk=0/17)\n" ] } ], "source": [ "%%bash\n", "cd ..\n", "rsync --progress --archive --relative \\\n", "./20191203_pgen_v074.a4_genome_feature_counts \\\n", "gannet:/volume2/web/Atumefaciens" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 }