{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Let's start out by searching iDigBio for all of the records that have an \"associatedSequences\" field." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "22161\n" ] } ], "source": [ "import idigbio\n", "\n", "api = idigbio.json()\n", "seq_query = {\"data.dwc:associatedSequences\": {\n", " \"type\":\"exists\"\n", " }\n", " }\n", "seq_count = api.count_records(rq=seq_query)\n", "print seq_count" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "It looks like only 22,161 records (out of 29,342,343 total records) have an entry for the \"associatedSequences\" field.\n", "\n", "Now let's try and grab all of the entries so that we can start checking out counts and formats." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import csv\n", "\n", "PAGE_SIZE = 1000\n", "fields = [\"data.dwc:associatedSequences\", \"recordset\"]\n", "seq_query = {\"data.dwc:associatedSequences\": {\"type\": \"exists\"}}\n", "\n", "\n", "def to_csv_text(value):\n", "    \"\"\"Encode unicode text as UTF-8 bytes; return any other value unchanged.\"\"\"\n", "    # Python 2's csv module cannot write non-ASCII unicode objects directly.\n", "    if isinstance(value, unicode):\n", "        return value.encode('utf-8')\n", "    return value\n", "\n", "\n", "# Page through every matching record (seq_count comes from the count cell above)\n", "# and write one TSV row per record: specimen uuid, recordset uuid, raw field value.\n", "with open('idigbio_seqs_and_recordsets.tsv', 'wb') as outfile:\n", "    tsvwriter = csv.writer(outfile, delimiter=\"\\t\")\n", "    for offset in xrange(0, seq_count, PAGE_SIZE):\n", "        seq_records = api.search_records(rq=seq_query,\n", "                                         limit=PAGE_SIZE, offset=offset,\n", "                                         fields=fields, fields_exclude=[])\n", "        for seq_record in seq_records['items']:\n", "            tsvwriter.writerow([seq_record['uuid'],\n", "                                seq_record['indexTerms']['recordset'],\n", "                                to_csv_text(seq_record['data']['dwc:associatedSequences'])])" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " recordset uuid \\\n", "specimen uuid \n", "661df25d-8663-4dd2-a9fb-12941e55efe6 fcbcb214-cd62-4453-af56-b4b49161a261 \n",
"1b5e679a-ced4-45f7-83c6-d81ba72b6ea6 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "c6a0379d-6919-4624-a54f-0bf1c7e82318 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "01eb3840-f246-49aa-93ca-71400df1a044 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "ac37322c-276d-4fbc-80d8-680510a17d8c fcbcb214-cd62-4453-af56-b4b49161a261 \n", "\n", " associatedSequence field \n", "specimen uuid \n", "661df25d-8663-4dd2-a9fb-12941e55efe6 http://www.ncbi.nlm.nih.gov/nuccore/JN042326; ... \n", "1b5e679a-ced4-45f7-83c6-d81ba72b6ea6 http://www.ncbi.nlm.nih.gov/nuccore/U65280 \n", "c6a0379d-6919-4624-a54f-0bf1c7e82318 http://www.ncbi.nlm.nih.gov/nuccore/GQ175238 \n", "01eb3840-f246-49aa-93ca-71400df1a044 http://www.ncbi.nlm.nih.gov/nuccore/U65270 \n", "ac37322c-276d-4fbc-80d8-680510a17d8c http://www.ncbi.nlm.nih.gov/nuccore/GQ175236 \n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "columns = [\"specimen uuid\", \"recordset uuid\",\"associatedSequence field\"]\n", "idigbio_sequences = pd.read_csv('idigbio_seqs_and_recordsets.tsv', names=columns, sep=\"\\t\", index_col=0)\n", "print idigbio_sequences.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import re\n", "\n", "# Accession-like token: 1-2 letters, an optional '-' then an optional '_', then 5-6 digits.\n", "# Compiled once here instead of on every call.\n", "GB_ACCESSION_PATTERN = re.compile(r\"[a-zA-Z]{1,2}\\-?_?\\d{5,6}\")\n", "\n", "\n", "def pull_out_gb_accessions_and_count(seq_string):\n", "    \"\"\"Extract accession-like tokens from one associatedSequences value.\n", "\n", "    Returns (accessions, count): the matches joined with '|' and their\n", "    number, or (np.nan, 0) when nothing matches or the value is not text\n", "    (for example a float NaN standing in for a missing field).\n", "    \"\"\"\n", "    try:\n", "        matching_accessions = GB_ACCESSION_PATTERN.findall(seq_string)\n", "    except TypeError:\n", "        # Non-string input cannot be searched; report it as having no matches.\n", "        return np.nan, 0\n", "    if matching_accessions:\n", "        return \"|\".join(matching_accessions), len(matching_accessions)\n", "    return np.nan, 0" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "idigbio_sequences['extracted_accessions'], idigbio_sequences['sequence_count'] = \\\n", " zip(*idigbio_sequences['associatedSequence field'].map(pull_out_gb_accessions_and_count))" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout",
"output_type": "stream", "text": [ " recordset uuid \\\n", "specimen uuid \n", "661df25d-8663-4dd2-a9fb-12941e55efe6 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "1b5e679a-ced4-45f7-83c6-d81ba72b6ea6 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "c6a0379d-6919-4624-a54f-0bf1c7e82318 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "01eb3840-f246-49aa-93ca-71400df1a044 fcbcb214-cd62-4453-af56-b4b49161a261 \n", "ac37322c-276d-4fbc-80d8-680510a17d8c fcbcb214-cd62-4453-af56-b4b49161a261 \n", "\n", " associatedSequence field \\\n", "specimen uuid \n", "661df25d-8663-4dd2-a9fb-12941e55efe6 http://www.ncbi.nlm.nih.gov/nuccore/JN042326; ... \n", "1b5e679a-ced4-45f7-83c6-d81ba72b6ea6 http://www.ncbi.nlm.nih.gov/nuccore/U65280 \n", "c6a0379d-6919-4624-a54f-0bf1c7e82318 http://www.ncbi.nlm.nih.gov/nuccore/GQ175238 \n", "01eb3840-f246-49aa-93ca-71400df1a044 http://www.ncbi.nlm.nih.gov/nuccore/U65270 \n", "ac37322c-276d-4fbc-80d8-680510a17d8c http://www.ncbi.nlm.nih.gov/nuccore/GQ175236 \n", "\n", " extracted_accessions \\\n", "specimen uuid \n", "661df25d-8663-4dd2-a9fb-12941e55efe6 JN042326|JN042327|JN042415|JN042723|JN042724|J... 
\n", "1b5e679a-ced4-45f7-83c6-d81ba72b6ea6 U65280 \n", "c6a0379d-6919-4624-a54f-0bf1c7e82318 GQ175238 \n", "01eb3840-f246-49aa-93ca-71400df1a044 U65270 \n", "ac37322c-276d-4fbc-80d8-680510a17d8c GQ175236 \n", "\n", " sequence_count \n", "specimen uuid \n", "661df25d-8663-4dd2-a9fb-12941e55efe6 9 \n", "1b5e679a-ced4-45f7-83c6-d81ba72b6ea6 1 \n", "c6a0379d-6919-4624-a54f-0bf1c7e82318 1 \n", "01eb3840-f246-49aa-93ca-71400df1a044 1 \n", "ac37322c-276d-4fbc-80d8-680510a17d8c 1 \n" ] } ], "source": [ "print idigbio_sequences.head()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "34185\n" ] } ], "source": [ "print idigbio_sequences['sequence_count'].sum()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAtEAAAE9CAYAAADNpz5jAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3XtwlFWe//FPp9sICaHbDjchMoBhRgkSoiGIUYiM5a4X\nFFO7qVqXVQM4I6gMGXe5RNnCdR1lxpABJsosqOMFd5edkszWLlXOzhAYNqxLYMgiGcTE4FgpDCHd\nnXAzl0769weV/iWEdDj9JDz0+H5VWZV+0t98z0kO3Z9+fPq0IxQKhQQAAADgssXZPQAAAAAg1hCi\nAQAAAEOEaAAAAMAQIRoAAAAwRIgGAAAADBGiAQAAAEMRQ/Trr7+uJ598Us8991z42NmzZ/XSSy/p\nBz/4gf7xH/9R586dC39vx44dWrZsmZYvX67/+7//Cx+vra3Vc889p2XLluntt98OH29vb1dxcbGW\nLVum559/XqdOnRqQSVVVVX2jau3szZxjo9bO3sw5Nmrt7B2LtXb2Zs6xUWtnb+Z8ZWojhui7775b\nhYWFPY6VlpZq2rRp2rBhg6ZOnarS0lJJUl1dnfbt26f169ersLBQW7duVdcW1Fu2bNGSJUu0ceNG\n1dfXq7KyUpK0a9cuJSUlaePGjXrggQe0bdu2qCZxsVj7I1ittbM3c46NWjt7M+fYqLWzdyzW2tmb\nOcdGrZ29mfOVqY0Yom+++WYlJib2OHbgwAHNmTNHkpSTk6OKigpJUkVFhbKzs+VyuTRq1CiNGTNG\n1dXVCgQCamlpUWpqqiRp9uzZ2r9/f6+fNXPmTH3yySdRTQIAAAC4koyviW5ubpbH45Ekud1uNTc3\nS5ICgYCSk5PD90tOTpbf71cgEJDX6w0f93q98vv9kiS/3x+ucTqdSkhI0NmzZ6OfDQAAAHAFOPr7\n2O+GhgatW7dORUVFkqT8/Pwe1zV33X7rrbc0efJk3XXXXZKkzZs3a/r06Ro1apS2bdumNWvWSJKO\nHj2qX/3qV1q1apWee+45Pf/88+GQ/eyzz+qVV17RsGHDeoyhqqqqx6n2vLy8AZg6AAAA0L/t27eH\nv05LS1NaWppcpj/E7XarqalJHo9HgUBAbrdb0oUzzD6fL3w/n8+n5OTkHmeeux/vqmlsbJTX61VH\nR4fOnz/fK0B3H2x3J06c6HOMSUlJOnPmjOnUYrbWzt7MOTZq7ezNnGOj1s7esVhrZ2/mHBu1dvZm\nzgNbO3bs2EuewDW+nCMzM1O7d++WJO3Zs0czZswIHy8vL1cwGFRDQ4Pq6+uVmpoqj8ejoUOHqrq6\nWqFQSHv37u1Rs2fPHknSxx9/rFtuucV0OAAAAMAVF/FM9E9/+lMdPXpUp0+f1pIlS5SXl6f58+er\nuLhYZWVlGjlypAoKCiRJKSkpmjVrlgoKCuR0OrVo0SI5HA5J0uLFi1VSUqK2tjZlZGRo+vTpkqS5\nc+dq06ZNWrZsmZKSkvSDH/xgkKcLAAAAWBcxRC9fvvySx7uub75Ybm6ucnNzex2fNGlS+Jrq7q65\n5hr98Ic/vJxxAgAAAFcNPrEQAAAAMGT8xsKrkavZr1DjyfDtVqdLzo5g+LZjxGgF3d5LlQIAAADG\n/iRCdKjxpNpeXdnn9+NXrZMI0QAAABggXM4BAAAAGCJEAwAAAIYI0QAAAIAhQjQAAABgiBANAAAA\nGCJEAwAAAIYI0QAAAIAhQjQAAABgiBANAAAAGCJEAwAAAIYI0QAAAIAhQjQAAABgiBANAAAAGCJE\nAwAAAIYI0QAAAIAhQjQAAABgiBANAAAAGCJEAwAAAIYI0QAAAIAhQjQAAABgiBANAAAAGCJEAwAA\nAIYI0QAAAIAhQjQAAABgiBANAAAAGCJEAwAAAIYI0QAAAIAhQjQAAABgiBANAAAAGCJEAwAAAIYI\n0QAAAIAhQjQAAABgiBANAAAAGCJEAwAAAIYI0QAAAIAhQjQAA
ABgiBANAAAAGCJEAwAAAIYI0QAA\nAIAhQjQAAABgiBANAAAAGCJEAwAAAIYI0QAAAIAhQjQAAABgyBVt4Y4dO7R37145HA6NHz9eS5cu\nVWtrq4qLi9XY2KiRI0eqoKBAiYmJ4fuXlZUpLi5O+fn5Sk9PlyTV1taqpKRE7e3tysjIUH5+/sDM\nDAAAABgkUZ2Jbmho0G9/+1utW7dORUVF6uzsVHl5uUpLSzVt2jRt2LBBU6dOVWlpqSSprq5O+/bt\n0/r161VYWKitW7cqFApJkrZs2aIlS5Zo48aNqq+vV2Vl5cDNDgAAABgEUYXohIQEOZ1Otba2qqOj\nQ62trfJ6vTpw4IDmzJkjScrJyVFFRYUkqaKiQtnZ2XK5XBo1apTGjBmj6upqBQIBtbS0KDU1VZI0\ne/Zs7d+/f4CmBgAAAAyOqC7nGDZsmObNm6elS5cqPj5e6enpmjZtmpqbm+XxeCRJbrdbzc3NkqRA\nIKDJkyeH65OTk+X3++VyueT1esPHvV6v/H6/lfkAAAAAgy6qEF1fX6///M//VElJiRISErR+/Xr9\n7ne/63Efh8MxIAOUpKqqKlVVVYVv5+XlKSkpKXy71Rl5Gk6nSwnd7h9JfHx8j59twq5aO3sz59io\ntbM3c46NWjt7x2Ktnb2Zc2zU2tmbOQ987fbt28Nfp6WlKS0tLboQXVtbq+985zvhhjNnztRnn30m\nj8ejpqYmeTweBQIBud1uSRfOMPt8vnC9z+dTcnJyrzPPPp+vx5npiwfb3ZkzZ8JfOzuCEcfb0RHs\ncf9IkpKSLvu+V0utnb2Zc2zU2tmbOcdGrZ29Y7HWzt7MOTZq7ezNnAe2NikpSXl5eb2OR3VN9Nix\nY1VdXa22tjaFQiEdPnxYKSkpuu2227R7925J0p49ezRjxgxJUmZmpsrLyxUMBtXQ0KD6+nqlpqbK\n4/Fo6NChqq6uVigU0t69e5WVlRXNkAAAAIArJqoz0RMmTNDs2bO1atUqORwOTZw4Uffcc49aWlpU\nXFyssrKy8BZ3kpSSkqJZs2apoKBATqdTixYtCl/usXjxYpWUlKitrU0ZGRmaPn36wM0OAAAAGARR\n7xP98MMP6+GHH+5xbNiwYVqzZs0l75+bm6vc3NxexydNmqSioqJohwEAAABccXxiIQAAAGCIEA0A\nAAAYIkQDAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAYIkQDAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAY\nIkQDAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAYIkQDAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAYIkQD\nAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAYIkQDAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAYIkQDAAAA\nhgjRAAAAgCFCNAAAAGCIEA0AAAAYIkQDAAAAhgjRAAAAgCFCNAAAAGCIEA0AAAAYctk9ALu5mv0K\nNZ4M3251uuTsCIZvO0aMVtDttWNoAAAAuEp940N0qPGk2l5d2ef341etkwjRAAAA6IbLOQAAAABD\nhGgAAADAECEaAAAAMESIBgAAAAwRogEAAABDhGgAAADAECEaAAAAMESIBgAAAAwRogEAAABDhGgA\nAADAECEaAAAAMOSKtvDcuXPavHmz6urqJElLly7V9ddfr+LiYjU2NmrkyJEqKChQYmKiJGnHjh0q\nKytTXFyc8vPzlZ6eLkmqra1VSUmJ2tvblZGRofz8/AGYFgAAADB4oj4T/fbbbysjI0PFxcV67bXX\nNG7cOJWWlmratGnasGGDpk6dqtLSUklSXV2d9u3bp/Xr16uwsFBbt25VKBSSJG3ZskVLlizRxo0b\nVV9fr8rKyoGZGQAAADBIogrR58+f16effqq5c+dKkpxOpxISEnTgwAHNmTNHkpSTk6OKigpJUkVF\nhbKzs+VyuTRq1CiNGTNG1
dXVCgQCamlpUWpqqiRp9uzZ2r9//0DMCwAAABg0UV3O0dDQoOHDh+v1\n11/XH//4R02cOFFPPPGEmpub5fF4JElut1vNzc2SpEAgoMmTJ4frk5OT5ff75XK55PV6w8e9Xq/8\nfr+V+QAAAACDLqoQ3dHRoePHj2vhwoVKTU3VL37xi/ClG10cDseADFCSqqqqVFVVFb6dl5enpKSk\n8O1WZ+RpOJ0uJXS7f3dWai8WHx/fY1wmrNTa2Zs5x0atnb2Zc2zU2tk7Fmvt7M2cY6PWzt7MeeBr\nt2/fHv46LS1NaWlp0YXo5ORkeb3e8GUYt99+u3bs2CGPx6OmpiZ5PB4FAgG53W5JF84w+3y+cL3P\n5wv/jO5nnn0+X48z0xcPtrszZ86Ev3Z2BCOOt6Mj2OP+3VmpvVhSUtJl33cga+3szZxjo9bO3sw5\nNmrt7B2LtXb2Zs6xUWtnb+Y8sLVJSUnKy8vrdTyqa6I9Ho9GjBihEydOSJIOHz6sG264Qbfddpt2\n794tSdqzZ49mzJghScrMzFR5ebmCwaAaGhpUX1+v1NRUeTweDR06VNXV1QqFQtq7d6+ysrKiGRIA\nAABwxUS9xV1+fr42bdqkYDCo0aNHa+nSpers7FRxcbHKysrCW9xJUkpKimbNmqWCggI5nU4tWrQo\nfLnH4sWLVVJSora2NmVkZGj69OkDMzMAAABgkEQdoidMmKBXXnml1/E1a9Zc8v65ubnKzc3tdXzS\npEkqKiqKdhgAAADAFccnFgIAAACGCNEAAACAIUI0AAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0\nAAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAA\nYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYIgQ\nDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYIgQDQAA\nABgiRAMAAACGCNEAAACAIUI0AAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYIgQDQAAABgi\nRAMAAACGCNEAAACAIUI0AAAAYIgQDQAAABgiRAMAAACGCNEAAACAIUI0AAAAYMhlpbizs1OrVq2S\n1+vVqlWrdPbsWRUXF6uxsVEjR45UQUGBEhMTJUk7duxQWVmZ4uLilJ+fr/T0dElSbW2tSkpK1N7e\nroyMDOXn51ufFQAAADCILJ2J3rlzp1JSUuRwOCRJpaWlmjZtmjZs2KCpU6eqtLRUklRXV6d9+/Zp\n/fr1Kiws1NatWxUKhSRJW7Zs0ZIlS7Rx40bV19ersrLS4pQAAACAwRV1iPb5fDp06JDmzp0bDsQH\nDhzQnDlzJEk5OTmqqKiQJFVUVCg7O1sul0ujRo3SmDFjVF1drUAgoJaWFqWmpkqSZs+erf3791ud\nEwAAADCoog7R77zzjhYsWKC4uP//I5qbm+XxeCRJbrdbzc3NkqRAIKDk5OTw/ZKTk+X3+xUIBOT1\nesPHvV6v/H5/tEMCAAAAroiorok+ePCghg8frokTJ6qqquqS9+m6xGMgVFVV9eiTl5enpKSk8O1W\nZ+RpOJ0uJXS7f3dWai8WHx/fY1wmrNTa2Zs5x0atnb2Zc2zU2tk7Fmvt7M2cY6PWzt7MeeBrt2/f\nHv46LS1NaWlp0YXoY8eO6eDBgzp06JDa29v19ddfa9OmTXK73WpqapLH41EgEJDb7ZZ04Qyzz+cL\n1/t8PiUnJ/c68+zz+Xqcmb54sN2dOXMm/LWzIxhxvB0dwR73785K7cWSkpIu+74DWWtnb+YcG7V2\n9mbOsVFrZ+9YrLWzN3OOjVo7ezPnga1NSkpSXl5er+NRXc7x6KOP6o033lBJSYmWL1+utLQ
0Pfvs\ns8rMzNTu3bslSXv27NGMGTMkSZmZmSovL1cwGFRDQ4Pq6+uVmpoqj8ejoUOHqrq6WqFQSHv37lVW\nVlY0QwIAAACuGEtb3HXpunRj/vz5Ki4uVllZWXiLO0lKSUnRrFmzVFBQIKfTqUWLFoVrFi9erJKS\nErW1tSkjI0PTp08fiCEBAAAAg8ZyiJ4yZYqmTJkiSRo2bJjWrFlzyfvl5uYqNze31/FJkyapqKjI\n6jAAAACAK4ZPLAQAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAh\nGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAA\nADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBE\niAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYA\nAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMueweQCxzNfsVajwZvt3qdMnZEQzfdowYraDb\na8fQAAAAMIgI0RaEGk+q7dWVfX4/ftU6iRANAADwJ4fLOQAAAABDhGgAAADAECEaAAAAMESIBgAA\nAAwRogEAAABDhGgAAADAECEaAAAAMBTVPtGNjY0qKSlRc3OzHA6Hvvvd7+r+++/X2bNnVVxcrMbG\nRo0cOVIFBQVKTEyUJO3YsUNlZWWKi4tTfn6+0tPTJUm1tbUqKSlRe3u7MjIylJ+fP3CzAwAAAAZB\nVGeiXS6XHn/8ca1fv14vv/yyPvroI9XV1am0tFTTpk3Thg0bNHXqVJWWlkqS6urqtG/fPq1fv16F\nhYXaunWrQqGQJGnLli1asmSJNm7cqPr6elVWVg7c7AAAAIBBEFWI9ng8mjBhgiRpyJAhGjdunPx+\nvw4cOKA5c+ZIknJyclRRUSFJqqioUHZ2tlwul0aNGqUxY8aourpagUBALS0tSk1NlSTNnj1b+/fv\nH4BpAQAAAIPH8jXRDQ0N+uKLLzR58mQ1NzfL4/FIktxut5qbmyVJgUBAycnJ4Zrk5GT5/X4FAgF5\nvf//Y7G9Xq/8fr/VIQEAAACDKqproru0tLSoqKhITzzxhIYOHdrjew6Hw9LAuquqqlJVVVX4dl5e\nnpKSksK3W52Rp+F0upTQ7f7d2VV7sfj4+B5zMmWlPhZr7ewdi7V29mbOsVFrZ+9YrLWzN3OOjVo7\nezPnga/dvn17+Ou0tDSlpaVFH6KDwaCKioo0e/ZsZWVlSbpw9rmpqUkej0eBQEBut1vShTPMPp8v\nXOvz+ZScnNzrzLPP5+txZvriwXZ35syZ8NfOjmDEsXZ0BHvcvzu7ai+WlJR02fcd6PpYrLWzdyzW\n2tmbOcdGrZ29Y7HWzt7MOTZq7ezNnAe2NikpSXl5eb2OR3U5RygU0ubNmzVu3Dg98MAD4eOZmZna\nvXu3JGnPnj2aMWNG+Hh5ebmCwaAaGhpUX1+v1NRUeTweDR06VNXV1QqFQtq7d284kAMAAABXq6jO\nRB87dkx79+7V+PHjtWLFCknSo48+qvnz56u4uFhlZWXhLe4kKSUlRbNmzVJBQYGcTqcWLVoUvtxj\n8eLFKikpUVtbmzIyMjR9+vQBmhoAAAAwOKIK0TfddJP+9V//9ZLfW7NmzSWP5+bmKjc3t9fxSZMm\nqaioKJphAAAAALbgEwsBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAA\nADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBE\niAYAAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAA
MEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYA\nAAAMEaIBAAAAQ4RoAAAAwBAhGgAAADBEiAYAAAAMEaIBAAAAQy67B/BN5mr2K9R4UpLU6nTJ2REM\nf88xYrSCbq9dQwMAAEAEhGgbhRpPqu3VlZf8XvyqdRIhGgAA4KrE5RwAAACAIUI0AAAAYIgQDQAA\nABgiRAMAAACGCNEAAACAIXbniFHdt8eTem6R19/2eJFqL6ceAADgm44QHaOsbI8XqfZy6gEAAL7p\nuJwDAAAAMESIBgAAAAwRogEAAABDhGgAAADAECEaAAAAMESIBgAAAAyxxR2MsMc0AAAAIRqG2GMa\nAACAEI0rrPuZbNOz2FY+pREAAGAgXRUhurKyUr/4xS/U2dmpuXPnav78+XYPCYNksD5psb9aLkMB\nAAADyfYQ3dnZqTfffFNr1qyR1+vV6tWrlZmZqZSUFLuHhj8hVi5DIYADAICL2R6ia2pqNGbMGI0a\nNUqSlJ2drQMHDhCicdUggAMAgIvZHqL9fr+Sk5PDt71er2pqamwcETBwrL4R08o15FbYFf6t9uW6\neQDAleIIhUIhOwfw8ccfq7KyUk899ZQk6Xe/+51qamq0cOHC8H2qqqpUVVUVvp2Xl3fFxwkAAIBv\npu3bt4e/TktLU1pamv0ftuL1euXz+cK3fT6fvN6eZ4vS0tKUl5cX/q8/3SdqKhZr7ezNnGOj1s7e\nzDk2au3sHYu1dvZmzrFRa2dv5jzwtd1zaFpamqSr4BMLb7zxRtXX16uhoUHBYFD79u1TZmam3cMC\nAAAA+mT7NdFOp1MLFy7Uyy+/HN7ijjcVAgAA4GrmXLt27Vq7B3H99dfrvvvu0/3336+bb755QH5m\n124f35RaO3sz59iotbM3c46NWjt7x2Ktnb2Zc2zU2tmbOQ9+re1vLAQAAABije3XRAMAAACxhhAN\nAAAAGCJEAwAAAIZs353Dqrq6Oh04cEB+v1/ShX2nMzMzB32Hj7q6OgUCAU2ePFlDhgwJH6+srNT0\n6dMj1n766acaNmyYUlJSVFVVpc8//1wTJ07ULbfcEtVYfvazn+mZZ54xrjt69Khqamo0fvx4paen\nR7zvZ599ppSUFCUkJKi1tVWlpaU6fvy4UlJSlJubq4SEhD5rd+7cqaysLI0YMcJ4jJLU3t6u8vJy\neb1eTZs2TXv37tWxY8eUkpKie+65Ry5X5GVcX1+v/fv3y+fzyeFwaOzYsbrzzjsjjhkAACCSmH5j\nYWlpqcrLy5WdnR3+gBafz6d9+/bpjjvu0COPPBLVzy0rK9Pdd9/d5/d37typjz76SOPGjdMXX3yh\nJ554QllZWZKkFStW6Mc//nGftR988IGqqqrU2dmptLQ0HT16VBkZGfrkk09022236aGHHoo4tldf\nfVUOh0Pd/2xVVVVKS0uTw+HQypV9f8T06tWr9corr0iSfvOb3+ijjz5SVlaWDh8+rFtvvTXi76ug\noECvvfaanE6nNm/erCFDhuj222/X4cOH9eWXX+pv//Zv+6x9/PHHde2112r06NG68847NWvWLA0f\nPjziPLvbsGGDOjs71draqsTERLW0tGjmzJk6fPiwJEV8AbFz504dPHhQU6ZM0e9//3tNnDhRCQkJ\nqqio0KJFizR16tTLHsc3VXNzs9xu9xXve+bMGSUlJV3xvlfKuXPnVFpaqoqKCjU3N0uS3G63ZsyY\nofnz5ysxMTGqn/ujH/1IhYWFEe9z/vx57dixQz6fT7feeqvuvPPO8Pe2bt2qxYsX91nb2NioX/7y\nl0pKStL8+fP1zjvv6PPPP9eECRP02GOPGa+Vy11f3U9QnDt3Tu+++274JMDjjz8uj8dj1Ndkfa1Y\nsUIzZ85Udna2xowZY9SnpqZG27Zt03XXXadHH31Ub7zxhmpqajR27Fh973vf08SJE/us/frrr/Xv\n//7v+t///V/5fD6
5XC6NHj1a9957r3JyciL2Haz1JfW/xlhfF1zuGrNrjXwTDfT6iukz0bt27dL6\n9et7nYmcN2+eCgoKog7R27dvjxiif/vb32rdunUaMmSIGhoatH79ep06dUoPPPBAvz+7oqJCr732\nmoLBoJ588klt3rxZCQkJeuihh1RYWNhviPb7/UpJSdHcuXMVFxenUCik2trafuskqaOjI/z1b37z\nG61Zs0bDhw/XvHnz9Pzzz/f7+3I6nZKk48ePa926dZKkm266SX/3d38XsW706NF69dVX9cknn6i8\nvFzbt2/XpEmTlJ2drZkzZ2ro0KER67/88ksVFRWpo6ND3//+9/Xzn/9cTqdTd911V8TwLl34W/3k\nJz9RXFycHnzwQf3oRz/Siy++qHvvvVfr1q3TT37ykz5r7Qo5Vp6AJGsPEmfPnu1xOxQKqbCwMPz3\nHjZsWJ+1Vp6E3n//fT300EMaPny4Pv/8cxUXF8vhcCgYDOqZZ54JfzpUX2Ix5BQXF2vq1Klau3at\n3G63HA6HAoGA9uzZo+LiYr3wwgt91tbW1vb5vePHj/c759dff13XX3+9br/9du3atUsff/yxli1b\npvj4eH322Wf91s6YMUMtLS16/vnnlZOTo7y8PFVUVGjLli0R/01aWV8ffPBBeH29++67uu6667Ry\n5Urt379f//RP/6QVK1b0WWt1fZ07d07nzp3Tiy++KI/Ho+zsbN1xxx29Pl33Ut58803l5eXp3Llz\neuGFF/T444/rhRde0JEjR7R161a9/PLLfdZu3LhRWVlZKiws1Mcff6yWlhbdcccd+vDDD3XixAk9\n+uijfdZaWV+StTX2TVtfkrU1ZtcasfOFlpXnCyuPu1bW16XEdIiOi4uT3+/vtbef3+9XXFzky72f\ne+65Pr/XtZj6EgqFwpdwjBo1SmvXrlVRUZFOnTql/k7su1wuOZ1OOZ1OjR49OnxJQXx8vBwOR8Ra\n6cKZ6J07d2rHjh1asGCBJk6cqGuuuUZTpkzpt7azs1Nnz55VKBRSZ2dn+GzwkCFD+v19paSkaNeu\nXZo7d66+9a1vqaamRqmpqTpx4kS/l1NIF/5W6enpSk9PVzAY1KFDh1ReXq733ntPb775ZsTaUCik\n9vZ2tba2qq2tTefPn1dSUpLa29v7/X1LUjAYVHx8fPhnSNKIESN6vKi4FLtCjpUnoK76aB8kFi1a\n1OuyG7/fr5UrV8rhcOhnP/tZn7VWnoQOHTqkBQsWSJLee+89LV++PLy+NmzYEH4S7EsshpxTp05p\n/vz5PY60x3T2AAAJH0lEQVRdd911mj9/vsrKyiKOefXq1X3+mz9//nzEWkk6efJkeB1kZWXpww8/\n1EsvvdTvC2Lpwtm1++67T5L061//OjyH++67T7t27YpYa2V9dVdbW6sf//jHcjgcevDBB7Vnz56I\n97e6vhITE/XYY4/pb/7mb3T06FGVl5dr5cqVSklJUXZ2tu65554+azs6OpSRkSFJ2rZtm2bNmiVJ\nuuWWW/Tuu+9G7Hvq1KnwCZ0HH3xQq1at0l/8xV9o6dKlKigoGLT1JVlbY9+09SVZW2N2rRE7X2hZ\neb6w8rhrZX1dSkyH6CeeeEIvvfSSxowZo+TkZEkXLueor6/XwoULI9aePn1ahYWFl3yltWbNmoi1\nbrdbX3zxhSZMmCDpQghduXKlNm/erC+//DJi7TXXXKPW1lZde+21Pf5RnTt3rt8gKyl8RnXWrFl6\n5513NHz4cHV2dvZbJ1149dZ1uUfXP5brrrtOX3/9db+1Tz31lN5++219+OGHGj58uNasWSOv16sR\nI0bo+9///mX17+JyuTRjxoxw0OvPXXfdpYKCAjmdTj322GP6+7//e33nO99RdXW15syZE7H2u9/9\nrlavXq3Jkyfr6NGjevjhhyVdeKHU3/9msyvkWHkCkqw9SCxYsECHDx/WggUL9K1vf
UuS9PTTT6uk\npOSyencxfRLq7OxUMBiUy+VSe3u7UlNTJUljx45VMBjst18shpwRI0boV7/6lebMmRM+69LU1KTd\nu3f3+/6BcePG6cknn9TYsWN7fW/JkiURa6ULLyw7OzvDjzm5ubnyer1au3btZf2b7DJ79uwet/t7\nLLKyvk6fPq3/+I//UCgU6vVvqL8X01bXVxeHw6EpU6ZoypQpWrhwoT755BPt27cv4vpyOp2qrKwM\nj3n//v3KysrSH/7wB11zzTUR+1177bU6evSobr75ZlVUVIQfsy7nucLK+pKsrbFv2vrqGlu0a8yu\nNWLnC63uTJ8vrDzudme6vi4lpkP09OnTtWHDBtXU1PR4Y+GNN94YvvSgLxkZGWppabnk/6rt71MT\nn3766V5nX10ul55++umID6aStHbtWsXHx0vqucg7Ojr09NNPR6ztLjk5WT/84Q918ODBy36DXF8P\nJHFxcf0GtMTERD3zzDM6f/68Ghoa1NHRoeTk5Mu6Tmz58uV9fq/7mzL78sgjjyg7O1sJCQkaNmyY\npk6dqtraWv35n/95+IVMX+6//35NnTpVJ06c0Lx58zRu3DhJF14IvfjiixFr7Qo5A/UEJJk/SMyb\nN0+zZs3Su+++K6/Xq7y8vMvuZeVJ6N5779Urr7yiRx55ROnp6Xr77bc1c+ZMHTlypN+/cXexFHIK\nCgpUWlqqtWvX9vhfqZmZmSooKIhY+5d/+Zd9/k7z8/P77X3rrbfqyJEjmjZtWvhYTk6OPB6P3nrr\nrYi1t912m77++msNHTpUf/VXfxU+/tVXX11yvXdnZX3NnTs3/II/JydHp0+fltvtViAQ6HeNWF1f\n119/fa9jTqdT06dP7/eN5IsWLdL7778vj8ejf/iHf9DmzZtVUlKiMWPG6Hvf+17E2ieffFI///nP\n9dVXX+mGG24IP3acPn1af/Znfxax1sr6kqytscFYX/X19basr6ampstaI1bW2KJFi7Rt2za53e4r\nukbsfKFl5fnCyuOulfV1KTH9xkJgMJ09e1alpaU6cOBAryeh+fPnR7y+7n/+5380fvz4cGjvriuk\n9eW9995Tenp6jycg6cI1ZG+99ZY2btwYcdz/8i//oocffrjXteZfffWVPvjgg4iXMnVXUVGhHTt2\n6NSpU9qyZUu/99++fXuPS5LuvffecMjZtm1bvzvIHDlyRL/+9a9VX18ffpE2Y8YM3X333f1eMvTT\nn/404ou1SGpqasIh56//+q+1efNm1dTUhJ/Abrzxxj5rv/jii15PYGPHjtXp06f13//937r//vsj\n9q6rq5Pf79fkyZN7/L0uZ5efvnYIOnToUPjM+mDVW+0tma+vrr5+v1/f/va3jfseOXJE//Vf/6Wv\nvvrKeH119Y52RyYr4+5rjVxO7aeffqrExETdcMMNOnLkiGpra412grKyk5RdtV0qKipUWlqqhoaG\ny15f3X9f0fS1ssYu1XvChAm9ngcGctxWnuMka89zXc8XoVBIDoejx/PF+++/r2effbbPWquPu92Z\n7FJ2KYRoIAr97eASSde15Ve6Npr61tZWnTx5UuPHj7dtzlb6Wq0fzDlb2eXHSq3dvbuHUYfDEV5f\n/YXCgegb7QsWu35fdu4EZaXerlqpZwD//e9/r+PHj+vb3/52v4HSat+u3tGEWTt/XxfbtGlTxADb\nn2i33L1StVZ2KbuUmL6cA7BLfzu4RPJv//ZvUQdKK7XR1F977bUaP368JPvmbKWv1frBnPPFu/x0\nvTn5cnb5sbJDkNV6K7XdQ+Ebb7zRIxT+8z//c8QQPVB9N2/e3KNv9zc4DUZvu2qt7gRlpd6u2r4C\n5S9/+UsdP3580Pra2dtKbV9b5nYdj7RlbqT6s2fP9ltvV63VXcouRogG+hDpsoempqarsnYwe/e3\na41dfa3W2/W3uniXnxdffPGyd/mxskOQ1Xort
VZCoV19rfa2q9bqTlBW6u2qtRIorf6+7OptpdbK\nlrlW6yPV9re2rfS1skvZpRCigT5Y2cHFrlo7ezNns1oru/xYqbWzt5VQaFdfq73tqrW6E5SVertq\nrQRKq78vu3pbqbWyZa7VertqrexSdkkhAJdUUlIS+sMf/nDJ7xUXF1+VtXb2Zs5mtadOnQoFAoFe\nxzs7O0NHjx4dtFo7e69duzZ0/PjxHsfa29tDmzZtCuXl5V2Vfa32tqu2tbX1ksebm5tDf/zjHyPW\nWq23q3b16tWhlpaWUCgUCnV0dISPnz17NrRixYpB62tnb6vjDoVCocbGxlBRUVFoy5Ytoaeeeuqy\nagaq3q7ai7W0tIROnjxpXMcbCwEAV0RjY6NcLlevrTFDoZCOHTumm2666U+qL66stra28Bay3Z0+\nfVpNTU3h93f8qfUeKAcPHtSxY8cue5/lgay3q9YqQjQAAABgyPwqagAAAOAbjhANAAAAGCJEAwAA\nAIYI0QAAAICh/wdIVqEe1jYQvwAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "%matplotlib inline\n", "\n", "from matplotlib import pyplot as plt\n", "plt.rcParams['figure.figsize'] = 12, 5\n", "\n", "plt.style.use('ggplot')\n", "\n", "vc = idigbio_sequences['sequence_count'].value_counts()\n", "vc = vc.sort_index()\n", "\n", "vc.plot(kind='bar')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.9" } }, "nbformat": 4, "nbformat_minor": 0 }