{ "metadata": { "name": "", "signature": "sha256:22f4a1e046ea198271b43dbd7f96a82ad80996a9c4a0f6a3a49a6925bfa7ca25" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "from __future__ import unicode_literals\n", "import json\n", "import numpy as np\n", "import pandas as pd\n", "from pandas import DataFrame, Series" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 1 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Importing data dumped from the API" ] }, { "cell_type": "code", "collapsed": false, "input": [ "with open('biotech500.json', 'rb') as fp:\n", " data = json.load(fp)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 2 }, { "cell_type": "code", "collapsed": false, "input": [ "print data.keys()\n", "print data['response'].keys()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[u'response']\n", "[u'start', u'maxScore', u'numFound', u'docs']\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "len(data['response']['docs'])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 4, "text": [ "500" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "print data['response']['docs'][0].keys()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[u'journal', u'article_type', u'score', u'publication_date', u'eissn', u'id']\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "for k in data['response']['docs'][0].keys():\n", " print data['response']['docs'][0][k], type(data['response']['docs'][0][k])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "PLoS ONE \n", "Research Article \n", "1.2119352 \n", "2014-01-29T00:00:00Z \n", "1932-6203 \n", "10.1371/journal.pone.0086174/title \n" ] } ], "prompt_number": 6 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## DataFraming" ] }, { "cell_type": "code", "collapsed": false, "input": [ "articles_list = data['response']['docs']\n", "articles = DataFrame(articles_list)\n", "articles.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abstractarticle_typeauthor_displayeissnidjournalpublication_datescoretitle_display
0 NaN Research Article NaN 1932-6203 10.1371/journal.pone.0086174/title PLoS ONE 2014-01-29T00:00:00Z 1.211935 NaN
1 NaN Research Article NaN 1932-6203 10.1371/journal.pone.0086174/abstract PLoS ONE 2014-01-29T00:00:00Z 1.211935 NaN
2 NaN Research Article NaN 1932-6203 10.1371/journal.pone.0086174/references PLoS ONE 2014-01-29T00:00:00Z 1.211935 NaN
3 NaN Research Article NaN 1932-6203 10.1371/journal.pone.0086174/body PLoS ONE 2014-01-29T00:00:00Z 1.211935 NaN
4 NaN Research Article NaN 1932-6203 10.1371/journal.pone.0086174/introduction PLoS ONE 2014-01-29T00:00:00Z 1.211935 NaN
\n", "

5 rows \u00d7 9 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 7, "text": [ " abstract article_type author_display eissn \\\n", "0 NaN Research Article NaN 1932-6203 \n", "1 NaN Research Article NaN 1932-6203 \n", "2 NaN Research Article NaN 1932-6203 \n", "3 NaN Research Article NaN 1932-6203 \n", "4 NaN Research Article NaN 1932-6203 \n", "\n", " id journal publication_date \\\n", "0 10.1371/journal.pone.0086174/title PLoS ONE 2014-01-29T00:00:00Z \n", "1 10.1371/journal.pone.0086174/abstract PLoS ONE 2014-01-29T00:00:00Z \n", "2 10.1371/journal.pone.0086174/references PLoS ONE 2014-01-29T00:00:00Z \n", "3 10.1371/journal.pone.0086174/body PLoS ONE 2014-01-29T00:00:00Z \n", "4 10.1371/journal.pone.0086174/introduction PLoS ONE 2014-01-29T00:00:00Z \n", "\n", " score title_display \n", "0 1.211935 NaN \n", "1 1.211935 NaN \n", "2 1.211935 NaN \n", "3 1.211935 NaN \n", "4 1.211935 NaN \n", "\n", "[5 rows x 9 columns]" ] } ], "prompt_number": 7 }, { "cell_type": "code", "collapsed": false, "input": [ "# We got more abstracts this time.\n", "articles = articles[articles['abstract'].notnull()]\n", "print len(articles)\n", "articles.head()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "60\n" ] }, { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
abstractarticle_typeauthor_displayeissnidjournalpublication_datescoretitle_display
7 [\\nThe objective of this paper is to assess th... Research Article [Latifah Amin, Md. Abul Kalam Azad, Mohd Hanaf... 1932-6203 10.1371/journal.pone.0086174 PLoS ONE 2014-01-29T00:00:00Z 1.211935 Determinants of Public Attitudes to Geneticall...
16 [\\n Atrazine (ATZ) and S-metolachlor (S... Research Article [Cristina A. Viegas, Catarina Costa, Sandra An... 1932-6203 10.1371/journal.pone.0037140 PLoS ONE 2012-05-15T00:00:00Z 1.119538 Does <i>S</i>-Metolachlor Affect the Performan...
17 [\\nDue to environmental persistence and biotox... Research Article [Yonggang Yang, Meiying Xu, Zhili He, Jun Guo,... 1932-6203 10.1371/journal.pone.0070686 PLoS ONE 2013-08-05T00:00:00Z 1.119538 Microbial Electricity Generation Enhances Deca...
34 [\\n Intensive use of chlorpyrifos has r... Research Article [Shaohua Chen, Chenglan Liu, Chuyan Peng, Hong... 1932-6203 10.1371/journal.pone.0047205 NaN 2012-10-08T00:00:00Z 1.119538 Biodegradation of Chlorpyrifos and Its Hydroly...
35 [Background: The complex characteristics and u... Research Article [Zhongbo Zhou, Fangang Meng, So-Ryong Chae, Gu... 1932-6203 10.1371/journal.pone.0042270 NaN 2012-08-09T00:00:00Z 0.989541 Microbial Transformation of Biomacromolecules ...
\n", "

5 rows \u00d7 9 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 8, "text": [ " abstract article_type \\\n", "7 [\\nThe objective of this paper is to assess th... Research Article \n", "16 [\\n Atrazine (ATZ) and S-metolachlor (S... Research Article \n", "17 [\\nDue to environmental persistence and biotox... Research Article \n", "34 [\\n Intensive use of chlorpyrifos has r... Research Article \n", "35 [Background: The complex characteristics and u... Research Article \n", "\n", " author_display eissn \\\n", "7 [Latifah Amin, Md. Abul Kalam Azad, Mohd Hanaf... 1932-6203 \n", "16 [Cristina A. Viegas, Catarina Costa, Sandra An... 1932-6203 \n", "17 [Yonggang Yang, Meiying Xu, Zhili He, Jun Guo,... 1932-6203 \n", "34 [Shaohua Chen, Chenglan Liu, Chuyan Peng, Hong... 1932-6203 \n", "35 [Zhongbo Zhou, Fangang Meng, So-Ryong Chae, Gu... 1932-6203 \n", "\n", " id journal publication_date score \\\n", "7 10.1371/journal.pone.0086174 PLoS ONE 2014-01-29T00:00:00Z 1.211935 \n", "16 10.1371/journal.pone.0037140 PLoS ONE 2012-05-15T00:00:00Z 1.119538 \n", "17 10.1371/journal.pone.0070686 PLoS ONE 2013-08-05T00:00:00Z 1.119538 \n", "34 10.1371/journal.pone.0047205 NaN 2012-10-08T00:00:00Z 1.119538 \n", "35 10.1371/journal.pone.0042270 NaN 2012-08-09T00:00:00Z 0.989541 \n", "\n", " title_display \n", "7 Determinants of Public Attitudes to Geneticall... \n", "16 Does S-Metolachlor Affect the Performan... \n", "17 Microbial Electricity Generation Enhances Deca... \n", "34 Biodegradation of Chlorpyrifos and Its Hydroly... \n", "35 Microbial Transformation of Biomacromolecules ... \n", "\n", "[5 rows x 9 columns]" ] } ], "prompt_number": 8 }, { "cell_type": "code", "collapsed": false, "input": [ "# Abstracts and authors are lists\n", "print type(articles.ix[7,0]), len(articles.ix[7,0])\n", "print type(articles.ix[7,2]), len(articles.ix[7,2])\n", "\n", "DataFrame([articles.abstract.apply(len), articles.author_display.apply(len)])" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ " 1\n", " 4\n" ] }, { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
71617343543585966798291100106115131132141158161
abstract 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...
author_display 4 6 6 6 8 3 2 24 7 7 3 11 2 8 3 4 3 8 6 11...
\n", "

2 rows \u00d7 60 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 9, "text": [ " 7 16 17 34 35 43 58 59 66 79 82 91 \\\n", "abstract 1 1 1 1 1 1 1 1 1 1 1 1 \n", "author_display 4 6 6 6 8 3 2 24 7 7 3 11 \n", "\n", " 100 106 115 131 132 141 158 161 \n", "abstract 1 1 1 1 1 1 1 1 ... \n", "author_display 2 8 3 4 3 8 6 11 ... \n", "\n", "[2 rows x 60 columns]" ] } ], "prompt_number": 9 }, { "cell_type": "code", "collapsed": false, "input": [ "# Are they all from PLoS ONE? ... We can tell by eISSN == 1932-6203 \n", "# ... http://www.plosone.org/static/information.action\n", "len(articles[articles.eissn == '1932-6203'])" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 10, "text": [ "54" ] } ], "prompt_number": 10 }, { "cell_type": "code", "collapsed": false, "input": [ "# If we care, we can fill in the missing journals based on the eISSN.\n", "articles[articles.eissn != '1932-6203'].ix[:,['eissn', 'journal']]" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eissnjournal
100 1545-7885 PLoS Biology
193 1549-1676 PLoS Medicine
208 1549-1676 PLoS Medicine
230 1549-1676 NaN
235 1549-1676 PLoS Medicine
240 1549-1676 PLoS Medicine
\n", "

6 rows \u00d7 2 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 11, "text": [ " eissn journal\n", "100 1545-7885 PLoS Biology\n", "193 1549-1676 PLoS Medicine\n", "208 1549-1676 PLoS Medicine\n", "230 1549-1676 NaN\n", "235 1549-1676 PLoS Medicine\n", "240 1549-1676 PLoS Medicine\n", "\n", "[6 rows x 2 columns]" ] } ], "prompt_number": 11 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cleaning up abstract texts without combining them all" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import nltk\n", "from nltk.corpus import stopwords\n", "import string\n", "\n", "# Globally define a set of stopwords. We can add sciency stuff to it as well.\n", "stops = set(stopwords.words('english'))\n", "stops.add('conclusions') # just an example\n", "\n", "def wordify(abs_list):\n", " '''Take the abstract field from PLoS API and convert it to a filtered list of words.'''\n", " \n", " # Make it a string.\n", " text = ' '.join(abs_list).strip(' \\n')\n", " \n", " if text == '':\n", " return np.nan\n", " \n", " else:\n", " # Remove punctuation & replace with space, \n", " # because we want 'metal-contaminated' => 'metal contaminated'\n", " # ...not 'metalcontaminated', and so on.\n", " for c in string.punctuation:\n", " text = text.replace(c, ' ')\n", "\n", " # Now make it a Series of words, and do some cleaning.\n", " words = Series(text.split(' '))\n", " # aseries.apply(lambda s: s.strip()) # should be unnecessary: split should already do this.\n", " words = words.str.lower()\n", " words = words[words.str.len() > 1]\n", " words = words[~words.str.contains(r'[^#@a-z]')] # What exactly does this do?\n", " \n", " # Filter globally-defined stopwords\n", " ignore = stops & set(words.unique())\n", " words_out = [w for w in words.tolist() if w not in ignore]\n", " \n", " return words_out" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 12 }, { "cell_type": "code", "collapsed": false, "input": [ "# Test\n", "test_abstract = articles.ix[16, 0]\n", "wordify(test_abstract)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 13, "text": [ "[u'atrazine',\n", " u'atz',\n", " u'metolachlor',\n", " u'met',\n", " u'two',\n", " u'herbicides',\n", " u'widely',\n", " u'used',\n", " u'often',\n", " u'mixtures',\n", " u'present',\n", " u'work',\n", " u'examined',\n", " u'whether',\n", " u'presence',\n", " u'met',\n", " u'affects',\n", " u'atz',\n", " u'biodegradation',\n", " u'activity',\n", " u'bioaugmentation',\n", " u'bacterium',\n", " u'pseudomonas',\n", " u'sp',\n", " u'strain',\n", " u'adp',\n", " u'crop',\n", " u'soil',\n", " u'met',\n", " u'concentrations',\n", " u'selected',\n", " u'relevance',\n", " u'worst',\n", " u'case',\n", " u'scenarios',\n", " u'soil',\n", " u'contamination',\n", " u'commercial',\n", " u'formulation',\n", " u'containing',\n", " u'herbicides',\n", " u'concentrations',\n", " u'representative',\n", " u'application',\n", " u'high',\n", " u'doses',\n", " u'formulation',\n", " u'soil',\n", " u'corresponding',\n", " u'dose',\n", " u'approximately',\n", " u'higher',\n", " u'recommended',\n", " u'field',\n", " u'dose',\n", " u'rd',\n", " u'presence',\n", " u'pure',\n", " u'met',\n", " u'significantly',\n", " u'affected',\n", " u'neither',\n", " u'bacteria',\n", " u'survival',\n", " u'initial',\n", " u'viable',\n", " u'cells',\n", " u'soil',\n", " u'atz',\n", " u'mineralization',\n", " u'activity',\n", " u'consistently',\n", " u'biodegradation',\n", " u'experiments',\n", " u'larger',\n", " u'soil',\n", " u'microcosms',\n", " u'spiked',\n", " u'double',\n", " u'formulation',\n", " u'inoculated',\n", " u'bacterium',\n", " u'revealed',\n", " u'atz',\n", " u'rapidly',\n", " u'days',\n", " u'extensively',\n", " u'removed',\n", " u'soil',\n", " u'days',\n", " u'concentration',\n", " u'met',\n", " u'decreased',\n", " u'moderately',\n", " u'initial',\n", " u'inoculated',\n", " u'non',\n", " u'inoculated',\n", " u'microcosms',\n", " u'concomitantly',\n", " u'accumulation',\n", " u'two',\n", " u'metabolites',\n", " u'met',\n", " u'ethanesulfonic',\n", " u'acid',\n", " u'met',\n", " u'oxanilic',\n", " u'acid',\n", " u'found',\n", " u'despite',\n", " u'dissipation',\n", " u'almost',\n", " u'atz',\n", " u'treated',\n", " u'soils',\n", " u'respective',\n", " u'eluates',\n", " u'still',\n", " u'highly',\n", " u'toxic',\n", " u'aquatic',\n", " u'microalgae',\n", " u'species',\n", " u'toxic',\n", " u'untreated',\n", " u'soil',\n", " u'suggest',\n", " u'high',\n", " u'toxicity',\n", " u'may',\n", " u'due',\n", " u'met',\n", " u'metabolites',\n", " u'remaining',\n", " u'soil']" ] } ], "prompt_number": 13 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Applying it to the whole DataFrame of articles" ] }, { "cell_type": "code", "collapsed": false, "input": [ "articles['words'] = articles.apply(lambda s: wordify(s['abstract'] + [s['title_display']]), axis=1)\n", "articles.drop(['article_type', 'score', 'title_display', 'abstract'], axis=1, inplace=True)\n", "articles.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
author_displayeissnidjournalpublication_datewords
7 [Latifah Amin, Md. Abul Kalam Azad, Mohd Hanaf... 1932-6203 10.1371/journal.pone.0086174 PLoS ONE 2014-01-29T00:00:00Z [objective, paper, assess, attitude, malaysian...
16 [Cristina A. Viegas, Catarina Costa, Sandra An... 1932-6203 10.1371/journal.pone.0037140 PLoS ONE 2012-05-15T00:00:00Z [atrazine, atz, metolachlor, met, two, herbici...
17 [Yonggang Yang, Meiying Xu, Zhili He, Jun Guo,... 1932-6203 10.1371/journal.pone.0070686 PLoS ONE 2013-08-05T00:00:00Z [due, environmental, persistence, biotoxicity,...
34 [Shaohua Chen, Chenglan Liu, Chuyan Peng, Hong... 1932-6203 10.1371/journal.pone.0047205 NaN 2012-10-08T00:00:00Z [intensive, use, chlorpyrifos, resulted, ubiqu...
35 [Zhongbo Zhou, Fangang Meng, So-Ryong Chae, Gu... 1932-6203 10.1371/journal.pone.0042270 NaN 2012-08-09T00:00:00Z [background, complex, characteristics, unclear...
\n", "

5 rows \u00d7 6 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 14, "text": [ " author_display eissn \\\n", "7 [Latifah Amin, Md. Abul Kalam Azad, Mohd Hanaf... 1932-6203 \n", "16 [Cristina A. Viegas, Catarina Costa, Sandra An... 1932-6203 \n", "17 [Yonggang Yang, Meiying Xu, Zhili He, Jun Guo,... 1932-6203 \n", "34 [Shaohua Chen, Chenglan Liu, Chuyan Peng, Hong... 1932-6203 \n", "35 [Zhongbo Zhou, Fangang Meng, So-Ryong Chae, Gu... 1932-6203 \n", "\n", " id journal publication_date \\\n", "7 10.1371/journal.pone.0086174 PLoS ONE 2014-01-29T00:00:00Z \n", "16 10.1371/journal.pone.0037140 PLoS ONE 2012-05-15T00:00:00Z \n", "17 10.1371/journal.pone.0070686 PLoS ONE 2013-08-05T00:00:00Z \n", "34 10.1371/journal.pone.0047205 NaN 2012-10-08T00:00:00Z \n", "35 10.1371/journal.pone.0042270 NaN 2012-08-09T00:00:00Z \n", "\n", " words \n", "7 [objective, paper, assess, attitude, malaysian... \n", "16 [atrazine, atz, metolachlor, met, two, herbici... \n", "17 [due, environmental, persistence, biotoxicity,... \n", "34 [intensive, use, chlorpyrifos, resulted, ubiqu... \n", "35 [background, complex, characteristics, unclear... \n", "\n", "[5 rows x 6 columns]" ] } ], "prompt_number": 14 }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Colin's NLP stuff\n", "\n", "I substituted the variables Colin used with the data derived from the `articles` DF.\n", "\n", "If we use this method, it might be better to rewrite the NLP and cloud-generating code such that we are not interconverting among data structures so frequently (DF, list, string, ...)." ] }, { "cell_type": "code", "collapsed": false, "input": [ "abs_df = DataFrame(articles['words'].apply(lambda x: ' '.join(x)).tolist(), columns=['text'])\n", "abs_df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
0 objective paper assess attitude malaysian stak...
1 atrazine atz metolachlor met two herbicides wi...
2 due environmental persistence biotoxicity poly...
3 intensive use chlorpyrifos resulted ubiquitous...
4 background complex characteristics unclear bio...
\n", "

5 rows \u00d7 1 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 15, "text": [ " text\n", "0 objective paper assess attitude malaysian stak...\n", "1 atrazine atz metolachlor met two herbicides wi...\n", "2 due environmental persistence biotoxicity poly...\n", "3 intensive use chlorpyrifos resulted ubiquitous...\n", "4 background complex characteristics unclear bio...\n", "\n", "[5 rows x 1 columns]" ] } ], "prompt_number": 15 }, { "cell_type": "code", "collapsed": false, "input": [ "abs_set_df = DataFrame(articles['words'].apply(lambda x: ' '.join(set(x))).tolist(), columns=['text'])\n", "abs_set_df.head()" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
text
0 among developed attitude paper identify accept...
1 aquatic mineralization dose experiments still ...
2 mfc hypothesized distinctly results nitrogen s...
3 fungal contaminant tcp accumulative gc morphol...
4 origin humic mineralization show mainly result...
\n", "

5 rows \u00d7 1 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 16, "text": [ " text\n", "0 among developed attitude paper identify accept...\n", "1 aquatic mineralization dose experiments still ...\n", "2 mfc hypothesized distinctly results nitrogen s...\n", "3 fungal contaminant tcp accumulative gc morphol...\n", "4 origin humic mineralization show mainly result...\n", "\n", "[5 rows x 1 columns]" ] } ], "prompt_number": 16 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Common word pairs\n", "\n", "This section uses all words from abstracts to find the common word pairs." ] }, { "cell_type": "code", "collapsed": false, "input": [ "#include all words from abstracts for getting common word pairs\n", "words_all = pd.Series(' '.join(abs_df['text']).split(' '))\n", "words_all.value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 17, "text": [ "study 56\n", "using 33\n", "two 32\n", "patients 31\n", "biodegradation 30\n", "non 29\n", "data 28\n", "three 28\n", "analysis 27\n", "compared 27\n", "soil 27\n", "new 27\n", "results 26\n", "species 25\n", "cell 25\n", "...\n", "engage 1\n", "thermal 1\n", "geochip 1\n", "dominant 1\n", "suggests 1\n", "third 1\n", "usually 1\n", "locomotion 1\n", "rpos 1\n", "scales 1\n", "prefer 1\n", "quite 1\n", "protocatechuate 1\n", "routine 1\n", "agr 1\n", "Length: 3028, dtype: int64" ] } ], "prompt_number": 17 }, { "cell_type": "code", "collapsed": false, "input": [ "relevant_words_pairs = words_all.copy()#.str.lower()\n", "#relevant_words_pairs = relevant_words_pairs[~relevant_words_pairs.str.contains(r'[^#@a-z]')]\n", "#relevant_words_pairs = relevant_words_pairs[relevant_words_pairs.str.len() > 1]\n", "#ignore = set(stopwords.words('english')) & set(relevant_words_pairs.unique())\n", "relevant_words_pairs.value_counts()#.drop(ignore)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 18, "text": [ "study 56\n", "using 33\n", "two 32\n", "patients 31\n", "biodegradation 30\n", "non 29\n", "data 28\n", "three 28\n", "analysis 27\n", "compared 27\n", "soil 27\n", "new 27\n", "results 26\n", "species 25\n", "cell 25\n", "...\n", "engage 1\n", "thermal 1\n", "geochip 1\n", "dominant 1\n", "suggests 1\n", "third 1\n", "usually 1\n", "locomotion 1\n", "rpos 1\n", "scales 1\n", "prefer 1\n", "quite 1\n", "protocatechuate 1\n", "routine 1\n", "agr 1\n", "Length: 3028, dtype: int64" ] } ], "prompt_number": 18 }, { "cell_type": "code", "collapsed": false, "input": [ "from nltk.collocations import BigramCollocationFinder\n", "from nltk.metrics import BigramAssocMeasures\n", "\n", "bcf = BigramCollocationFinder.from_words(relevant_words_pairs)\n", "for pair in bcf.nbest(BigramAssocMeasures.likelihood_ratio, 30):\n", " print ' '.join(pair)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "synthetic biology\n", "spider silk\n", "es cell\n", "adjacent segment\n", "medical imaging\n", "dp dtmax\n", "security privacy\n", "industry backgrounds\n", "removal initiation\n", "uv irradiated\n", "gm salmon\n", "persistent crsab\n", "antimicrobial therapy\n", "limb amputation\n", "cellular phone\n", "wireless powered\n", "minimally invasive\n", "phone technology\n", "heavy metals\n", "battery powered\n", "composite mesh\n", "frequency currents\n", "genetically modified\n", "tissue engineering\n", "catheter removal\n", "acting reversible\n", "brassica napus\n", "brown streak\n", "quasi stiffness\n", "data code\n" ] } ], "prompt_number": 19 }, { "cell_type": "code", "collapsed": false, "input": [ "#these are the most common paired words\n", "bcf.nbest(BigramAssocMeasures.likelihood_ratio, 20)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 20, "text": [ "[(u'synthetic', u'biology'),\n", " (u'spider', u'silk'),\n", " (u'es', u'cell'),\n", " (u'adjacent', u'segment'),\n", " (u'medical', u'imaging'),\n", " (u'dp', u'dtmax'),\n", " (u'security', u'privacy'),\n", " (u'industry', u'backgrounds'),\n", " (u'removal', u'initiation'),\n", " (u'uv', u'irradiated'),\n", " (u'gm', u'salmon'),\n", " (u'persistent', u'crsab'),\n", " (u'antimicrobial', u'therapy'),\n", " (u'limb', u'amputation'),\n", " (u'cellular', u'phone'),\n", " (u'wireless', u'powered'),\n", " (u'minimally', u'invasive'),\n", " (u'phone', u'technology'),\n", " (u'heavy', u'metals'),\n", " (u'battery', u'powered')]" ] } ], "prompt_number": 20 }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Unique words from each abstract\n", "\n", "This takes only unique words from each abstract." ] }, { "cell_type": "code", "collapsed": false, "input": [ "#following http://nbviewer.ipython.org/github/sanand0/ipython-notebooks/blob/master/Text-analysis.ipynb\n", "#only includes a set() of words from each abstract\n", "words = pd.Series(' '.join(abs_set_df['text']).split(' '))\n", "words.value_counts()" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 21, "text": [ "study 38\n", "two 23\n", "using 21\n", "results 20\n", "three 20\n", "analysis 20\n", "compared 17\n", "used 16\n", "higher 16\n", "may 16\n", "non 15\n", "based 15\n", "significantly 14\n", "also 14\n", "however 14\n", "...\n", "septal 1\n", "recommendations 1\n", "genomes 1\n", "poking 1\n", "gck 1\n", "optimised 1\n", "varied 1\n", "counting 1\n", "monitoring 1\n", "malware 1\n", "tmc 1\n", "rape 1\n", "occur 1\n", "conversely 1\n", "cda 1\n", "Length: 3028, dtype: int64" ] } ], "prompt_number": 21 }, { "cell_type": "markdown", "metadata": {}, "source": [ "### There is no longer any need for dropping stopwords here\n", "\n", "That was done in the function `wordify()`" ] }, { "cell_type": "code", "collapsed": false, "input": [ "top_words = words.value_counts().reset_index()\n", "top_words.columns = ['word', 'count']\n", "top_words.head(15)" ], "language": "python", "metadata": {}, "outputs": [ { "html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
0 study 38
1 two 23
2 using 21
3 results 20
4 three 20
5 analysis 20
6 compared 17
7 used 16
8 higher 16
9 may 16
10 non 15
11 based 15
12 significantly 14
13 also 14
14 however 14
\n", "

15 rows \u00d7 2 columns

\n", "
" ], "metadata": {}, "output_type": "pyout", "prompt_number": 23, "text": [ " word count\n", "0 study 38\n", "1 two 23\n", "2 using 21\n", "3 results 20\n", "4 three 20\n", "5 analysis 20\n", "6 compared 17\n", "7 used 16\n", "8 higher 16\n", "9 may 16\n", "10 non 15\n", "11 based 15\n", "12 significantly 14\n", "13 also 14\n", "14 however 14\n", "\n", "[15 rows x 2 columns]" ] } ], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "word_list = list(top_words.word)\n", "word_str = ' '.join(word_list)\n", "print word_str" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "study two using results three analysis compared used higher may non based significantly also however present methods rate well respectively associated model new time gene suggest background high important one data found showed process potential biodegradation performance development activity degradation enhanced could aim lower genes presence different species bioremediation major days available use devices revealed determine environmental studies type increased engineering method growth including level significant demonstrated specific observed due objective biological within following increase whether patient provide genetically cell protein medical research identified device patients health current applications public tissue transgenic evidence possible materials findings ability molecular improve long conducted small efficient identify large rates engineered quality clinical design information concentrations controls reduce selected number methodology impact developed work among derived control common affect demonstrate years novel application soil assessed expression strain role order analyses natural term treated environment designed techniques microbial biomass soils period resistance selection outcomes first inoculated candidates trials field carbon waste surface wild make ph reduced cellular measured incidence included follow performed synthetic system addition six characteristics needed animal essential optimal independent treatment generation primary concentration thus factors involved particularly products technologies commercial relative changes greater function evaluate cells systems furthermore relevant uptake better excellent remain towards known culture effective introduce support many contaminated proteins ci exposure point genetic article vs strains life influence recent response implant conditions models multi according adhesion pcr often plants failure appropriate samples strategies approach without regulation improvement would help dna initial processes reduction production bacteria effects morphology network correlation show surgery behavior highly moreover sources tested effect least alternative mainly prosthetic interactions per body central randomized criteria native report biomaterial nature bacterial describe general frequency determined risks factor shown enzyme therefore functional established regulatory sp lines allow approximately little mechanisms modified provided obtained lead outcome animals understanding biology water electron future evaluating persistent values half desired critical material environments introduction degrade accumulation transformation low contamination day invasive address useful example region co benefit still genome networks platform larger wt requires test technical regeneration isolate minimal result indicated metals measurement heart making case scanning countries respect reference pollutant toxic achieved significance microscopy formed ii whereas likely overexpression screening various procedure product concept containing currently comparative capacity enzymes open best mechanical year groups principal dimensional community limited size applied authors laboratory collagen since determination ratio crop positively double investigated several substrate structural times basis include benefits indicating linked focus features algal efficacy structures induced disease participants population content databases pathogens acid weight key ideal objectives compounds strategy resistant improved detected neither related investigation lack though implants care efficiency elements treatments sequence growing isolates corresponding particular argue experimental organic contribution kinetics diagnostic fluorescence chromatography independently therapy hydrocarbons exhibited suggesting dioxide play review similar required atrial construct successfully oxidation functions researchers alone introduced source metabolic resulting variety aromatic mice less site register stable standard induction human success mutants activities reported differences resulted component science ranged levels difference power provides contrast formation capable fate contacts scaffolds previously increasing removed readily vivo risk confirmed comparing pollutants issue rapid energy become month either facilities us crucial controlled implemented structure assess much form grow anterior studied followed conclusion step directly synthesis meet cultured specifically reports real employed consistently additional area rat dioxygenase wastes polycyclic full red experiment cause part although accumulate comparison reductase fold supplies completely toward genus organisms implantable supplied types circuitry inductive regulator place qrt perspective processed supply predict post multiple cultures modern appeared verified intense index loads height practice week proposed generate marine nerve industrial remained investments regardless conserved published acceptance demonstrates extensively publishing need experience journal computational suitable catalyzed nevertheless crops social concerns stiffness equipment majority mechanism created generated studying bacterium minimally people another perceived described observations optimized woven toxicity recovered assembly clearly scientific stroke concluded intermediate zn stimulation centered residual hours fibrillation met differ complication require electronic excess transmission wide temperature along software simulation manipulation fibres engagement protocol transfer sole yet consecutive cardiac physical ray dakota hazard removal modifications market degrading profile confidence personnel pathway advances intracellular web decrease relatively stress rise composed thresholds affinity papers seeded inducing healthcare availability degeneration indicator establishment parameters highest properties vital positive participate scenarios approaches hydrolysis degradable literature mass individual statistical collectively set occurred mineralization arthroplasty cases represent dry publications release polymers pathways dw untreated united poor fusion involving phenanthrene four additionally cochrane degraded sem phosphorus stimulating induce produced random change regarding suggested monitored north ring locally explore reducing virus oxidative promote articles modeling implanted biodegrading effectiveness needs specimens proliferation examined matrix depending supporting self guidelines previous miniature extracted ex sand remote subjects chose solution polyethylene involves leading weak commonly suboptimal scaffold isolated distribution group supplemented worst mentioned coli statistically rna develop experiments dynamics condition limit database made affects observation electrical dam great clear policy quantitatively metal describing scale salinity decreased participation range sophisticated adult arising assessment importance manufacturers dawley medium lowest catheter membrane aid metabolites sites definite error md repair brassica components language cervical single lipophilic reaction analyse circuit pacing parts plant drug enabling wireless line might collaboration question empirical fully summary meta instructions easily simple versus reliable resource density interval areas absorption yields cleavage adapted rare died framework evaluated humans medicines napus namely broad postoperative successful affected good access specificity streams spine mesh regression widely advice designing enhancing stem forms generally laccase enzymatic across bioassay diversity cross correlated developing goal constant parallel mesenchymal pacemaker promising arabidopsis recently damage adjacent mutant terms pahs prevalence already embase therapeutics equation driven enrichment maintained cd injection acdf wall alkyl canola comprehensive frequently ms biocompatibility strongest sprague mixed microorganisms states indicates cv maintain tolerance complexity coal pah age electrodes foods minimum characteristic ablation fired together identification total intensive like funding extracellular hydrocarbon free knowledge mean pure finally threshold enhance sub extends technique scarce drugs measurements overcome five innovative chain protocols nutrient via mortality hospital systematic hosts biodegradable despite gm series gas macroalgae evaluation fibre oxidized cultivation collected bioreactors biomaterials layers survey redox diseases accepted concomitantly retention emerging categories regulated pain target ash algae biosorption capabilities foundation arrhythmia overall issues international qualities complex imaging morphological discectomy detection amended composition mouse eastern chitosan remediate examiners escaped propulsive impacting immobilize electrocardiograph sufficed concerning echelon decontamination cautiously peritoneum east left lilacs temporary remaining tolerances unable acutely covary consisting speciation harvesting atrazine halophilic create institutions female comprise regulating machinery numerous il score automated emergency multiplexing maintenance referent mixtures enrolled optimum emphasize things chemistry promise genomic ethers recurrent abundant potentially makers transcriptional fast degradations texturizing elevated pn simulated schizophrenia wherein draws sterilization unique egfp quantitative potent fungal indicative thereby citations currents quickly corporate libraries coated fresh artificial tumors commission grown chloride analytics diode rival survival patent pacemakers distinctive firm validity initiation motif baseline reframing oriented fouling depends publicly tendentially vegetable ploscollections microarray fibrils around slips alignment percentages larc counts accessing medically pcah abnormality biologically spontaneously ca hypothesized heating moratorium ffrs almost neat accidental daily paradigms vectors clay patented way prices demand microelectronic failing remarkable linkage kept friendly inoculum whole exponential maize phytic aroma ohnoi likert hand load intervals ligand incubation restrictions crw col loam fmn logic receives texture improves connection microalgae programs developments niger antiretroviral reveals replaces surrogate foreign representative candidate eps sirnas widespread abilities reproducibility ir element outside contraceptive ether seed expensive pcag traditional moment root dysphagia microcosms unmatched initially location sydney technological typical kit differing relevance numerical decline land herein eacmv respiration belongs transcripts pentaromativorans readers robust irradiated congruently workers balance oxyr naphthalene promoting vesicles signals physiomeshtm suite integrated agents wider user adaptation competent uses immunolabelling ranked saharan interactive biotech immunologic upgrades gap utilizes seven augmented fibrous sensitivity cassava sense prescribed dioxygenases versatile tendons validates normal sharp rootworm sensu chlorinated dutch plateaued compare reply cultivate taken wovens centrally chromosomal adherence occasionally polybrominated representatives elicited validated end respective influenced providing salts crystallized hybridization reviews specialized substances filamentous fischer emission symptoms bursts tensile demilitarization male staphylococcus sbstta fractions snap entails genotypes corn redberg selective aligned world supplemental must abundance mg amputation debates mm greatly valid temperatures learn plays detonation modify others paper roles postoperatively configuration polyaromatic kinetic viral attitude cofactors loose dichotomous ffr acting remnants infant competitive rapidly chlorpyrifos composite highlights expectations da reuters lasting persist stifling weeks biogenic osteogenic emulate food propose discuss sets encoded drenthe invasion remediation conclusively electroporation polymerase fungus lab athletes mapping lag sustainable prognostic lvpcs ubiquitous concentrate edodes decontaminated prostheses redonning third sports spectra visceral de perfectio hypotheses algorithms ultimate deterioration methoxylation dialogue combinations linear tended bred deficient severely medline neurological origin residues references nih transcriptome replication landrace activation depend far skeletal erythropolis archives phytate inactivating tnt automatic rats phrenic pubmed programme hydroxybenzoate african modes incentives patents vector persisted updated ucbsv endogenous yellow religious pluripotency consists march systemic surgical compatible tg dynamic ald became founder influenza flavin assembled causes responsiveness guaiacol feeding calculate eligible electricity pretreatment indirectly disability blamed hospitalized susceptible telemedicine bond decolorization outer direct feasibility hydrophobic biomechanical subject requiring transplantation family evolution pathological changed trinitrotoluene corrodens biotransformation amount prior impeding africa diabrotica utilisation transparency trained superoxide armed landscape conventional hydroxylation takes macroalgal distinct mediated modification ethyl farmer traits identifiable discriminated preferences walking attitudes bipolar creation revolutionize pesticidal filtration organization errors even investment infections rigorous expressions reconstruction phdf engineer phytase screened lipophilicity dhruva metallidurans compiler protection biomacromolecules bioremediate active strong detoxification publisher embryonic safe caused blastocysts diphenyl originally vascular zealand establish explosives approved problematic battery phase powering letter lvpc experiences stability mobile humanitarian underserved tuber phenomenon humic joints polypropylene able fitted quantifying camv velocity consuming systematically claim dissolved persists ros assesses acidovorans polar orders odors regulate seem seen agrobacterium pair salmon raw defect stakeholder characterisation ran brominated affairs varying unprocessed convenient responsible accumulative recommended absorbed build asd biotoxicity valuable remarkably points incentive adhesions lentinula rdna injectable reductions silk resolve crp infrequent collection societies dp dechlorination multiserver breast laccases intraindividual homoeologs interrater yielded barcoding industries crt nutrition facilitators bag investigate prescribing spiked originated freshwater explanations steam bulky enriched experienced clarified variable war phdi visualisation denser thickness analogue round digested acceptability bioavailability express biodegrade soluble proven evolutionary relied meetings added guidance introgression distributed short alizarin hominis maturation biotechnological adequate moderately launched silt sulfonic fertile compiled haemophilus iyawo motivation equivalent inducer complications fabrics occurs humification instead dsp induces barrier quantify bdps rr exploring contend event telecommunication arthoplasty categorizes unavailable tolerant exploiting trilobal inexpensive immunogenic sterility estimate commercially backed pseudomonas thematic excursion trichloro sensory aspergillus operation doses hydroxylating labeling exceeded backgrounds ommeren potatoes echocardiography oxygen expect remove stromal weedy ablations communities oxidant reached capability catabolic inbred interestingly bone heavy afforded financial reflect spider olfaction contact complexes seedling eikenella coefficients landraces iso threat serve brevibacterium hydroperoxide lysinibacillus townsville somewhat immediate ncs ranging tool bis relationship option invasively hexaploid grew critically polysaccharides cultivar arthrodesis nigerian landfill always nations matter flame congestive rectifiers tryptophan concentrated acquisition differentiated exposures percentage consequence proteomics endocarditis incubated utilized camp nacl rarely omv rethink death speeds thermotolerance atz characterizing spp hormonal bland identifies dismutase capture prohibitively vitro herbicides impairing utilization grade interviews threatening immune checkpoint station glucocorticoid precipitation delftia advanced phosphate fourier publishers affordable belonging country starch carrying reproducible closed determines tend pro subsequent federal inhibitory explosive agencies phases ps westar conserve klang genorm taking career limits perceptions blade declining come multivariable analyzed bioprocessing latter detect virgin uniformly infiltrated trivialization malaysian differential firmly settings faceted closure procedures sinus laminar subset sexually distinctly fibrillogenesis amplitude yield unclear mononucleotide contain bacteriophage aquatic adoption familiarity accompanied oko highlighting national maximal actin predictive view econometric seeking specification manifestations comment avb ongoing hacek transmit value lives kingella secundum surprisingly incorporating quantigene spirit snowflake compromised combining confers healing crsab fabricated bmm perform phenoxyl partner nonbrowning polyester optimistic abts hardware operating encouragement impacts speed examines layer contribute mates multipotent inversely cancer therapies evidences provinces amino latency cp viruses ante saccharomyces compilation occluder strength intermittently cloning antioxidant occluded cycling acute april ethylbenzothiazoline rt disc justified unprotected heteroatoms infective mobitz personal slowly biodegradability polystyrene mark thousand dsrnas diameter stay joint peritoneal unlikely spread methicillin curves locations resynchronization correction brisbane trial investors quartz intrauterine addressing grafting behaviors proactive enjoyable interrupted negatively inoculation persistence competence characterized faster online optimizations transcribed overcoming images reactions instrumentation overexpressing stricto viewed superfamily early main familiar prosthesis mnp continuous feral shelf thomson urgent standardized receiver transgene spanning mining rsm gives pathogen solvent released enzymology assays reform mechanics mcid electrically transferred browning ionization coastal tailor unfortunately surveyed colleagues steel updating physicochemical preserve eem chrr trace proportion segments conduction title slow debate expansins loamy enhances preferred surgically inferences lac fifty learnt driving combination manufactured deploy palustris net means metalloids degradability stance discharge hope local crimp sulfur fuel opportunity molecules regulators reactive contaminant german reversible asds aquaculture utilise escherichia transcript agreed hydrogen usually eight polyvinylidene tte oilseed younger bioaugmentation catalase pyr meshed renal transcription propels impression transparent isolation typically senior seroma liquid considered select challenged pores implications gather average technology afterwards toluene naturally valley vas section detectable qof enzymatically displayed normfinder promoter van collagens pharmaceutical pvdf intervention sensing appa optimize lively complete applying tolerated count polyploidy irreversible impedance convergence fed hierarchical hu societal magnitude hs fundamentals triggered arm chronic embedded informing malaysia industry freedom carried periodically triplicate host epidemiology notably expansion analysed numbers selecting variances limitations replacement roadside dataset macrophage attention numerically functionality warrants communication implantation inclusion couplers researching pastoris executives additives trypsin act stand laparoscopically pyridinol cohort pool biosis metabolized pichia june jude therapeutic ventralighttm seeding anesthetized germline immunity illustrate antimicrobial soxr spatial proteomic building stringent geographic resolution validation biotechnology questionnaires missouri tenocytes ulva inhibited computers transmissibility acids odds bias contraceptives clinically antiarrhythmic cleanup injectables dissemination connected inadequate mycobacterium accentuated strongly reusing display rather spectrometry pooled lignin catalyzes converted efforts interventional al causing exoskeletal beats illinois storage internet ge define anaerobic hospitals stainings commentators providers formulation peptide periods notion reliability chains necessitating sirna kingdom prediction divergence msc nd latest dimension germination biosciences patterns capsules quinone microbes programmable promoted sport increasingly displays tracking domestic metolachlor testing events ratios untested differentiation estimation predicts default details underlying batteries dose tcp obtain center relies expressing determinants document facepiece geographical signal oligomeric adverse surveillance administration proper began media diets proto productivity anchored infer cmd diabetes status mosaic organized mfc chimeric discouraged intended decabromodiphenyl structurally reporting acda strengths contraception extended cladosporium olfactory weekly dimethoxyphenol adw cardiopulmonary find exposed classical chromatopraphy governance poly accessible carboxylate works negative dense neurologic annually methodological metabolite underwent kda endocardial competition tme pioneer uraninite cumene markers dysphonia infection harbors colloidal achieves characterization organizing tetramer home bioenergy qualitative nutrients strikingly necessary tests resolved stakeholders match plos utility attractive add arrest dyes pcl pups adp context measuring fluctuating dimers government semi heteroresistance loss diffraction transmits connective cerevisiae receptor biologists preference novosphingobium increases dark tendon bioreactor white evoked convention absolute agr interact effluents serious phone sectors detecting dsrna bioethical rescued potato longer restrained smp bibliographies healthy code dissipation edk monogastric parafac disorders nanopod inverse prospectively bioconcentration kate adjustment biphenyl gc themes existing global monsanto neuroscience transform abdominal tufa pbdes green loosening quarters tentative epidemiological hemodynamic ni constructed neoformed discovery collections adversely tomography barriers preventing setting text degenerative assayed symptomatic update problems continue beneficial lend excitation users powered draw fibroblasts xenobiotics coat homoeologous simulations putative tissues weaving psychotropic electrophysiology stressful extradiol bde cannot leakage munitions dried video cost divergences pore aims rd insect assisted illustrating marketed homologous ionic degree led organs inconsistencies image firms shared composites capital perspectives lessons expressed iuss occurrence smaller st top substrates polymeric stranded flowering emergencies methylimidazolium advantage presently assay confirmation division colonise suffered rhythm electrophysiological minute cover upper chromate dimer mahs filtering action solved geminivirus fewer carbonyl practical counselled respirators gilvum earthworm vegetations respiratory policies units diverse modality disappeared processing odor epicardial markedly un transthoracic epigenetic predictors cardiobacterium disinfecting eluates innovation wheat eliminate intraoperative decorin microwave nontechnical giving close exhibit dynamically adipose firing reuse incorporated limitation mbrs quadripolar manufacturing ugandan dtmax uranium contributing sural quantification immunofluorescence mathematical fluoride security popular liquids indication trigger collect consensus neuropathy facets phytoremediation cladosporioides disrepair intraclass www old dominated phytoremoval undetectable elongation polyploid delivered exuded oxanilic pay relatives targets escape ram pyrene understood interplay mineral block raised behaving crystal accuracy uv personally extremity azo consist aggregatibacter fda raising foulants subsidiary binding covered cbsd rabbits maximum intrarater mineralisation exploration mesothelium sd abstract defined computing adults seeds originating phantom developmental stages interactional nearly mature verification incorporation needles nanopods protected plla represented mbr bags frames obligatory ground australia tcps biocatalysts iv bound ppo flow civil quasi interaction ankle metabolism brown clinics offering interagency considerably shift sections deliverable conceivable lip forward transportation enables reinvestment unobserved rhodococcus focused mode gait months exceeding january cbsv purified analytical radicals orfs groningen matched presidential males generic documented preceding mfcs plasmid predilection challenges abnormal catechol recalls rnas prompted sanguinis thin bridging mitral individually ready repaired naturalization synbio impaired ethanesulfonic innocuous normative exclusion superior observational operative sacrificed diol agricultural nitrogen episodes course facilitated inch absence lowering vancomycin enforcement agouti environmentally hypothesize searching calculated measure ft dad cupriavidus sensorimotor manganese legs choose rnai azino lv labor av society fibronectin orthoses striking thermostability tehran prospective blind excitable stimulate appropriateness workbook involve reoperation rita tension motion corroborating pandemic presented distal accumulated variables attack indicate ventricular structured privacy rcts management phya array af contour consumed return evolved believed programming sanket determining caprolactone patenting pocd formula rpob contained streak heterologous interference valve miniaturization postprocedure safety sncv es karyotype viable shows ecg except routine derivation showing retrospective compositions neo org architecture besides advantages possibility reality pathogenic theory obvious sf pbde promises introducing ligninolytic reflects polyneuropathy resuscitation msf films reductive interface duplicate mw next bottleneck possess appropriately sharing upcoming usa postimplant account feed women identifying playing rabbit virulence transient visual connectivity visibility vesicle reporter acceptable withstand intensively biomedical lined atrioventricular viability macroporous suggests stiffnesses dominant percutaneous geochip mo thermal engage box deduced physiology counselling cardiovascular locomotion rpos individuals rhodopseudomonas standing monocyclic tools ldpe phenolic limb xrd binomial protocatechuate quite ipo prefer mscs peroxidase scales manipulations man catalytic searches explored icc pressures accessory aureus waters improving altman hairpin map stock filament parameter date assistive hplc journals challenge pleasantness cosmopolitan robustness infra iuds unit diabetic ie oedogonium coculture dragline neck xylanilyticus defects efficiently prone osteoblasts flavodoxin bacteremia chemical effectively purposes populations deposition stream silkworm twenty proof postmarket segment determinates contaminations stored lc behaviour spondylosis allowed involvement hydrophilic septal recommendations genomes poking gck optimised varied counting monitoring malware tmc rape occur conversely cda\n" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "# Note the changed filename\n", "top_words.to_csv('../wordcloud2.csv', index=False)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 25 } ], "metadata": {} } ] }