{
 "metadata": {
  "name": "",
  "signature": "sha256:fb602b8501cfd3896a691c06e390c63d56d5fd9d64a5cf07fc2057c44adecd4e"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "from __future__ import unicode_literals\n",
      "import json\n",
      "import numpy as np\n",
      "import pandas as pd\n",
      "from pandas import DataFrame, Series\n",
      "import xlrd\n",
      "from collections import defaultdict"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 2
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Transforming the PLOS thesaurus\n",
      "\n",
      "The PLOS thesaurus was kindly provided to us as a spreadsheet with thousands of rows, one node per row. It is a polyhierarchy represented in the form of a tree. We need to transform it into a JSON object that also includes article counts for all the nodes in the tree.\n",
      "\n",
      "An example of the desired data structure for PLOS thesaurus:\n",
      "\n",
      "```\n",
      "{\"count\": #total, \"name\": \"PLOS\", \"children\": [\n",
      "     {\"name\": \"Computer and information sciences\",\n",
      "      \"count\": ###,\n",
      "      \"children\": [\n",
      "            {\"name\": \"Information technology\",\n",
      "             \"count\": ###,\n",
      "             \"children\": [\n",
      "                {\"name\": \"Data mining\", \"count\": ###},\n",
      "                {\"name\": \"Data reduction\", \"count\": ###},\n",
      "                {\"name\":  \"Databases\",\n",
      "                  \"count\": ###,\n",
      "                  \"children\": [\n",
      "                    {\"name\": \"Relational databases\", \"count\": ###}\n",
      "              ]\n",
      "            },\n",
      "            ...,\n",
      "            {\"name\": \"Text mining\",\"count\": ###} \n",
      "          ]\n",
      "        }\n",
      "      ]\n",
      "    }, \n",
      "    ...\n",
      "  ]\n",
      "}\n",
      "```\n",
      "\n",
      "In Python, each node is a dict. Children are specified as a list of dicts. The whole thing is a list of nodes, therefore, a list of dicts."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Import article data\n",
      "df = pd.read_pickle('../data/all_plos_df.pkl')\n",
      "\n",
      "# Drop unused data\n",
      "df.drop(['author', 'title_display', 'journal', 'abstract', 'publication_date', 'score'], axis=1, inplace=True)\n",
      "df.set_index('id', inplace=True)\n",
      "df.head()"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "html": [
        "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
        "<table border=\"1\" class=\"dataframe\">\n",
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
        "      <th>subject</th>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>id</th>\n",
        "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>10.1371/journal.pone.0008858</th>\n",
        "      <td> [/Biology and life sciences/Biochemistry/Prote...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10.1371/journal.pone.0004722</th>\n",
        "      <td> [/Biology and life sciences/Cell biology/Cellu...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10.1371/journal.pone.0076865</th>\n",
        "      <td> [/Biology and life sciences/Biochemistry/DNA/D...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10.1371/journal.pbio.0040157</th>\n",
        "      <td> [/Research and analysis methods/Research asses...</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10.1371/journal.pone.0080851</th>\n",
        "      <td> [/Biology and life sciences/Biochemistry/Prote...</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "<p>5 rows \u00d7 1 columns</p>\n",
        "</div>"
       ],
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 9,
       "text": [
        "                                                                        subject\n",
        "id                                                                             \n",
        "10.1371/journal.pone.0008858  [/Biology and life sciences/Biochemistry/Prote...\n",
        "10.1371/journal.pone.0004722  [/Biology and life sciences/Cell biology/Cellu...\n",
        "10.1371/journal.pone.0076865  [/Biology and life sciences/Biochemistry/DNA/D...\n",
        "10.1371/journal.pbio.0040157  [/Research and analysis methods/Research asses...\n",
        "10.1371/journal.pone.0080851  [/Biology and life sciences/Biochemistry/Prote...\n",
        "\n",
        "[5 rows x 1 columns]"
       ]
      }
     ],
     "prompt_number": 9
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Let's make sure we are counting articles correctly for each subject node.\n",
      "\n",
      "def count_articles(df, subject_path):\n",
      "    s = df.subject.apply(lambda s: str(s))\n",
      "    matching = s[s.str.contains(subject_path)]\n",
      "    return len(matching)\n",
      "\n",
      "print 'Total articles:', len(df)\n",
      "print 'Science policy:', count_articles(df, 'Science policy')\n",
      "print 'Science policy/Bioethics:', count_articles(df, 'Science policy/Bioethics')"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "Total articles: 115489\n",
        "Science policy: "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "518\n",
        "Science policy/Bioethics: "
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "71\n"
       ]
      }
     ],
     "prompt_number": 10
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "def tree_from_spreadsheet(f, df, verbose=False):\n",
      "    \n",
      "    subjects = df.subject.apply(lambda s: str(s))\n",
      "    \n",
      "    book = xlrd.open_workbook(f)\n",
      "    pt = book.sheet_by_index(0)\n",
      "    # spreadsheet cells : (row, col) :: cell A1 : (0, 0)\n",
      "\n",
      "    # Initialize a list to contain the thesaurus.\n",
      "    # Our test case will only have one item in this list.\n",
      "    pt_list = []\n",
      "\n",
      "    # Keep track of the path in the tree.\n",
      "    cur_path = Series([np.nan]*10)\n",
      "\n",
      "    for r in range(1, pt.nrows):\n",
      "        # Start on row two.\n",
      "\n",
      "        # Columns: the hierarchy goes up to 10 tiers.\n",
      "        for c in range(10):\n",
      "            if pt.cell_value(r, c):\n",
      "                # If this condition is satisfied, we are at the node that's in this line.\n",
      "\n",
      "                # Construct the path to this node.\n",
      "                # Clean strings because some terms (RNA nomenclature) cause unicode error\n",
      "                text = pt.cell_value(r, c).replace(u'\\u2019', \"'\")\n",
      "                cur_path[c] = text\n",
      "                cur_path[c+1:] = np.nan\n",
      "                path_list = list(cur_path.dropna())\n",
      "                tier = len(path_list)\n",
      "                path_str = '/'.join(path_list)\n",
      "                if verbose:\n",
      "                    print tier, path_str\n",
      "\n",
      "                # Add the node to the JSON-like tree structure.\n",
      "                node = defaultdict(list)\n",
      "                node['name'] = text\n",
      "                node['count']=  len(subjects[subjects.str.contains(path_str)])\n",
      "\n",
      "                # This part is completely ridiculous. But it seems to work.\n",
      "                if tier == 1:\n",
      "                    pt_list.append(node)\n",
      "                    pt_list.append\n",
      "                elif tier == 2:\n",
      "                    pt_list[-1]['children'].append(node)\n",
      "                elif tier == 3:\n",
      "                    pt_list[-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 4:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 5:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 6:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 7:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 8:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 9:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "                elif tier == 10:\n",
      "                    pt_list[-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'][-1]['children'].append(node)\n",
      "\n",
      "                # Go to next row after finding a term. There is only one term listed per row.\n",
      "                break\n",
      "\n",
      "    # Make a single JSON object to contain all the branches.\n",
      "    pt_obj = {'count': len(df), 'name': 'PLOS', 'children': pt_list}\n",
      "\n",
      "    return pt_obj"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 11
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Prototyping\n",
      "\n",
      "Experimenting on a smaller subset of the thesaurus: the very small ``Science policy`` branch."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plosthes_test_file = '../data/plosthes_test.xlsx'\n",
      "\n",
      "json.dumps(tree_from_spreadsheet(plosthes_test_file, df, verbose=True))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "1 Science policy\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Bioethics\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Bioethics/Justice in science\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Bioethics/Respect for human dignity\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Bioethics/Sanctity of life\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Bioethics/Scientific beneficence\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Bioethics/Scientific nonmaleficence\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Material transfer agreements\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding/Corporate funding of science\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding/Government funding of science\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding/Institutional funding of science\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding/Military funding of science\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding/Philanthropic funding of science\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research funding/Research grants\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research integrity\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research integrity/Publication ethics\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Research integrity/Scientific misconduct\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Science and technology workforce\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Science and technology workforce/Careers in research\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Science education\n",
        "3"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Science education/Science fairs\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Science policy and economics\n",
        "2"
       ]
      },
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        " Science policy/Technology regulations\n"
       ]
      },
      {
       "metadata": {},
       "output_type": "pyout",
       "prompt_number": 12,
       "text": [
        "'{\"count\": 115489, \"name\": \"PLOS\", \"children\": [{\"count\": 518, \"name\": \"Science policy\", \"children\": [{\"count\": 71, \"name\": \"Bioethics\", \"children\": [{\"count\": 1, \"name\": \"Justice in science\"}, {\"count\": 2, \"name\": \"Respect for human dignity\"}, {\"count\": 1, \"name\": \"Sanctity of life\"}, {\"count\": 2, \"name\": \"Scientific beneficence\"}, {\"count\": 1, \"name\": \"Scientific nonmaleficence\"}]}, {\"count\": 1, \"name\": \"Material transfer agreements\"}, {\"count\": 198, \"name\": \"Research funding\", \"children\": [{\"count\": 17, \"name\": \"Corporate funding of science\"}, {\"count\": 67, \"name\": \"Government funding of science\"}, {\"count\": 18, \"name\": \"Institutional funding of science\"}, {\"count\": 2, \"name\": \"Military funding of science\"}, {\"count\": 4, \"name\": \"Philanthropic funding of science\"}, {\"count\": 91, \"name\": \"Research grants\"}]}, {\"count\": 96, \"name\": \"Research integrity\", \"children\": [{\"count\": 68, \"name\": \"Publication ethics\"}, {\"count\": 17, \"name\": \"Scientific misconduct\"}]}, {\"count\": 31, \"name\": \"Science and technology workforce\", \"children\": [{\"count\": 31, \"name\": \"Careers in research\"}]}, {\"count\": 26, \"name\": \"Science education\", \"children\": [{\"count\": 0, \"name\": \"Science fairs\"}]}, {\"count\": 3, \"name\": \"Science policy and economics\"}, {\"count\": 6, \"name\": \"Technology regulations\"}]}]}'"
       ]
      }
     ],
     "prompt_number": 12
    },
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "## Ready for full conversion and export?\n",
      "\n",
      "Warning: it takes about 90 minutes."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "plosthes_full_file = '../data/plosthes.2014-1.full.xlsx'\n",
      "\n",
      "# Generate tree structure\n",
      "# Change to verbose=True if you want to see it happening. \n",
      "# (Fills up the output cell with ~10000 lines.)\n",
      "plos_tree = tree_from_spreadsheet(plosthes_full_file, df, verbose=False)\n",
      "\n",
      "# Export tree structure as JSON\n",
      "# (changed output filename to something we aren't currently using)\n",
      "with open('../data/plos_hierarchy_full.json', 'wb') as f:\n",
      "    json.dump(plos_tree, f)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}