{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Converting the ehrlink PDF data supplement to tab-separated text\n",
    "\n",
    "This notebook converts the [PDF of indications](http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3422843/bin/amiajnl-2012-000852-s1.pdf) from:\n",
    "\n",
    "> McCoy et al. (2012) **Development and evaluation of a crowdsourcing methodology for knowledge base construction: identifying relationships between clinical problems and medications**. *Journal of the American Medical Informatics Association* [doi:10.1136/amiajnl-2012-000852](//dx.doi.org/10.1136/amiajnl-2012-000852)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Convert the PDF to text. No output path is given; the conversion below reads\n",
    "# the result from download/amiajnl-2012-000852-s1.txt.\n",
    "!pdftotext download/amiajnl-2012-000852-s1.pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def generate_lines(path):\n",
    "    \"\"\"\n",
    "    Yield the usable lines of the text file at `path`.\n",
    "\n",
    "    Leading form feed characters are stripped from every line. Lines that\n",
    "    then start with 'file://' and lines that contain only whitespace are\n",
    "    skipped. All other lines are yielded with their line ending intact.\n",
    "    \"\"\"\n",
    "    with open(path) as read_file:\n",
    "        for line in read_file:\n",
    "            # Form feeds are presumably the page separators written by pdftotext\n",
    "            line = line.lstrip('\\x0c')\n",
    "            # Presumably a page header/footer left over from printing -- not data\n",
    "            if line.startswith('file://'):\n",
    "                continue\n",
    "            if not line.strip():\n",
    "                continue\n",
    "            yield line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "n_lines = 0\n",
    "# newline='' lets the csv module control line endings, as the csv documentation\n",
    "# recommends for files passed to csv.writer.\n",
    "with open('download/amiajnl-2012-000852-s1.tsv', 'w', newline='') as write_file:\n",
    "    writer = csv.writer(write_file, delimiter='\\t')\n",
    "    line_gen = generate_lines('download/amiajnl-2012-000852-s1.txt')\n",
    "    reader = csv.reader(line_gen, delimiter='|', quotechar='\"', quoting=csv.QUOTE_ALL)\n",
    "    for row in reader:\n",
    "        # A quoted field can contain line breaks; replace them with spaces so that\n",
    "        # every record stays on a single line of the output.\n",
    "        row = [elem.replace('\\n', ' ') for elem in row]\n",
    "        writer.writerow(row)\n",
    "        n_lines += 1\n",
    "\n",
    "# Sanity check: 11166 records plus 1 header row are expected.\n",
    "assert n_lines == 11166 + 1, 'unexpected number of rows: {}'.format(n_lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
{ "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
medication_definition_idmedicationproblem_definition_idproblempatient_link_frequencylink_ratio
070Albuterol Sulfate (5 MG/ML) 0.5% Inhalation Ne...64181Asthma60.600000
170Albuterol Sulfate (5 MG/ML) 0.5% Inhalation Ne...64205Chronic Obstructive Pulmonary Disease30.428571
272Albuterol Sulfate 2 MG/5ML Oral Syrup77891Acute Upper Respiratory Infection20.666667
\n", "
" ], "text/plain": [ " medication_definition_id \\\n", "0 70 \n", "1 70 \n", "2 72 \n", "\n", " medication problem_definition_id \\\n", "0 Albuterol Sulfate (5 MG/ML) 0.5% Inhalation Ne... 64181 \n", "1 Albuterol Sulfate (5 MG/ML) 0.5% Inhalation Ne... 64205 \n", "2 Albuterol Sulfate 2 MG/5ML Oral Syrup 77891 \n", "\n", " problem patient_link_frequency link_ratio \n", "0 Asthma 6 0.600000 \n", "1 Chronic Obstructive Pulmonary Disease 3 0.428571 \n", "2 Acute Upper Respiratory Infection 2 0.666667 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "indication_df = pandas.read_table('download/amiajnl-2012-000852-s1.tsv')\n", "indication_df[:3]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "collapsed": true }, "outputs": [], "source": [ "medication_group = indication_df.groupby(['medication_definition_id', 'medication'], as_index=False)\n", "medication_df = medication_group.agg({'problem': 'count'}).rename(columns={'problem': 'indications'})\n", "medication_df = medication_df.sort('indications', ascending=False)\n", "medication_df.to_csv('data/medications.tsv', sep='\\t', index=False)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "collapsed": false }, "outputs": [], "source": [ "problem_group = indication_df.groupby(['problem_definition_id', 'problem'], as_index=False)\n", "medication_df = problem_group.agg({'medication': 'count'}).rename(columns={'medication': 'indications'})\n", "medication_df = medication_df.sort('indications', ascending=False)\n", "medication_df.to_csv('data/problems.tsv', sep='\\t', index=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", 
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.0" } }, "nbformat": 4, "nbformat_minor": 0 }