{
 "metadata": {
  "name": "parse_mega_to_json.ipynb"
 },
 "nbformat": 3,
 "nbformat_minor": 0,
 "worksheets": [
  {
   "cells": [
    {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
      "# Tree table to JSON / GEXF\n",
      "\n",
      "Reads the ancestor/descendant table `gh43treetable.txt`, collects the unique node ids and the ancestor-to-descendant links, and writes them as a node/link JSON file and as a GEXF graph file."
     ]
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "import json\n",
      "import os\n",
      "import xml.etree.ElementTree as ET\n",
      "\n",
      "import pandas as pd\n",
      "\n",
      "# Output files, written relative to DATA_DIR.\n",
      "json_filename = 'x.json'\n",
      "gexf_filename = 'x.gexf'\n",
      "\n",
      "# Directory holding the input table; adjust per machine\n",
      "# (an earlier location was C:/Users/David/Documents/Dropbox).\n",
      "DATA_DIR = \"C:/_Dropbox/Dropbox\"\n",
      "os.chdir(DATA_DIR)\n",
      "\n",
      "# Read every column as text so the ids are handled as strings.\n",
      "df = pd.read_csv(\"gh43treetable.txt\", dtype=str)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 26
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Flat list of names: a nested list would build a MultiIndex on newer pandas,\n",
      "# which breaks the df['ancid'] / df.ancid lookups used below.\n",
      "df.columns = ['ancid', 'desc1', 'desc2', 'branchlength1', 'branchlength2']"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 27
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Normalise the id columns (trim, spaces -> underscores), then collect the unique ids.\n",
      "ID_COLUMNS = ['ancid', 'desc1', 'desc2']\n",
      "assert not df[ID_COLUMNS].isnull().values.any(), \"missing id in tree table\"\n",
      "for field in ID_COLUMNS:\n",
      "    # Whole-column assignment replaces the chained-indexing writes\n",
      "    # (df[field][idx] = ...), which are not guaranteed to reach the frame.\n",
      "    df[field] = df[field].str.strip().str.replace(\" \", \"_\")\n",
      "\n",
      "# Row-major flattening keeps the ancid, desc1, desc2 order of each row.\n",
      "all_ids = df[ID_COLUMNS].values.ravel().tolist()\n",
      "print(len(all_ids))\n",
      "# set() drops duplicates; the resulting order is arbitrary, but every later\n",
      "# cell derives positions from this same id_list.\n",
      "id_list = list(set(all_ids))\n",
      "print(len(id_list))"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "639\n",
        "427\n"
       ]
      }
     ],
     "prompt_number": 28
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Spot-check one cleaned row and the first few unique ids.\n",
      "print(df.iloc[2])\n",
      "print(id_list[:10])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "ancid 217\n",
        "desc1 216\n",
        "desc2 ASPNI_A2QT85\n",
        "branchlength1 0.0000000000\n",
        "branchlength2 0.0000000000\n",
        "Name: 2, dtype: object\n",
        "['Paeby1p7_018872', 'Paeby1p7_018871', 'bri_CHGT_02150', 'ABN43C_PENCH', '344', '345', 'ABN43A_PENCH', 'CORTH_1_02834', '340', '341']\n"
       ]
      }
     ],
     "prompt_number": 29
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Lookup tables between a node's position in id_list and its id.\n",
      "dict_pti = dict(enumerate(id_list))  # position -> id\n",
      "dict_itp = dict((node_id, pos) for pos, node_id in enumerate(id_list))  # id -> position"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 31
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# One [ancestor, descendant] pair per descendant column, row by row.\n",
      "link_list = [\n",
      "    [anc, desc]\n",
      "    for anc, desc1, desc2 in zip(df['ancid'], df['desc1'], df['desc2'])\n",
      "    for desc in (desc1, desc2)\n",
      "]\n",
      "print(link_list[:10])"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [
      {
       "output_type": "stream",
       "stream": "stdout",
       "text": [
        "[['215', 'ABN43A_ASPNG'], ['215', 'ANIG203143G'], ['216', '215'], ['216', 'ASPNI_P42256'], ['217', '216'], ['217', 'ASPNI_A2QT85'], ['218', 'PENCH_Q5H7M8'], ['218', 'ABN43A_PENCH'], ['219', 'Paeby1p7_018872'], ['219', '218']]\n"
       ]
      }
     ],
     "prompt_number": 36
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write the node/link JSON: nodes are listed in id_list order and each link\n",
      "# refers to its two nodes by position in that list.\n",
      "graph = {\n",
      "    \"nodes\": [{\"name\": node_id} for node_id in id_list],\n",
      "    \"links\": [\n",
      "        {\"source\": dict_itp[anc], \"target\": dict_itp[desc]}\n",
      "        for anc, desc in link_list\n",
      "    ],\n",
      "}\n",
      "with open(json_filename, \"w\") as f:\n",
      "    # json.dump takes care of quoting/escaping and of the separators that\n",
      "    # the hand-written f.write() calls had to get right manually.\n",
      "    json.dump(graph, f, indent=1)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": [],
     "prompt_number": 52
    },
    {
     "cell_type": "code",
     "collapsed": false,
     "input": [
      "# Write the same graph as GEXF. The element layout (gexf > meta/graph >\n",
      "# nodes/edges) follows the minimal example of the GEXF 1.2 primer; node ids\n",
      "# are the positions in id_list, so they match the indices used in the JSON file.\n",
      "gexf = ET.Element(\"gexf\", {\"xmlns\": \"http://www.gexf.net/1.2draft\", \"version\": \"1.2\"})\n",
      "meta = ET.SubElement(gexf, \"meta\")\n",
      "ET.SubElement(meta, \"creator\").text = \"Gexf.net\"\n",
      "\n",
      "# Links run from ancestor to descendant, hence a directed graph.\n",
      "graph_el = ET.SubElement(gexf, \"graph\", {\"mode\": \"static\", \"defaultedgetype\": \"directed\"})\n",
      "\n",
      "nodes_el = ET.SubElement(graph_el, \"nodes\")\n",
      "for pos, node_id in enumerate(id_list):\n",
      "    ET.SubElement(nodes_el, \"node\", {\"id\": str(pos), \"label\": node_id})\n",
      "\n",
      "edges_el = ET.SubElement(graph_el, \"edges\")\n",
      "for edge_id, (anc, desc) in enumerate(link_list):\n",
      "    ET.SubElement(edges_el, \"edge\", {\n",
      "        \"id\": str(edge_id),\n",
      "        \"source\": str(dict_itp[anc]),\n",
      "        \"target\": str(dict_itp[desc]),\n",
      "    })\n",
      "\n",
      "# ElementTree escapes attribute values and emits the XML declaration itself.\n",
      "ET.ElementTree(gexf).write(gexf_filename, encoding=\"UTF-8\", xml_declaration=True)"
     ],
     "language": "python",
     "metadata": {},
     "outputs": []
    }
   ],
   "metadata": {}
  }
 ]
}