{ "metadata": { "name": "HANDSON_causality" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "!ls data/cause-effect/CEdata_text/PUBLIC/\n", "from os import path\n", "data_path = './data/cause-effect/CEdata_text/PUBLIC/'\n", "train_pairs_file = path.abspath(path.join(data_path, 'CEdata_train_pairs.csv'))\n", "train_info_file = path.join(data_path, 'CEdata_train_publicinfo.csv')\n", "train_target_file = path.join(data_path, 'CEdata_train_target.csv')" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "CEdata_train_pairs.csv\t CEdata_valid_pairs.csv\r\n", "CEdata_train_publicinfo.csv CEdata_valid_publicinfo.csv\r\n", "CEdata_train_target.csv train_records.pkl\r\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "\n", "!head -n 2 data/cause-effect/CEdata_text/PUBLIC/CEdata_train_target.csv\n", "print ''\n", "!head -n 2 data/cause-effect/CEdata_text/PUBLIC/CEdata_train_publicinfo.csv\n", "print ''\n", "!head -n 2 data/cause-effect/CEdata_text/PUBLIC/CEdata_train_pairs.csv\n", "print ''" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "SampleID,Target [1 for A->B; -1 otherwise],Details [1 for A->B; 2 for A<-B; 3 for A-B; 4 for A|B]\r\n", "train1,0,3\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "SampleID,A type,B type\r\n", "train1,Numerical,Numerical\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "SampleID,A,B\r\n", "train1, 2092 1143 390 1424 1277 1833 905 980 1488 1451 2091 1911 1674 1137 1687 2227 1556 2472 2502 1296 1515 1892 2473 1830 1726 1831 1667 1307 1873 1336 1725 1154 870 639 1196 1339 2047 9999 2734 2358 2099 1517 1125 1541 2022 1894 989 1519 1367 1140 1488 1162 1790 1277 1787 922 2047 377 1550 2257 843 1536 2562 1752 1718 1113 1916 1608 1413 883 1208 1158 1945 2263 2185 3296 1393 1451 1535 981 1709 2237 2947 1790 1681 1273 2499 2062 1829 1014 1820 1867 2594 1459 2290 843 1560 630 2271 2414 872 1654 1940 1848 726 2108 1316 1498 1384 1760 1471 1055 1320 1012 2147 2089 2333 1290 1972 1629 1124 1445 2039 2203 1075 1002 1516 2115 1707 2137 1255 1684 1317 1632 877 1629 1629 2063 1431 3759 1718 1848 1208 2349 2192 1510 2276 809 1508 814 1399 1521 1761 1029 862 1147 1742 1308 1784 690 1449 225 769 1924 1734 827 1481 766 2060 1541 1516 1643 2022 1107 2631 1954 1642 1100 1861 1618 1202 1529 851 1853 1385 1346 2244 2173 1625 1570 1730 915 456 1591 1675 1015 2082 2011 2089 1066 1833 1845 2187 1835 1503 1501 713 1421 777 1130 1481 1044 1202 929 2628 1841 1698 1114 4039 2476 846 1552 1719 766 1574 1519 1148 1530 1797 1692 1236 909 1422 1286 1783 2405 669 1669 1564 992 1678 1767 1183 2272 1899 2137 1916 1191 1849 1581 1951 1429 1480 1662 2157 1738 609 1315 2184 1212 2685 1889 2089 1727 1447 881 1447 2075 1223 1790 1746 1936 2083 1597 759 2399 1712 2004 881 2185 2321 1232 1208 1444 2021 1348 1474 1801 2006 1807 2171 1719 1634 1096 1976 1394 1525 1915 1621 1934 2327 1281 1002 893 1758 1933 3196 1799 2362 2199 2456 1394 564 1416 2272 1495 657 1221 1343 1813 1682 1200 1546 1194 2282 1537 2780 1680 1949 1124 1302 1943 1235 1645 1331 1302 1537 1041 1461 2131 1753 1836 2076 2150 2356 1647 2116 1832 2164 2190 2031 529 1800 1724 1447 1730 2308 1016 2185 1239 2845 2112 1906 1023 2007 1223 761 860 1335 1984 1796 1153 1713 2218 1287 1311 1139 3032 941 2085 1379 2276 
1564 2486 1549 1404 1742 1260 2085 885 1303 645 0 1327 1819 1834 2441 1389 2131 2199 1374 2199 881 1390 1625 1630 1363 1051 907 1454 1097 2260 2388 1288 2109 2298 1216 1148 720 2293 1691 1668 3161 1938 1315 1993 1537 769 1560 1392 1062 2195 2291 1316 1518 1908 2163 1633 1764 926 947 2026 1323 2324 1917 221 803 1416 1514 1291 1543 1219 1860 1206 1757 2072 1419 929 1585 1750 1459 1803 2056 1565 1691 2163 1849 2060 1399 1956 1936 1396 472 1531 1375 1801 1604 1386 1236 1722 1703 1075 1932 1860 1532 1945 1920 1494 1356 1509 1505 2093 402 1289 2253 1686 1617 1292 1740 1967 958 1757 1524 1480 2077 1608 1122 2807 1487 2127 2303 1803 2056 2635 1861 1658 1779 2480 1512 1241 1468 983 1563 1576 1006 1659 1310 1445 2239 702 1101 2237 1894 1646 799 1416 898 1528 1917 1678 1761 1697 1542 1895 1749 1875 924 2627 2055 2196 1525 1304 1227 1566 1653 1614 1041 2531 1380 1296 1208 1287 831 1764 1881 1896 1763 2097 1930 2278 192 1523 1232 1949 859 1618 2129 613 946 1963 2650 1841 1317 515 1496 1755 2082 1988 1966 2072 636 1502 1047 1315 2470 2241 1780 1139 1043 1795 792 1632 1592 1220 1646 2124 1796 2382 3138 1877 1299 1340 2125 822 995 1457 1944 1605 1054 1885 2055 1170 1065 1361 1219 1385 1847 1652 1378 1020 1005 1137 1158 1766 1321 1499 1568 2245 2398 1766 1317 1600 1007 1415 2693 1952 1390 2864 1716 2186 1047 2572 2072 1889 2264 2140 1792 1362 2082 2129 1694 1825 1509 1721 1149 1667 2234 1218 1104 1259 1680 949 1425 2037 2200 1267 1481 2505 2397 2190 1198 1737 1823 2403 1512 996 957 1373 1725 1854 460 2363 1274 1672 2713 2169 1440 1757 3002 1790 1778 1606 1484 1643 1096 1510 2448 1870 1628 2097 2343 1540 2128 1584 1732 1405 1447 910 1493 1597 1793 2945 1849 1506 1621 1250 2837 1518 2042 2520 960 2869 1220, 5651 4449 4012 6124 7310 7608 6201 4618 6595 7414 6683 4304 6877 5517 7890 2920 5081 7259 4523 6626 6726 2494 5282 3934 5871 3590 3989 0 5445 5654 6386 7347 6084 4007 4184 6633 7567 5115 2629 1160 5089 4571 5644 4083 6096 6878 6797 4687 6993 3855 5063 6388 5963 4009 4679 8342 4182 5749 1909 5480 6457 6181 5518 2548 3797 3353 8291 3196 5236 5515 6553 3265 7034 2487 4042 6605 2550 8377 4571 6535 3984 7795 4294 5780 6312 6393 5827 7661 4199 4241 5747 6511 4235 2703 4771 2715 5034 4211 6815 6751 3558 5535 1072 6287 6434 9246 4104 5295 4549 4753 6803 5862 7764 5539 1877 7046 4008 5471 3824 5911 5005 8698 7005 6033 5300 4161 5411 5245 6174 5641 3674 6787 7035 6991 3821 1624 4455 6512 2644 8326 4349 6881 7770 8244 2177 4592 5895 2796 4786 3448 9327 3981 8366 6352 3965 5761 9265 695 2991 4126 8900 5414 4763 4415 4914 5729 7334 9306 4548 4136 5757 4808 7762 5266 6703 6505 6038 4635 2249 8647 5291 2927 2602 4999 5405 2050 7370 4722 8146 6569 3109 5523 2330 6458 9672 3635 9396 8304 5220 9242 7544 1078 7529 4023 6151 5111 3320 4086 3278 4494 1536 1564 7540 5574 6205 4238 4800 5593 7462 4143 7387 2995 4637 4833 5105 8489 2208 4777 7674 4042 4195 1730 6873 5470 3619 6324 8696 3556 5281 4346 5390 4794 7927 5105 4391 5230 4756 5075 2919 4730 4282 2548 2885 6968 450 4195 5773 5567 8475 7837 7463 6282 6104 4884 6640 6651 6489 5347 3919 5821 6405 6356 6360 7783 2993 6276 7573 3760 6169 3833 7026 6159 6081 3043 4279 5188 4799 6693 4540 9371 6126 6297 367 4288 4766 4703 5745 8566 7238 6797 1517 5746 3625 7822 5444 5132 6114 6574 8228 6120 6586 6609 6958 3570 4533 3800 7339 5658 4538 3521 6044 7593 4830 5155 5389 8549 4973 7583 5211 4721 4969 6786 6131 6521 6301 4774 5229 7865 6440 4868 6507 5107 5524 6782 6488 4844 5272 5216 2767 6019 2913 2301 7019 7987 5611 3734 6543 6752 6479 6298 6299 3386 4843 5412 3937 6816 3957 2892 
5168 3708 3915 6474 5829 5133 4586 7733 6234 8058 6218 6114 5514 5940 5051 5753 4513 2863 5582 5532 4427 3498 5325 4962 4601 6055 3907 5200 9472 7938 6826 2957 8776 5452 4025 6769 8098 5245 7331 6784 6683 5062 3764 5902 7001 4968 5448 7547 4585 5194 4930 8612 7882 5263 7102 679 7843 6514 5647 6847 7570 6077 7021 4302 4743 5238 3209 6064 6502 6309 3851 5411 3345 4393 6537 4841 8123 4220 5648 3204 5160 3862 4781 4441 4550 7372 8446 4743 4311 5375 2794 5460 3858 6451 7840 5630 5464 3426 5031 7310 5455 5461 1087 5561 3781 2388 4490 6863 5541 6112 4967 6654 8543 7509 6845 1949 5963 7483 3793 6262 7278 5028 4554 4512 7473 3253 3807 7363 5873 7574 5795 4114 2769 5749 7164 7959 2580 7005 5651 7283 5087 7933 4493 4711 6176 4892 3018 5288 4143 7929 5115 3112 5343 4089 3350 5133 3987 3080 3394 2284 7822 5690 4250 2035 5163 4128 4689 5529 6570 6257 6488 6172 4929 7309 6391 5697 7363 4603 9380 3526 5788 2732 8653 6892 4990 8078 5469 4441 6604 2770 5887 6833 4670 7768 7931 5904 4667 6748 5110 7144 8560 4943 2262 3162 3683 4754 9360 5329 6236 5451 7672 9653 3565 5672 3782 2329 7169 5749 3770 4039 6242 5570 5080 5229 4755 5024 7149 7599 5249 5391 4396 6076 3435 5651 4446 2703 2717 5694 6863 3100 5948 4424 2801 5306 7547 5710 5428 4786 7152 4896 6684 4786 5865 5014 5449 5916 4676 5689 6238 7384 6190 6434 9999 5827 3002 6908 8004 4152 7316 4398 6940 3036 6775 5199 6911 5325 5780 2692 3619 3968 2186 5627 6434 6196 6284 8390 6803 6532 4250 4802 6278 5257 3688 4290 5656 6157 5626 3452 5152 4449 5341 8727 4371 7758 6616 6995 5372 8645 5614 5022 5684 4088 4006 4701 7465 7005 4555 7005 4833 5965 6582 5439 5137 6754 6126 5026 6635 4230 1921 5414 8559 5381 5088 8241 8682 3844 3335 5824 6461 3685 5651 2683 4150 9941 6408 3407 6904 4911 4504 6030 4838 6228 6974 3528 6123 3881 4057 2897 2758 3311 8074 7063 6907 5650 5192\r\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "## read the three training files into one dictionary\n", "## format: {sample_id: {'target', 'dtarget', 'atype', 'btype', 'avalue', 'bvalue'}}\n", "\n", "import re\n", "digit_pat = re.compile('\\d+') ## pulls the integer id out of e.g. 'train1'\n", "\n", "## 1. read target data\n", "import csv\n", "target_csv = csv.DictReader(open(train_target_file, 'rb'), fieldnames=('id', 'target', 'dtarget'))\n", "next(target_csv) ## skip the original header row\n", "train_records = {}\n", "## raw target: 1 = A->B, -1 = B->A, 0 = no causal link; remap to label indices 0/1/2\n", "target_mapping = {1:0, -1:1, 0:2}\n", "for r in target_csv:\n", "    target = target_mapping[int(r['target'])]\n", "    dtarget = int(r['dtarget']) - 1 ## details 1..4 -> label indices 0..3\n", "    r['id'] = int(digit_pat.findall(r['id'])[0])\n", "    train_records[r['id']] = dict(zip(('target', 'dtarget'), (target, dtarget)))\n", "\n", "## 2. read variable types and merge\n", "## ONLY 3 possible types {Binary, Categorical, Numerical}\n", "info_csv = csv.DictReader(open(train_info_file, 'rb'), fieldnames=('id', 'atype', 'btype'))\n", "next(info_csv) ## skip the original header row\n", "for line in info_csv:\n", "    line['id'] = int(digit_pat.findall(line['id'])[0])\n", "    train_records[line['id']].update({f: line[f] for f in ('atype', 'btype')})\n", "\n", "## 3. read the series and merge\n", "pairs_csv = csv.DictReader(open(train_pairs_file, 'rb'), fieldnames=('id', 'avalue', 'bvalue'))\n", "next(pairs_csv)\n", "for line in pairs_csv:\n", "    sampleid = int(digit_pat.findall(line['id'])[0])\n", "    aconverter = float if train_records[sampleid]['atype'] == 'Numerical' else int\n", "    bconverter = float if train_records[sampleid]['btype'] == 'Numerical' else int\n", "    avalue = map(aconverter, line['avalue'].split())\n", "    bvalue = map(bconverter, line['bvalue'].split())\n", "    train_records[sampleid].update(dict(zip(('avalue', 'bvalue'), (avalue, bvalue))))\n", "\n", "print type(train_records[1]['avalue'])\n", "target_labels = ['a->b', 'b->a', 'no']\n", "dtarget_labels = ['a->b', 'b->a', 'a-b', 'a|b']\n", "## sample ids should be exactly 1..7831; sort to avoid relying on dict key order\n", "print sorted(train_records.keys()) == range(1, 7832)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "\n", "True\n" ] } ], "prompt_number": 55 },
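{ "cell_type": "markdown", "metadata": {}, "source": [ "A quick sanity check on the parsed records (a minimal sketch against the `train_records` structure built above): the two series of a pair should have equal length, and `Numerical` columns should have been converted to floats." ] }, { "cell_type": "code", "collapsed": false, "input": [ "## sketch: spot-check one parsed record\n", "r = train_records[1]\n", "assert len(r['avalue']) == len(r['bvalue'])\n", "if r['atype'] == 'Numerical':\n", "    assert all(isinstance(v, float) for v in r['avalue'])\n", "print 'record 1 OK:', len(r['avalue']), 'points,', r['atype'], 'vs', r['btype']" ], "language": "python", "metadata": {}, "outputs": [] },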
{ "cell_type": "code", "collapsed": false, "input": [ "## pickle train_records for quick reloading\n", "import cPickle\n", "train_records_file = path.join(data_path, 'train_records.pkl')\n", "cPickle.dump((train_records, target_labels, dtarget_labels), open(train_records_file, 'wb'))" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 56 }, { "cell_type": "markdown", "metadata": {}, "source": [ "***Load data from pickle***" ] }, { "cell_type": "code", "collapsed": false, "input": [ "import cPickle\n", "from os import path\n", "data_path = './data/cause-effect/CEdata_text/PUBLIC/'\n", "train_records_file = path.join(data_path, 'train_records.pkl')\n", "train_records, target_labels, dtarget_labels = cPickle.load(open(train_records_file, 'rb'))\n", "print len(train_records)\n", "print train_records[1].keys()\n", "print train_records.keys()[:10]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "7831\n", "['dtarget', 'target', 'avalue', 'atype', 'btype', 'bvalue']\n", "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n" ] } ], "prompt_number": 1 }, { "cell_type": "code", "collapsed": false, "input": [ "print dtarget_labels" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['a->b', 'b->a', 'a-b', 'a|b']\n" ] } ], "prompt_number": 2 }, { "cell_type": "markdown", "metadata": {}, "source": [ "***Profiling***\n", "\n", "- target distribution\n", "- relation between target and detailed target (dtarget)\n", "- distribution of numerical/binary/categorical input type combinations\n", "- lengths of the pair series - are they always equal within a pair?" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## target distribution\n", "import numpy as np\n", "targets = np.asarray([train_records[k]['target'] for k in train_records])\n", "dtargets = np.asarray([train_records[k]['dtarget'] for k in train_records])\n", "target_values = np.unique(targets)\n", "print [(target_labels[v], sum(targets==v), sum(targets==v)*1./len(targets)) for v in target_values]\n", "dtarget_values = np.unique(dtargets)\n", "print [(dtarget_labels[v], sum(dtargets==v), sum(dtargets==v)*1./len(dtargets)) for v in dtarget_values]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[('a->b', 1920, 0.2451794151449368), ('b->a', 1909, 0.2437747414123356), ('no', 4002, 0.51104584344272763)]\n", "[('a->b', 1920, 0.2451794151449368), ('b->a', 1909, 0.2437747414123356), ('a-b', 2032, 0.2594815476950581), ('a|b', 1970, 0.25156429574766953)]\n" ] } ], "prompt_number": 3 },
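{ "cell_type": "markdown", "metadata": {}, "source": [ "The same counts can be computed more compactly with `np.bincount` (a minimal sketch; it assumes the 0-based integer label encoding of `targets` and `dtargets` built above)." ] }, { "cell_type": "code", "collapsed": false, "input": [ "## sketch: np.bincount counts occurrences of each non-negative integer label\n", "counts = np.bincount(targets)\n", "print zip(target_labels, counts, counts * 1. / len(targets))\n", "dcounts = np.bincount(dtargets)\n", "print zip(dtarget_labels, dcounts, dcounts * 1. / len(dtargets))" ], "language": "python", "metadata": {}, "outputs": [] },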
"print [(dtarget_labels[v], sum(dtargets==v), sum(dtargets==v)*1./len(dtargets)) for v in dtarget_values]\n", "## [(0, 1909, 0.2437747414123356), (1, 4002, 0.51104584344272763), (2, 1920, 0.2451794151449368)]\n", "## [(0, 1920, 0.2451794151449368), (1, 1909, 0.2437747414123356), (2, 2032, 0.2594815476950581), (3, 1970, 0.25156429574766953)]\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[('a->b', 1920, 0.2451794151449368), ('b->a', 1909, 0.2437747414123356), ('no', 4002, 0.51104584344272763)]\n", "[('a->b', 1920, 0.2451794151449368), ('b->a', 1909, 0.2437747414123356), ('a-b', 2032, 0.2594815476950581), ('a|b', 1970, 0.25156429574766953)]\n" ] } ], "prompt_number": 3 }, { "cell_type": "code", "collapsed": false, "input": [ "## relation between target and dtarget\n", "target_vs_dtarget = np.unique(zip(targets, dtargets))\n", "print [(target_labels[t], dtarget_labels[dt]) for t,dt in target_vs_dtarget]\n", "print [((target_labels[t], dtarget_labels[dt]), sum(np.logical_and(targets==t, dtargets==dt))) for (t, dt) in target_vs_dtarget]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[('a->b', 'a->b'), ('b->a', 'b->a'), ('no', 'a-b'), ('no', 'a|b')]\n", "[(('a->b', 'a->b'), 1920), (('b->a', 'b->a'), 1909), (('no', 'a-b'), 2032), (('no', 'a|b'), 1970)]\n" ] } ], "prompt_number": 4 }, { "cell_type": "code", "collapsed": false, "input": [ "## distribution of input types\n", "a_types = np.asarray([train_records[k]['atype'] for k in train_records])\n", "b_types = np.asarray([train_records[k]['btype'] for k in train_records])\n", "uniq_a_types = np.unique(a_types)\n", "uniq_b_types = np.unique(b_types)\n", "print uniq_a_types\n", "print [ (v, sum(a_types==v), sum(a_types==v)*1./len(a_types)) for v in uniq_a_types]\n", "print uniq_b_types\n", "print [ (v, sum(b_types==v), sum(b_types==v)*1./len(b_types)) for v in uniq_b_types]\n", "uniq_a_vs_b_types = np.unique(zip(a_types, b_types))\n", "print uniq_a_vs_b_types\n", "[((a, b), sum(np.logical_and(a_types==a, b_types==b))) for a,b in uniq_a_vs_b_types]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['Binary' 'Categorical' 'Numerical']\n", "[('Binary', 1049, 0.13395479504533264), ('Categorical', 1099, 0.14033967564806538), ('Numerical', 5683, 0.72570552930660193)]\n", "['Binary' 'Categorical' 'Numerical']\n", "[('Binary', 1040, 0.13280551653684075), ('Categorical', 1108, 0.14148895415655727), ('Numerical', 5683, 0.72570552930660193)]\n", "[['Binary' 'Binary']\n", " ['Binary' 'Categorical']\n", " ['Binary' 'Numerical']\n", " ['Categorical' 'Binary']\n", " ['Categorical' 'Categorical']\n", " ['Categorical' 'Numerical']\n", " ['Numerical' 'Binary']\n", " ['Numerical' 'Categorical']\n", " ['Numerical' 'Numerical']]\n" ] }, { "output_type": "pyout", "prompt_number": 5, "text": [ "[(('Binary', 'Binary'), 203),\n", " (('Binary', 'Categorical'), 205),\n", " (('Binary', 'Numerical'), 641),\n", " (('Categorical', 'Binary'), 173),\n", " (('Categorical', 'Categorical'), 196),\n", " (('Categorical', 'Numerical'), 730),\n", " (('Numerical', 'Binary'), 664),\n", " (('Numerical', 'Categorical'), 707),\n", " (('Numerical', 'Numerical'), 4312)]" ] } ], "prompt_number": 5 }, { "cell_type": "code", "collapsed": false, "input": [ "## len of pair series - are they always of the same length\n", "a_pair_lens = np.asarray(map(len, [train_records[k]['avalue'] for k in 
{ "cell_type": "code", "collapsed": false, "input": [ "## lengths of the pair series - are they always equal within a pair?\n", "a_pair_lens = np.asarray(map(len, [train_records[k]['avalue'] for k in train_records]))\n", "b_pair_lens = np.asarray(map(len, [train_records[k]['bvalue'] for k in train_records]))\n", "## compare elementwise; note that np.all over a generator expression is always truthy\n", "print np.all(a_pair_lens == b_pair_lens)\n", "print a_pair_lens[:5]" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "[ 733 1568 3724 3845 1904]\n" ] } ], "prompt_number": 7 }, { "cell_type": "markdown", "metadata": {}, "source": [ "***Feature Extraction***" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## helper functions\n", "import pandas as pd\n", "from functools import partial\n", "from scipy.stats.stats import pearsonr\n", "import numpy as np\n", "\n", "\n", "def cols_extractor(data_records, colnames):\n", "    \"\"\"\n", "    extract the columns named in colnames from data_records (a dict of dicts)\n", "    \"\"\"\n", "    data_keys = data_records.keys()\n", "    return {col: np.asarray([data_records[k][col] for k in data_keys])\n", "            for col in colnames}\n", "\n", "def dummy_encoder(data_records, colnames):\n", "    results = {}\n", "    for colname in colnames:\n", "        colvalues = np.asarray([data_records[k][colname] for k in data_records])\n", "        unique_col_values = np.unique(colvalues)\n", "        results.update({colname+'_'+str(v): (colvalues==v).astype(np.int) for v in unique_col_values})\n", "    return results\n", "\n", "def series_len_extractor(data_records, colnames):\n", "    results = {}\n", "    for colname in colnames:\n", "        results.update({colname: [len(data_records[k][colname]) for k in data_records]})\n", "    return results\n", "\n", "def pearson_coeff_extractor(data_records, colpair):\n", "    field_name = 'pearson_'+'_vs_'.join(colpair)\n", "    col1, col2 = colpair\n", "    return {field_name: np.asarray([pearsonr(data_records[k][col1], data_records[k][col2])[0]\n", "                                    for k in data_records])}\n", "\n", "def pearson_pvalue_extractor(data_records, colpair):\n", "    field_name = 'pearson_pvalue_'+'_vs_'.join(colpair)\n", "    col1, col2 = colpair\n", "    return {field_name: np.asarray([pearsonr(data_records[k][col1], data_records[k][col2])[1]\n", "                                    for k in data_records])}\n", "\n", "def extract_features(records, extractors):\n", "    \"\"\"\n", "    records = dictionary of dictionaries, key: sampleID, value: dict of field names and values\n", "    extractors = dict, key: feature extractor name,\n", "        value: function taking data_records as the only param, returning a dict of {colname: column}\n", "    return = pandas DataFrame indexed by sample id\n", "    \"\"\"\n", "    results = {}\n", "    for (_, fextractor) in extractors.items():\n", "        results.update(fextractor(records))\n", "    return pd.DataFrame(data = results, index = records.keys())" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 8 },
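{ "cell_type": "markdown", "metadata": {}, "source": [ "Any callable that maps `data_records` to a dict of `{column_name: values}` plugs into `extract_features`. Below is a minimal sketch of one more extractor in the same style (not used in the experiments here), based on `scipy.stats.spearmanr`; rank correlation could hypothetically help with monotone but nonlinear pairs." ] }, { "cell_type": "code", "collapsed": false, "input": [ "## sketch: an extra extractor following the same {colname: column} protocol\n", "from scipy.stats import spearmanr\n", "\n", "def spearman_coeff_extractor(data_records, colpair):\n", "    field_name = 'spearman_' + '_vs_'.join(colpair)\n", "    col1, col2 = colpair\n", "    return {field_name: np.asarray([spearmanr(data_records[k][col1], data_records[k][col2])[0]\n", "                                    for k in data_records])}" ], "language": "python", "metadata": {}, "outputs": [] },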
{ "cell_type": "code", "collapsed": false, "input": [ "## test helper functions\n", "\n", "atype_values = cols_extractor(train_records, ['atype'])\n", "print atype_values\n", "assert type(atype_values) == dict\n", "assert atype_values['atype'].shape == (7831, )\n", "#print dummy_encoder(train_records, ['atype', 'btype'])\n", "## note: the 'avalue'/'bvalue' columns below hold the series lengths (series_len_extractor)\n", "df = extract_features(train_records, {\n", "    'a_b_dummy_types': partial(dummy_encoder, colnames = ['atype', 'btype'])\n", "    , 'target': partial(cols_extractor, colnames = ['target', 'dtarget'])\n", "    , 'a_b_lens': partial(series_len_extractor, colnames = ['avalue', 'bvalue'])\n", "    , 'a_b_pearsonr': partial(pearson_coeff_extractor, colpair=['avalue', 'bvalue'])\n", "    , 'a_b_pearsonpvalue': partial(pearson_pvalue_extractor, colpair=['avalue', 'bvalue'])\n", "})\n", "\n", "print df.head()\n", "print df" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "{'atype': array(['Numerical', 'Numerical', 'Numerical', ..., 'Binary', 'Categorical',\n", " 'Numerical'], \n", " dtype='|S11')}\n", " atype_Binary atype_Categorical atype_Numerical avalue btype_Binary \\\n", "1 0 0 1 733 0 \n", "2 0 0 1 1568 0 \n", "3 0 0 1 3724 1 \n", "4 0 0 1 3845 0 \n", "5 0 0 1 1904 0 \n", "\n", " btype_Categorical btype_Numerical bvalue dtarget \\\n", "1 0 1 733 2 \n", "2 1 0 1568 2 \n", "3 0 0 3724 0 \n", "4 0 1 3845 0 \n", "5 0 1 1904 1 \n", "\n", " pearson_avalue_vs_bvalue pearson_pvalue_avalue_vs_bvalue target \n", "1 0.062958 8.851131e-02 2 \n", "2 -0.174862 3.114540e-12 2 \n", "3 0.170234 1.299030e-25 0 \n", "4 0.827731 0.000000e+00 0 \n", "5 0.531925 1.468736e-139 1 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "\n", "\n", "Int64Index: 7831 entries, 1 to 7831\n", "Data columns (total 12 columns):\n", "atype_Binary 7831 non-null values\n", "atype_Categorical 7831 non-null values\n", "atype_Numerical 7831 non-null values\n", "avalue 7831 non-null values\n", "btype_Binary 7831 non-null values\n", "btype_Categorical 7831 non-null values\n", "btype_Numerical 7831 non-null values\n", "bvalue 7831 non-null values\n", "dtarget 7831 non-null values\n", "pearson_avalue_vs_bvalue 7831 non-null values\n", "pearson_pvalue_avalue_vs_bvalue 7831 non-null values\n", "target 7831 non-null values\n", "dtypes: float64(2), int64(10)\n" ] } ], "prompt_number": 9 }, { "cell_type": "markdown", "metadata": {}, "source": [ "***Classification***" ] }, { "cell_type": "code", "collapsed": false, "input": [ "## divide data into train and validation\n", "from sklearn import cross_validation\n", "headers = list(df.columns)\n", "print headers\n", "input_cols = [i for (i, f) in enumerate(headers) if f not in ['target', 'dtarget']]\n", "target_cols = [i for (i, f) in enumerate(headers) if f in ['target', 'dtarget']]\n", "data = np.asarray(df)\n", "## target_cols follows header order ['dtarget', 'target'], so target_cols[0]\n", "## selects dtarget: we classify the 4-class detailed target here\n", "X, y = data[:, input_cols], data[:, target_cols[0]]\n", "print X.shape, y.shape\n", "train_X, validation_X, train_y, validation_y = cross_validation.train_test_split(X, y, test_size = 0.3)\n", "print train_X.shape, validation_X.shape\n", "print train_y.shape, validation_y.shape" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "['atype_Binary', 'atype_Categorical', 'atype_Numerical', 'avalue', 'btype_Binary', 'btype_Categorical', 'btype_Numerical', 'bvalue', 'dtarget', 'pearson_avalue_vs_bvalue', 'pearson_pvalue_avalue_vs_bvalue', 'target']\n", "(7831, 10) (7831,)\n", "(5481, 10) (2350, 10)\n", "(5481,) (2350,)\n" ] } ], "prompt_number": 45 }, { "cell_type": "code", "collapsed": false, "input": [ "## random forest\n", "from sklearn.ensemble import RandomForestClassifier\n", "rf = RandomForestClassifier(n_jobs = -1)\n", "rf.fit(train_X, train_y)\n", "## note the gap below: near-perfect train accuracy vs ~0.44 validation accuracy\n", "print rf.score(train_X, train_y)\n", "print rf.score(validation_X, validation_y)\n", "## sanity check: the fitted forest does not perfectly reproduce the full data set\n", "np.all(rf.predict(X) == y)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.97409231892\n" ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.444255319149\n" ] }, { "output_type": "pyout", "prompt_number": 46, "text": [ "False" ] } ], "prompt_number": 46 },
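{ "cell_type": "markdown", "metadata": {}, "source": [ "The train/validation gap above suggests the forest mostly memorizes. A minimal sketch for inspecting which features it leans on, via the fitted model's `feature_importances_` attribute (assumes `rf`, `headers`, and `input_cols` from the cells above, and a scikit-learn version that populates `feature_importances_` after `fit`)." ] }, { "cell_type": "code", "collapsed": false, "input": [ "## sketch: rank the input features by random-forest importance\n", "feature_names = [headers[i] for i in input_cols]\n", "for name, imp in sorted(zip(feature_names, rf.feature_importances_), key=lambda p: -p[1]):\n", "    print '%-35s %.3f' % (name, imp)" ], "language": "python", "metadata": {}, "outputs": [] },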
svc\n", "from sklearn.svm import LinearSVC\n", "svc = LinearSVC()\n", "svc.fit(train_X, train_y)\n", "print svc.score(train_X, train_y)\n", "print svc.score(validation_X, validation_y)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.247400109469\n", "0.240425531915\n" ] } ], "prompt_number": 47 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 34 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }