{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "Week2_Assignment.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "qn8CLGQ1U_sw"
      },
      "source": [
        "\n",
        "Assignment 1 Part 1: Graph Class"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "00cUFtx6Sixf",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 50
        },
        "outputId": "be253fd7-1337-4755-e4f3-f66b042c2b79"
      },
      "source": [
        "graph_elements = { \"Apple\" : [\"Banana\",\"Cherry\"],\n",
        "                \"Banana\" : [\"Apple\", \"Durian\"],\n",
        "                \"Cherry\" : [\"Apple\", \"Durian\"],\n",
        "                \"Durian\" : [\"Orange\"],\n",
        "                \"Orange\" : [\"Durian\"]\n",
        "                }\n",
        "\n",
        "class graph:\n",
        "\n",
        "    def __init__(self, value=None):\n",
        "      self.value = value\n",
        "\n",
        "    def getVertices(self):\n",
        "      return list(self.value.keys())\n",
        "\n",
        "    def getEdges(self):\n",
        "      return self.edges()\n",
        "\n",
        "    def edges(self):\n",
        "        edgelist = []\n",
        "        for vertex in self.value:\n",
        "            for neighbor in self.value[vertex]:\n",
        "                if {neighbor, vertex} not in edgelist:\n",
        "                    edgelist.append({vertex, neighbor})\n",
        "        return edgelist\n",
        "\n",
        "g = graph(graph_elements)\n",
        "\n",
        "\n",
        "print(g.getVertices())\n",
        "print(g.getEdges())"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "['Apple', 'Banana', 'Cherry', 'Durian', 'Orange']\n",
            "[{'Apple', 'Banana'}, {'Cherry', 'Apple'}, {'Durian', 'Banana'}, {'Cherry', 'Durian'}, {'Orange', 'Durian'}]\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "QzmgYXn5VVk1"
      },
      "source": [
        "Depth First Traversal of a graph"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "4wh5icuWSkt8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "37d45899-4560-4535-807f-0f8713b8cec1"
      },
      "source": [
        "def depth_first_search(graph, start, visited=None):\n",
        "  if visited is None:\n",
        "    visited = set()\n",
        "  visited.add(start)\n",
        "\n",
        "  #print(start)\n",
        "\n",
        "  for next in graph[start] - visited:\n",
        "    depth_first_search(graph, next, visited)\n",
        "  return visited\n",
        "\n",
        "graph = {'Apple': set(['Banana', 'Cherry']),\n",
        "         'Banana': set(['Apple', 'Durian', 'Orange']),\n",
        "         'Cherry': set(['Apple']),\n",
        "         'Durian': set(['Banana', 'Orange']),\n",
        "         'Orange': set(['Cherry', 'Durian'])\n",
        "         }\n",
        "\n",
        "\n",
        "depth_first_search(graph, 'Banana')"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "{'Apple', 'Banana', 'Cherry', 'Durian', 'Orange'}"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 102
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "90iwo9W1Wanw"
      },
      "source": [
        "Breadth-First Traversal of a Graph"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_QEoUEm-SkhT",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "c283fe39-4004-4770-a3bc-3b80a1102dbb"
      },
      "source": [
        "import collections\n",
        "\n",
        "def breadth_first_search(graph, root):\n",
        "\n",
        "    visited, queue = set(), collections.deque([root])\n",
        "    visited.add(root)\n",
        "\n",
        "    while queue:\n",
        "\n",
        "        vertex = queue.popleft()\n",
        "        print(str(vertex) + \" \", end=\"\")\n",
        "\n",
        "        for neighbour in graph[vertex]:\n",
        "            if neighbour not in visited:\n",
        "                visited.add(neighbour)\n",
        "                queue.append(neighbour)\n",
        "\n",
        "\n",
        "if __name__ == '__main__':\n",
        "\n",
        "    graph = {0: [1, 2], \n",
        "             1: [0, 2], \n",
        "             2: [3], \n",
        "             3: [1, 2]\n",
        "             }\n",
        "\n",
        "\n",
        "breadth_first_search(graph, 1)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "1 0 2 3 "
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "nHtTTpI5SkHs"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "_DAwWMYGWwcW"
      },
      "source": [
        "Assignment 1 Part 2: Election Data Search"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XytUX-nemkr0"
      },
      "source": [
        "import pandas as pd"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "k0V_IGLnl7P7",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 84
        },
        "outputId": "e6919695-6c1f-4631-c315-2967cd1b2d02"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/data/')\n",
        "data_dir = '/data/My Drive/Colab Notebooks/FEC dataset'\n",
        "!ls '/data/My Drive/Colab Notebooks/FEC dataset'"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Drive already mounted at /data/; to attempt to forcibly remount, call drive.mount(\"/data/\", force_remount=True).\n",
            "ccl20.zip\t     cm_header_file.csv  pas220.zip\n",
            "ccl_header_file.csv  cn20.zip\t\t pas2_header_file.csv\n",
            "cm20.zip\t     cn_header_file.csv\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "pl-0E8cE-hGy"
      },
      "source": [
        "Search on 'CN20.zip' folder"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zw1KZ0NmmWQw",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 400
        },
        "outputId": "cfb24d9f-b184-42d8-9b4d-5c5ad20b32b7"
      },
      "source": [
        "from zipfile import ZipFile\n",
        "\n",
        "header = pd.read_csv(data_dir+'/cn_header_file.csv')\n",
        "\n",
        "with ZipFile(data_dir+'/cn20.zip') as zip:\n",
        "  candidates = pd.read_csv(zip.open('cn.txt'), sep='|', names=header.columns)\n",
        "candidates.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_PTY_AFFILIATION</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>CAND_OFFICE_ST</th>\n",
              "      <th>CAND_OFFICE</th>\n",
              "      <th>CAND_OFFICE_DISTRICT</th>\n",
              "      <th>CAND_ICI</th>\n",
              "      <th>CAND_STATUS</th>\n",
              "      <th>CAND_PCC</th>\n",
              "      <th>CAND_ST1</th>\n",
              "      <th>CAND_ST2</th>\n",
              "      <th>CAND_CITY</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CAND_ZIP</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>H0AK00105</td>\n",
              "      <td>LAMB, THOMAS</td>\n",
              "      <td>NNE</td>\n",
              "      <td>2020</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00607515</td>\n",
              "      <td>1861 W LAKE LUCILLE DR</td>\n",
              "      <td>NaN</td>\n",
              "      <td>WASILLA</td>\n",
              "      <td>AK</td>\n",
              "      <td>99654.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>H0AK00113</td>\n",
              "      <td>TUGATUK, RAY SEAN</td>\n",
              "      <td>DEM</td>\n",
              "      <td>2020</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>PO BOX 172</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MANAKOTAK</td>\n",
              "      <td>AK</td>\n",
              "      <td>99628.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>H0AK01046</td>\n",
              "      <td>CATALANO, THOMAS</td>\n",
              "      <td>OTH</td>\n",
              "      <td>2020</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>188 WEST NORTHERN LIGHTS BOULEVARD</td>\n",
              "      <td>NaN</td>\n",
              "      <td>ANCHORAGE</td>\n",
              "      <td>AK</td>\n",
              "      <td>99503.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>H0AL01055</td>\n",
              "      <td>CARL, JERRY LEE, JR</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00697789</td>\n",
              "      <td>PO BOX 852138</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MOBILE</td>\n",
              "      <td>AL</td>\n",
              "      <td>36685.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>H0AL01063</td>\n",
              "      <td>LAMBERT, DOUGLAS WESTLEY III</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00701557</td>\n",
              "      <td>7194 STILLWATER BLVD</td>\n",
              "      <td>NaN</td>\n",
              "      <td>SPANISH FORT</td>\n",
              "      <td>AL</td>\n",
              "      <td>36527.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CAND_ID                     CAND_NAME  ... CAND_ST  CAND_ZIP\n",
              "0  H0AK00105                  LAMB, THOMAS  ...      AK   99654.0\n",
              "1  H0AK00113             TUGATUK, RAY SEAN  ...      AK   99628.0\n",
              "2  H0AK01046              CATALANO, THOMAS  ...      AK   99503.0\n",
              "3  H0AL01055           CARL, JERRY LEE, JR  ...      AL   36685.0\n",
              "4  H0AL01063  LAMBERT, DOUGLAS WESTLEY III  ...      AL   36527.0\n",
              "\n",
              "[5 rows x 15 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 106
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "CZ2y9RTAnTzU",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 333
        },
        "outputId": "d698ba69-e8bf-4db5-b657-c79efd645bdf"
      },
      "source": [
        "candidates[candidates['CAND_NAME'].str.contains('WALKER')].head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_PTY_AFFILIATION</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>CAND_OFFICE_ST</th>\n",
              "      <th>CAND_OFFICE</th>\n",
              "      <th>CAND_OFFICE_DISTRICT</th>\n",
              "      <th>CAND_ICI</th>\n",
              "      <th>CAND_STATUS</th>\n",
              "      <th>CAND_PCC</th>\n",
              "      <th>CAND_ST1</th>\n",
              "      <th>CAND_ST2</th>\n",
              "      <th>CAND_CITY</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CAND_ZIP</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>97</th>\n",
              "      <td>H0AZ09054</td>\n",
              "      <td>WALKER, JASON S</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AZ</td>\n",
              "      <td>H</td>\n",
              "      <td>9.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>4815 E. THOMAS RD.</td>\n",
              "      <td>W224</td>\n",
              "      <td>PHOENIX</td>\n",
              "      <td>AZ</td>\n",
              "      <td>85018.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>848</th>\n",
              "      <td>H0IL15103</td>\n",
              "      <td>WALKER, ALEX</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>IL</td>\n",
              "      <td>H</td>\n",
              "      <td>15.0</td>\n",
              "      <td>O</td>\n",
              "      <td>N</td>\n",
              "      <td>C00717967</td>\n",
              "      <td>2606 CHARLESTON AVE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MATTOON</td>\n",
              "      <td>IL</td>\n",
              "      <td>61938.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1009</th>\n",
              "      <td>H0MA04168</td>\n",
              "      <td>GROSSMAN, REBECCA WALKER</td>\n",
              "      <td>DEM</td>\n",
              "      <td>2020</td>\n",
              "      <td>MA</td>\n",
              "      <td>H</td>\n",
              "      <td>4.0</td>\n",
              "      <td>C</td>\n",
              "      <td>C</td>\n",
              "      <td>C00720482</td>\n",
              "      <td>PO BOX 590686</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NEWTON CENTER</td>\n",
              "      <td>MA</td>\n",
              "      <td>2459.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1391</th>\n",
              "      <td>H0NC08247</td>\n",
              "      <td>WALKER, RODNEY L</td>\n",
              "      <td>DEM</td>\n",
              "      <td>2020</td>\n",
              "      <td>NC</td>\n",
              "      <td>H</td>\n",
              "      <td>8.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00707877</td>\n",
              "      <td>PO BOX 712</td>\n",
              "      <td>NaN</td>\n",
              "      <td>SOUTHERN PINES</td>\n",
              "      <td>NC</td>\n",
              "      <td>28388.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1574</th>\n",
              "      <td>H0NV03108</td>\n",
              "      <td>WALKERLIEB, ZACHARY</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>NV</td>\n",
              "      <td>H</td>\n",
              "      <td>3.0</td>\n",
              "      <td>C</td>\n",
              "      <td>C</td>\n",
              "      <td>C00703025</td>\n",
              "      <td>5887 GLORY HEIGHTS DR</td>\n",
              "      <td>NaN</td>\n",
              "      <td>LAS VEGAS</td>\n",
              "      <td>NV</td>\n",
              "      <td>89135.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "        CAND_ID                 CAND_NAME  ... CAND_ST  CAND_ZIP\n",
              "97    H0AZ09054           WALKER, JASON S  ...      AZ   85018.0\n",
              "848   H0IL15103              WALKER, ALEX  ...      IL   61938.0\n",
              "1009  H0MA04168  GROSSMAN, REBECCA WALKER  ...      MA    2459.0\n",
              "1391  H0NC08247          WALKER, RODNEY L  ...      NC   28388.0\n",
              "1574  H0NV03108       WALKERLIEB, ZACHARY  ...      NV   89135.0\n",
              "\n",
              "[5 rows x 15 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 107
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tdUayozq-9Cb",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 417
        },
        "outputId": "a26fa3cc-48cc-48aa-aef1-3195d29110a7"
      },
      "source": [
        "tg = candidates[(candidates['CAND_ELECTION_YR'] == 2020) & (candidates['CAND_OFFICE_ST'] == 'FL')]\n",
        "tg.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_PTY_AFFILIATION</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>CAND_OFFICE_ST</th>\n",
              "      <th>CAND_OFFICE</th>\n",
              "      <th>CAND_OFFICE_DISTRICT</th>\n",
              "      <th>CAND_ICI</th>\n",
              "      <th>CAND_STATUS</th>\n",
              "      <th>CAND_PCC</th>\n",
              "      <th>CAND_ST1</th>\n",
              "      <th>CAND_ST2</th>\n",
              "      <th>CAND_CITY</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CAND_ZIP</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>437</th>\n",
              "      <td>H0FL01112</td>\n",
              "      <td>ROSAS, EMILY</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>FL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00664300</td>\n",
              "      <td>6610 JENNIFER DRIVE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>TEMPLE TERRACE</td>\n",
              "      <td>FL</td>\n",
              "      <td>33617.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>438</th>\n",
              "      <td>H0FL01120</td>\n",
              "      <td>ROMAGNANO, CHASE ANDERSON ANDY REV.</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>FL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00719351</td>\n",
              "      <td>P.O. BOX 9328</td>\n",
              "      <td>NaN</td>\n",
              "      <td>PENSACOLA</td>\n",
              "      <td>FL</td>\n",
              "      <td>32513.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>439</th>\n",
              "      <td>H0FL01138</td>\n",
              "      <td>HOWARD, CHERYL</td>\n",
              "      <td>DEM</td>\n",
              "      <td>2020</td>\n",
              "      <td>FL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00735977</td>\n",
              "      <td>605 CROWN COVE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>PENSACOLA</td>\n",
              "      <td>FL</td>\n",
              "      <td>32502.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>440</th>\n",
              "      <td>H0FL01146</td>\n",
              "      <td>MERK, GREGORY CHARLES</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>FL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>C</td>\n",
              "      <td>C</td>\n",
              "      <td>C00740340</td>\n",
              "      <td>4661 CALLE VENTOSO</td>\n",
              "      <td>NaN</td>\n",
              "      <td>PENSACOLA</td>\n",
              "      <td>FL</td>\n",
              "      <td>32514.0</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>441</th>\n",
              "      <td>H0FL01153</td>\n",
              "      <td>ORAM, ALBERT</td>\n",
              "      <td>NPA</td>\n",
              "      <td>2020</td>\n",
              "      <td>FL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>C</td>\n",
              "      <td>C</td>\n",
              "      <td>C00746636</td>\n",
              "      <td>825 BAYSHORE DR APT 804</td>\n",
              "      <td>NaN</td>\n",
              "      <td>PENSACOLA</td>\n",
              "      <td>FL</td>\n",
              "      <td>32507.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       CAND_ID                            CAND_NAME  ... CAND_ST  CAND_ZIP\n",
              "437  H0FL01112                         ROSAS, EMILY  ...      FL   33617.0\n",
              "438  H0FL01120  ROMAGNANO, CHASE ANDERSON ANDY REV.  ...      FL   32513.0\n",
              "439  H0FL01138                       HOWARD, CHERYL  ...      FL   32502.0\n",
              "440  H0FL01146                MERK, GREGORY CHARLES  ...      FL   32514.0\n",
              "441  H0FL01153                         ORAM, ALBERT  ...      FL   32507.0\n",
              "\n",
              "[5 rows x 15 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 108
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ytvXx2wHCvs1",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "f379bdaf-43ee-4446-a7e5-1c4302a9ec9c"
      },
      "source": [
        "len(tg.index)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "242"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 109
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "mymZWrEypYla",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 130
        },
        "outputId": "6f97a11e-f339-4022-fd05-136583e1ab0f"
      },
      "source": [
        "candidates[candidates['CAND_NAME'].str.contains('TRUMP, DONALD')]"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_PTY_AFFILIATION</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>CAND_OFFICE_ST</th>\n",
              "      <th>CAND_OFFICE</th>\n",
              "      <th>CAND_OFFICE_DISTRICT</th>\n",
              "      <th>CAND_ICI</th>\n",
              "      <th>CAND_STATUS</th>\n",
              "      <th>CAND_PCC</th>\n",
              "      <th>CAND_ST1</th>\n",
              "      <th>CAND_ST2</th>\n",
              "      <th>CAND_CITY</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CAND_ZIP</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>6403</th>\n",
              "      <td>P80001571</td>\n",
              "      <td>TRUMP, DONALD J.</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>US</td>\n",
              "      <td>P</td>\n",
              "      <td>0.0</td>\n",
              "      <td>I</td>\n",
              "      <td>C</td>\n",
              "      <td>C00580100</td>\n",
              "      <td>725 FIFTH AVENUE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NEW YORK</td>\n",
              "      <td>NY</td>\n",
              "      <td>10022.0</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "        CAND_ID         CAND_NAME  ... CAND_ST  CAND_ZIP\n",
              "6403  P80001571  TRUMP, DONALD J.  ...      NY   10022.0\n",
              "\n",
              "[1 rows x 15 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 110
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "AoFXVna2--1J"
      },
      "source": [
        "Search on 'Pas220.zip' folder"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wIKz_EIYiHLj",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 400
        },
        "outputId": "1bb2e9a1-c58c-460c-e64d-2f73e34c291f"
      },
      "source": [
        "header = pd.read_csv(data_dir+'/pas2_header_file.csv')\n",
        "\n",
        "with ZipFile(data_dir+'/pas220.zip') as zip:\n",
        "  #print(zip.namelist())\n",
        "  spending = pd.read_csv(zip.open('itpas2.txt'), sep='|', names=header.columns)\n",
        "spending.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py:2718: DtypeWarning: Columns (10,11,12) have mixed types.Specify dtype option on import or set low_memory=False.\n",
            "  interactivity=interactivity, compiler=compiler, result=result)\n"
          ],
          "name": "stderr"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>AMNDT_IND</th>\n",
              "      <th>RPT_TP</th>\n",
              "      <th>TRANSACTION_PGI</th>\n",
              "      <th>IMAGE_NUM</th>\n",
              "      <th>TRANSACTION_TP</th>\n",
              "      <th>ENTITY_TP</th>\n",
              "      <th>NAME</th>\n",
              "      <th>CITY</th>\n",
              "      <th>STATE</th>\n",
              "      <th>ZIP_CODE</th>\n",
              "      <th>EMPLOYER</th>\n",
              "      <th>OCCUPATION</th>\n",
              "      <th>TRANSACTION_DT</th>\n",
              "      <th>TRANSACTION_AMT</th>\n",
              "      <th>OTHER_ID</th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>TRAN_ID</th>\n",
              "      <th>FILE_NUM</th>\n",
              "      <th>MEMO_CD</th>\n",
              "      <th>MEMO_TEXT</th>\n",
              "      <th>SUB_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>C00567180</td>\n",
              "      <td>T</td>\n",
              "      <td>TER</td>\n",
              "      <td>P2020</td>\n",
              "      <td>201901099143774199</td>\n",
              "      <td>24K</td>\n",
              "      <td>PAC</td>\n",
              "      <td>TED YOHO FOR CONGRESS</td>\n",
              "      <td>GAINESVILLE</td>\n",
              "      <td>FL</td>\n",
              "      <td>32608</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1082019.0</td>\n",
              "      <td>1880</td>\n",
              "      <td>C00494583</td>\n",
              "      <td>H2FL06109</td>\n",
              "      <td>SB23.4447</td>\n",
              "      <td>1303604</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>4021920191640423011</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>C00104885</td>\n",
              "      <td>A</td>\n",
              "      <td>TER</td>\n",
              "      <td>G2020</td>\n",
              "      <td>201901289144031511</td>\n",
              "      <td>24K</td>\n",
              "      <td>CCM</td>\n",
              "      <td>TEAM GRAHAM INC</td>\n",
              "      <td>COLUMBIA</td>\n",
              "      <td>SC</td>\n",
              "      <td>29202</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>12202018.0</td>\n",
              "      <td>3000</td>\n",
              "      <td>C00458828</td>\n",
              "      <td>H4SC03087</td>\n",
              "      <td>SB23.17757</td>\n",
              "      <td>1307636</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>4022220191643444985</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>C00104885</td>\n",
              "      <td>A</td>\n",
              "      <td>TER</td>\n",
              "      <td>P2022</td>\n",
              "      <td>201901289144031512</td>\n",
              "      <td>24K</td>\n",
              "      <td>CCM</td>\n",
              "      <td>TIM SCOTT FOR SENATE</td>\n",
              "      <td>CHARLESTON</td>\n",
              "      <td>SC</td>\n",
              "      <td>29407</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>12202018.0</td>\n",
              "      <td>1000</td>\n",
              "      <td>C00540302</td>\n",
              "      <td>H0SC01279</td>\n",
              "      <td>SB23.17756</td>\n",
              "      <td>1307636</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>4022220191643444987</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>C00104885</td>\n",
              "      <td>A</td>\n",
              "      <td>TER</td>\n",
              "      <td>P2020</td>\n",
              "      <td>201901289144031511</td>\n",
              "      <td>24K</td>\n",
              "      <td>CCM</td>\n",
              "      <td>FRIENDS OF JIM CLYBURN</td>\n",
              "      <td>COLUMBIA</td>\n",
              "      <td>SC</td>\n",
              "      <td>29211</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>1072019.0</td>\n",
              "      <td>470</td>\n",
              "      <td>C00255562</td>\n",
              "      <td>H2SC02042</td>\n",
              "      <td>SB23.17755</td>\n",
              "      <td>1307636</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>4022220191643444981</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>C00688408</td>\n",
              "      <td>T</td>\n",
              "      <td>TER</td>\n",
              "      <td>G2018</td>\n",
              "      <td>201901319144305867</td>\n",
              "      <td>24E</td>\n",
              "      <td>ORG</td>\n",
              "      <td>JACKSON ADVOCATE</td>\n",
              "      <td>JACKSON</td>\n",
              "      <td>MS</td>\n",
              "      <td>39207</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>12122018.0</td>\n",
              "      <td>1000</td>\n",
              "      <td>S8MS00287</td>\n",
              "      <td>S8MS00287</td>\n",
              "      <td>SE.4233</td>\n",
              "      <td>1310906</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "      <td>4022420191643632157</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CMTE_ID AMNDT_IND RPT_TP  ... MEMO_CD  MEMO_TEXT               SUB_ID\n",
              "0  C00567180         T    TER  ...     NaN        NaN  4021920191640423011\n",
              "1  C00104885         A    TER  ...     NaN        NaN  4022220191643444985\n",
              "2  C00104885         A    TER  ...     NaN        NaN  4022220191643444987\n",
              "3  C00104885         A    TER  ...     NaN        NaN  4022220191643444981\n",
              "4  C00688408         T    TER  ...     NaN        NaN  4022420191643632157\n",
              "\n",
              "[5 rows x 22 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 111
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "cJReRZAi7RSR",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 195
        },
        "outputId": "8f41dac7-2e34-4db1-b661-2b46ab2debab"
      },
      "source": [
        "house_spending_FL = spending[(spending['CAND_ID'].str.startswith('H')==True) & (spending['STATE']=='FL')]\n",
        "df_house = house_spending_FL[['CMTE_ID', 'NAME', 'STATE', 'TRANSACTION_AMT', 'CAND_ID']]\n",
        "df_house.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>NAME</th>\n",
              "      <th>STATE</th>\n",
              "      <th>TRANSACTION_AMT</th>\n",
              "      <th>CAND_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>C00567180</td>\n",
              "      <td>TED YOHO FOR CONGRESS</td>\n",
              "      <td>FL</td>\n",
              "      <td>1880</td>\n",
              "      <td>H2FL06109</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>154</th>\n",
              "      <td>C00027342</td>\n",
              "      <td>HASTINGS FOR CONGRESS</td>\n",
              "      <td>FL</td>\n",
              "      <td>5000</td>\n",
              "      <td>H2FL23021</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>195</th>\n",
              "      <td>C00003855</td>\n",
              "      <td>DONNA SHALALA FOR CONGRESS</td>\n",
              "      <td>FL</td>\n",
              "      <td>1000</td>\n",
              "      <td>H8FL27193</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>256</th>\n",
              "      <td>C00339655</td>\n",
              "      <td>VERN BUCHANAN FOR CONGRESS</td>\n",
              "      <td>FL</td>\n",
              "      <td>5000</td>\n",
              "      <td>H6FL13148</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>421</th>\n",
              "      <td>C00035204</td>\n",
              "      <td>BILIRAKIS FOR CONGRESS</td>\n",
              "      <td>FL</td>\n",
              "      <td>1000</td>\n",
              "      <td>H6FL09070</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       CMTE_ID                        NAME STATE  TRANSACTION_AMT    CAND_ID\n",
              "0    C00567180       TED YOHO FOR CONGRESS    FL             1880  H2FL06109\n",
              "154  C00027342       HASTINGS FOR CONGRESS    FL             5000  H2FL23021\n",
              "195  C00003855  DONNA SHALALA FOR CONGRESS    FL             1000  H8FL27193\n",
              "256  C00339655  VERN BUCHANAN FOR CONGRESS    FL             5000  H6FL13148\n",
              "421  C00035204      BILIRAKIS FOR CONGRESS    FL             1000  H6FL09070"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 112
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "L-O7kKyr_OkU"
      },
      "source": [
        "Search on 'CM20.zip' folder"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "UX-S1tm7nOo2",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 518
        },
        "outputId": "145959fb-bb59-4e7a-c6e1-31f3d64148c5"
      },
      "source": [
        "header = pd.read_csv(data_dir+'/cm_header_file.csv')\n",
        "\n",
        "with ZipFile(data_dir+'/cm20.zip') as zip:\n",
        "  #print(zip.namelist())\n",
        "  df = pd.read_csv(zip.open('cm.txt'), sep='|', names=header.columns)\n",
        "df.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_NM</th>\n",
              "      <th>TRES_NM</th>\n",
              "      <th>CMTE_ST1</th>\n",
              "      <th>CMTE_ST2</th>\n",
              "      <th>CMTE_CITY</th>\n",
              "      <th>CMTE_ST</th>\n",
              "      <th>CMTE_ZIP</th>\n",
              "      <th>CMTE_DSGN</th>\n",
              "      <th>CMTE_TP</th>\n",
              "      <th>CMTE_PTY_AFFILIATION</th>\n",
              "      <th>CMTE_FILING_FREQ</th>\n",
              "      <th>ORG_TP</th>\n",
              "      <th>CONNECTED_ORG_NM</th>\n",
              "      <th>CAND_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>C00000059</td>\n",
              "      <td>HALLMARK CARDS PAC</td>\n",
              "      <td>SARAH MOE</td>\n",
              "      <td>2501 MCGEE</td>\n",
              "      <td>MD #500</td>\n",
              "      <td>KANSAS CITY</td>\n",
              "      <td>MO</td>\n",
              "      <td>64108</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>UNK</td>\n",
              "      <td>M</td>\n",
              "      <td>C</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>C00000422</td>\n",
              "      <td>AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...</td>\n",
              "      <td>WALKER, KEVIN MR.</td>\n",
              "      <td>25 MASSACHUSETTS AVE, NW</td>\n",
              "      <td>SUITE 600</td>\n",
              "      <td>WASHINGTON</td>\n",
              "      <td>DC</td>\n",
              "      <td>200017400</td>\n",
              "      <td>B</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>M</td>\n",
              "      <td>M</td>\n",
              "      <td>DELAWARE MEDICAL PAC</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>C00000489</td>\n",
              "      <td>D R I V E POLITICAL FUND CHAPTER 886</td>\n",
              "      <td>JERRY SIMS JR</td>\n",
              "      <td>3528 W RENO</td>\n",
              "      <td>NaN</td>\n",
              "      <td>OKLAHOMA CITY</td>\n",
              "      <td>OK</td>\n",
              "      <td>73107</td>\n",
              "      <td>U</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Q</td>\n",
              "      <td>L</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>C00000547</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY POLITICAL ACTION COMMITTEE</td>\n",
              "      <td>JERRY SLAUGHTER</td>\n",
              "      <td>623 SW 10TH AVE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>TOPEKA</td>\n",
              "      <td>KS</td>\n",
              "      <td>666121627</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>UNK</td>\n",
              "      <td>Q</td>\n",
              "      <td>M</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>C00000638</td>\n",
              "      <td>INDIANA STATE MEDICAL ASSOCIATION POLITICAL AC...</td>\n",
              "      <td>ACHENBACH, GRANT MR.</td>\n",
              "      <td>322 CANAL WALK, CANAL LEVEL</td>\n",
              "      <td>NaN</td>\n",
              "      <td>INDIANAPOLIS</td>\n",
              "      <td>IN</td>\n",
              "      <td>46202</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>T</td>\n",
              "      <td>M</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CMTE_ID  ... CAND_ID\n",
              "0  C00000059  ...     NaN\n",
              "1  C00000422  ...     NaN\n",
              "2  C00000489  ...     NaN\n",
              "3  C00000547  ...     NaN\n",
              "4  C00000638  ...     NaN\n",
              "\n",
              "[5 rows x 15 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 113
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Gz5FKlS11ZGQ",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 195
        },
        "outputId": "caef84a1-84e2-4340-d070-4103622b12fa"
      },
      "source": [
        "df_name = df[['CMTE_ID', 'CMTE_NM']]\n",
        "df_name.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_NM</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>C00000059</td>\n",
              "      <td>HALLMARK CARDS PAC</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>C00000422</td>\n",
              "      <td>AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>C00000489</td>\n",
              "      <td>D R I V E POLITICAL FUND CHAPTER 886</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>C00000547</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY POLITICAL ACTION COMMITTEE</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>C00000638</td>\n",
              "      <td>INDIANA STATE MEDICAL ASSOCIATION POLITICAL AC...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CMTE_ID                                            CMTE_NM\n",
              "0  C00000059                                 HALLMARK CARDS PAC\n",
              "1  C00000422  AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...\n",
              "2  C00000489               D R I V E POLITICAL FUND CHAPTER 886\n",
              "3  C00000547  KANSAS MEDICAL SOCIETY POLITICAL ACTION COMMITTEE\n",
              "4  C00000638  INDIANA STATE MEDICAL ASSOCIATION POLITICAL AC..."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 114
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MpE777-JE3Gt",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 618
        },
        "outputId": "4d62b2f1-8c4e-4678-df9b-b028eeda0ea3"
      },
      "source": [
        "Society_Organization = df[df['CONNECTED_ORG_NM'].str.contains('SOCIETY', na=False)]\n",
        "Society_Organization.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_NM</th>\n",
              "      <th>TRES_NM</th>\n",
              "      <th>CMTE_ST1</th>\n",
              "      <th>CMTE_ST2</th>\n",
              "      <th>CMTE_CITY</th>\n",
              "      <th>CMTE_ST</th>\n",
              "      <th>CMTE_ZIP</th>\n",
              "      <th>CMTE_DSGN</th>\n",
              "      <th>CMTE_TP</th>\n",
              "      <th>CMTE_PTY_AFFILIATION</th>\n",
              "      <th>CMTE_FILING_FREQ</th>\n",
              "      <th>ORG_TP</th>\n",
              "      <th>CONNECTED_ORG_NM</th>\n",
              "      <th>CAND_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>C00000547</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY POLITICAL ACTION COMMITTEE</td>\n",
              "      <td>JERRY SLAUGHTER</td>\n",
              "      <td>623 SW 10TH AVE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>TOPEKA</td>\n",
              "      <td>KS</td>\n",
              "      <td>666121627</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>UNK</td>\n",
              "      <td>Q</td>\n",
              "      <td>M</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>11</th>\n",
              "      <td>C00001180</td>\n",
              "      <td>MICHIGAN DOCTORS POLITICAL ACTION COMMITTEE - ...</td>\n",
              "      <td>GHOSE, AMIT</td>\n",
              "      <td>P.O. BOX 769</td>\n",
              "      <td>NaN</td>\n",
              "      <td>EAST LANSING</td>\n",
              "      <td>MI</td>\n",
              "      <td>48826</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>NNE</td>\n",
              "      <td>Q</td>\n",
              "      <td>T</td>\n",
              "      <td>MICHIGAN STATE MEDICAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>37</th>\n",
              "      <td>C00003152</td>\n",
              "      <td>NORTH CAROLINA MEDICAL SOCIETY FEDERAL POLITIC...</td>\n",
              "      <td>HAYES, E. REBECCA</td>\n",
              "      <td>PO BOX 25834</td>\n",
              "      <td>222 N. PERSON STREET</td>\n",
              "      <td>RALEIGH</td>\n",
              "      <td>NC</td>\n",
              "      <td>27611</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Q</td>\n",
              "      <td>M</td>\n",
              "      <td>NORTH CAROLINA MEDICAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>51</th>\n",
              "      <td>C00003970</td>\n",
              "      <td>IOWA MEDICAL SOCIETY POLITICAL ACTION COMMITTEE</td>\n",
              "      <td>DOOLEY, JOHN B</td>\n",
              "      <td>515 E LOCUST STREET</td>\n",
              "      <td>SUITE 400</td>\n",
              "      <td>DES MOINES</td>\n",
              "      <td>IA</td>\n",
              "      <td>50309</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Q</td>\n",
              "      <td>M</td>\n",
              "      <td>IOWA MEDICAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>136</th>\n",
              "      <td>C00012211</td>\n",
              "      <td>NORTH CAROLINA DENTAL SOCIETY PAC</td>\n",
              "      <td>BROWN, EVELYN M. DR.</td>\n",
              "      <td>1600 EVANS ROAD</td>\n",
              "      <td>NaN</td>\n",
              "      <td>CARY</td>\n",
              "      <td>NC</td>\n",
              "      <td>27513</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Q</td>\n",
              "      <td>M</td>\n",
              "      <td>NORTH CAROLINA DENTAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       CMTE_ID  ... CAND_ID\n",
              "3    C00000547  ...     NaN\n",
              "11   C00001180  ...     NaN\n",
              "37   C00003152  ...     NaN\n",
              "51   C00003970  ...     NaN\n",
              "136  C00012211  ...     NaN\n",
              "\n",
              "[5 rows x 15 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 115
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GErXTSsJGOV-",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "f2899955-e0ec-47b2-9310-24ba8fa7def4"
      },
      "source": [
        "len(Society_Organization.index)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "28"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 116
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "drRP2x2Z_lZS"
      },
      "source": [
        "Search on 'CCL20.zip' folder"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ombc-hjIn0KR",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 195
        },
        "outputId": "20885340-ea71-466c-e66d-ba3684884989"
      },
      "source": [
        "header = pd.read_csv(data_dir+'/ccl_header_file.csv')\n",
        "\n",
        "with ZipFile(data_dir+'/ccl20.zip') as zip:\n",
        "  #print(zip.namelist())\n",
        "  linkage = pd.read_csv(zip.open('ccl.txt'), sep='|', names=header.columns)\n",
        "\n",
        "linkage.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>FEC_ELECTION_YR</th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_TP</th>\n",
              "      <th>CMTE_DSGN</th>\n",
              "      <th>LINKAGE_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>C00713602</td>\n",
              "      <td>2019</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00712851</td>\n",
              "      <td>O</td>\n",
              "      <td>U</td>\n",
              "      <td>228963</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>H0AK00105</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00607515</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>229250</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>H0AL01055</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00697789</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>226125</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>H0AL01063</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00701557</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227053</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>H0AL01071</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00701409</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227054</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CAND_ID  CAND_ELECTION_YR  FEC_ELECTION_YR  ... CMTE_TP CMTE_DSGN LINKAGE_ID\n",
              "0  C00713602              2019             2020  ...       O         U     228963\n",
              "1  H0AK00105              2020             2020  ...       H         P     229250\n",
              "2  H0AL01055              2020             2020  ...       H         P     226125\n",
              "3  H0AL01063              2020             2020  ...       H         P     227053\n",
              "4  H0AL01071              2020             2020  ...       H         P     227054\n",
              "\n",
              "[5 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 117
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vb8dIlKKE_4m"
      },
      "source": [
        "df1 = pd.concat([candidates, linkage, df], axis=1, sort=False).reset_index()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yEgZmnNHT2xw",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 534
        },
        "outputId": "d25d1e93-80a7-4ebd-a382-97460212b7ae"
      },
      "source": [
        "df1.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>index</th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_PTY_AFFILIATION</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>CAND_OFFICE_ST</th>\n",
              "      <th>CAND_OFFICE</th>\n",
              "      <th>CAND_OFFICE_DISTRICT</th>\n",
              "      <th>CAND_ICI</th>\n",
              "      <th>CAND_STATUS</th>\n",
              "      <th>CAND_PCC</th>\n",
              "      <th>CAND_ST1</th>\n",
              "      <th>CAND_ST2</th>\n",
              "      <th>CAND_CITY</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CAND_ZIP</th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_ELECTION_YR</th>\n",
              "      <th>FEC_ELECTION_YR</th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_TP</th>\n",
              "      <th>CMTE_DSGN</th>\n",
              "      <th>LINKAGE_ID</th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_NM</th>\n",
              "      <th>TRES_NM</th>\n",
              "      <th>CMTE_ST1</th>\n",
              "      <th>CMTE_ST2</th>\n",
              "      <th>CMTE_CITY</th>\n",
              "      <th>CMTE_ST</th>\n",
              "      <th>CMTE_ZIP</th>\n",
              "      <th>CMTE_DSGN</th>\n",
              "      <th>CMTE_TP</th>\n",
              "      <th>CMTE_PTY_AFFILIATION</th>\n",
              "      <th>CMTE_FILING_FREQ</th>\n",
              "      <th>ORG_TP</th>\n",
              "      <th>CONNECTED_ORG_NM</th>\n",
              "      <th>CAND_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>H0AK00105</td>\n",
              "      <td>LAMB, THOMAS</td>\n",
              "      <td>NNE</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00607515</td>\n",
              "      <td>1861 W LAKE LUCILLE DR</td>\n",
              "      <td>NaN</td>\n",
              "      <td>WASILLA</td>\n",
              "      <td>AK</td>\n",
              "      <td>99654.0</td>\n",
              "      <td>C00713602</td>\n",
              "      <td>2019.0</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>C00712851</td>\n",
              "      <td>O</td>\n",
              "      <td>U</td>\n",
              "      <td>228963.0</td>\n",
              "      <td>C00000059</td>\n",
              "      <td>HALLMARK CARDS PAC</td>\n",
              "      <td>SARAH MOE</td>\n",
              "      <td>2501 MCGEE</td>\n",
              "      <td>MD #500</td>\n",
              "      <td>KANSAS CITY</td>\n",
              "      <td>MO</td>\n",
              "      <td>64108</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>UNK</td>\n",
              "      <td>M</td>\n",
              "      <td>C</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>H0AK00113</td>\n",
              "      <td>TUGATUK, RAY SEAN</td>\n",
              "      <td>DEM</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>PO BOX 172</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MANAKOTAK</td>\n",
              "      <td>AK</td>\n",
              "      <td>99628.0</td>\n",
              "      <td>H0AK00105</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>C00607515</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>229250.0</td>\n",
              "      <td>C00000422</td>\n",
              "      <td>AMERICAN MEDICAL ASSOCIATION POLITICAL ACTION ...</td>\n",
              "      <td>WALKER, KEVIN MR.</td>\n",
              "      <td>25 MASSACHUSETTS AVE, NW</td>\n",
              "      <td>SUITE 600</td>\n",
              "      <td>WASHINGTON</td>\n",
              "      <td>DC</td>\n",
              "      <td>200017400</td>\n",
              "      <td>B</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>M</td>\n",
              "      <td>M</td>\n",
              "      <td>DELAWARE MEDICAL PAC</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2</td>\n",
              "      <td>H0AK01046</td>\n",
              "      <td>CATALANO, THOMAS</td>\n",
              "      <td>OTH</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>NaN</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>188 WEST NORTHERN LIGHTS BOULEVARD</td>\n",
              "      <td>NaN</td>\n",
              "      <td>ANCHORAGE</td>\n",
              "      <td>AK</td>\n",
              "      <td>99503.0</td>\n",
              "      <td>H0AL01055</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>C00697789</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>226125.0</td>\n",
              "      <td>C00000489</td>\n",
              "      <td>D R I V E POLITICAL FUND CHAPTER 886</td>\n",
              "      <td>JERRY SIMS JR</td>\n",
              "      <td>3528 W RENO</td>\n",
              "      <td>NaN</td>\n",
              "      <td>OKLAHOMA CITY</td>\n",
              "      <td>OK</td>\n",
              "      <td>73107</td>\n",
              "      <td>U</td>\n",
              "      <td>N</td>\n",
              "      <td>NaN</td>\n",
              "      <td>Q</td>\n",
              "      <td>L</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3</td>\n",
              "      <td>H0AL01055</td>\n",
              "      <td>CARL, JERRY LEE, JR</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00697789</td>\n",
              "      <td>PO BOX 852138</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MOBILE</td>\n",
              "      <td>AL</td>\n",
              "      <td>36685.0</td>\n",
              "      <td>H0AL01063</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>C00701557</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227053.0</td>\n",
              "      <td>C00000547</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY POLITICAL ACTION COMMITTEE</td>\n",
              "      <td>JERRY SLAUGHTER</td>\n",
              "      <td>623 SW 10TH AVE</td>\n",
              "      <td>NaN</td>\n",
              "      <td>TOPEKA</td>\n",
              "      <td>KS</td>\n",
              "      <td>666121627</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>UNK</td>\n",
              "      <td>Q</td>\n",
              "      <td>M</td>\n",
              "      <td>KANSAS MEDICAL SOCIETY</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>4</td>\n",
              "      <td>H0AL01063</td>\n",
              "      <td>LAMBERT, DOUGLAS WESTLEY III</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00701557</td>\n",
              "      <td>7194 STILLWATER BLVD</td>\n",
              "      <td>NaN</td>\n",
              "      <td>SPANISH FORT</td>\n",
              "      <td>AL</td>\n",
              "      <td>36527.0</td>\n",
              "      <td>H0AL01071</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>2020.0</td>\n",
              "      <td>C00701409</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227054.0</td>\n",
              "      <td>C00000638</td>\n",
              "      <td>INDIANA STATE MEDICAL ASSOCIATION POLITICAL AC...</td>\n",
              "      <td>ACHENBACH, GRANT MR.</td>\n",
              "      <td>322 CANAL WALK, CANAL LEVEL</td>\n",
              "      <td>NaN</td>\n",
              "      <td>INDIANAPOLIS</td>\n",
              "      <td>IN</td>\n",
              "      <td>46202</td>\n",
              "      <td>U</td>\n",
              "      <td>Q</td>\n",
              "      <td>NaN</td>\n",
              "      <td>T</td>\n",
              "      <td>M</td>\n",
              "      <td>NaN</td>\n",
              "      <td>NaN</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   index    CAND_ID  ...        CONNECTED_ORG_NM CAND_ID\n",
              "0      0  H0AK00105  ...                     NaN     NaN\n",
              "1      1  H0AK00113  ...    DELAWARE MEDICAL PAC     NaN\n",
              "2      2  H0AK01046  ...                     NaN     NaN\n",
              "3      3  H0AL01055  ...  KANSAS MEDICAL SOCIETY     NaN\n",
              "4      4  H0AL01063  ...                     NaN     NaN\n",
              "\n",
              "[5 rows x 38 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 119
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "d_oy0b5jT8eY",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 333
        },
        "outputId": "6650d9ab-983e-4277-843d-157107978f50"
      },
      "source": [
        "df2 = pd.merge(candidates, linkage, on='CAND_ID')\n",
        "df2.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_PTY_AFFILIATION</th>\n",
              "      <th>CAND_ELECTION_YR_x</th>\n",
              "      <th>CAND_OFFICE_ST</th>\n",
              "      <th>CAND_OFFICE</th>\n",
              "      <th>CAND_OFFICE_DISTRICT</th>\n",
              "      <th>CAND_ICI</th>\n",
              "      <th>CAND_STATUS</th>\n",
              "      <th>CAND_PCC</th>\n",
              "      <th>CAND_ST1</th>\n",
              "      <th>CAND_ST2</th>\n",
              "      <th>CAND_CITY</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CAND_ZIP</th>\n",
              "      <th>CAND_ELECTION_YR_y</th>\n",
              "      <th>FEC_ELECTION_YR</th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>CMTE_TP</th>\n",
              "      <th>CMTE_DSGN</th>\n",
              "      <th>LINKAGE_ID</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>H0AK00105</td>\n",
              "      <td>LAMB, THOMAS</td>\n",
              "      <td>NNE</td>\n",
              "      <td>2020</td>\n",
              "      <td>AK</td>\n",
              "      <td>H</td>\n",
              "      <td>0.0</td>\n",
              "      <td>C</td>\n",
              "      <td>N</td>\n",
              "      <td>C00607515</td>\n",
              "      <td>1861 W LAKE LUCILLE DR</td>\n",
              "      <td>NaN</td>\n",
              "      <td>WASILLA</td>\n",
              "      <td>AK</td>\n",
              "      <td>99654.0</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00607515</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>229250</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>H0AL01055</td>\n",
              "      <td>CARL, JERRY LEE, JR</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00697789</td>\n",
              "      <td>PO BOX 852138</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MOBILE</td>\n",
              "      <td>AL</td>\n",
              "      <td>36685.0</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00697789</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>226125</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>H0AL01063</td>\n",
              "      <td>LAMBERT, DOUGLAS WESTLEY III</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00701557</td>\n",
              "      <td>7194 STILLWATER BLVD</td>\n",
              "      <td>NaN</td>\n",
              "      <td>SPANISH FORT</td>\n",
              "      <td>AL</td>\n",
              "      <td>36527.0</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00701557</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227053</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>H0AL01071</td>\n",
              "      <td>PRINGLE, CHRISTOPHER PAUL</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00701409</td>\n",
              "      <td>202 GOVERNMENT STREET</td>\n",
              "      <td>SUITE 220</td>\n",
              "      <td>MOBILE</td>\n",
              "      <td>AL</td>\n",
              "      <td>36602.0</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00701409</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227054</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>H0AL01089</td>\n",
              "      <td>HIGHTOWER, BILL</td>\n",
              "      <td>REP</td>\n",
              "      <td>2020</td>\n",
              "      <td>AL</td>\n",
              "      <td>H</td>\n",
              "      <td>1.0</td>\n",
              "      <td>O</td>\n",
              "      <td>C</td>\n",
              "      <td>C00703066</td>\n",
              "      <td>PO BOX 91038</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MOBILE</td>\n",
              "      <td>AL</td>\n",
              "      <td>36691.0</td>\n",
              "      <td>2020</td>\n",
              "      <td>2020</td>\n",
              "      <td>C00703066</td>\n",
              "      <td>H</td>\n",
              "      <td>P</td>\n",
              "      <td>227266</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CAND_ID                     CAND_NAME  ... CMTE_DSGN  LINKAGE_ID\n",
              "0  H0AK00105                  LAMB, THOMAS  ...         P      229250\n",
              "1  H0AL01055           CARL, JERRY LEE, JR  ...         P      226125\n",
              "2  H0AL01063  LAMBERT, DOUGLAS WESTLEY III  ...         P      227053\n",
              "3  H0AL01071     PRINGLE, CHRISTOPHER PAUL  ...         P      227054\n",
              "4  H0AL01089               HIGHTOWER, BILL  ...         P      227266\n",
              "\n",
              "[5 rows x 21 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 120
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "5pJTEGPm0OXR"
      },
      "source": [
        "df_merge = pd.merge(df2, df_name, on='CMTE_ID')\n",
        "#df_merge.head()\n",
        "\n",
        "df_sort = df_merge[['CAND_ID', 'CAND_NAME', 'CAND_ST', 'CMTE_NM']]"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "jAIGmy572dYD",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 343
        },
        "outputId": "bbeade85-9d03-4b76-88d7-ba6ecf5ec6f0"
      },
      "source": [
        "df_sort[df_sort['CAND_ST']=='FL'].head(10)"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CMTE_NM</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>H0AL01113</td>\n",
              "      <td>ROMAGNANO, CHASE ANDERSON ANDY REVEREN</td>\n",
              "      <td>FL</td>\n",
              "      <td>ANDY FOR NORTHWEST FLORIDA</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>H0FL01120</td>\n",
              "      <td>ROMAGNANO, CHASE ANDERSON ANDY REV.</td>\n",
              "      <td>FL</td>\n",
              "      <td>ANDY FOR NORTHWEST FLORIDA</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>14</th>\n",
              "      <td>H0FL02110</td>\n",
              "      <td>SOUTHERLAND, WILLIAM STEVE II</td>\n",
              "      <td>FL</td>\n",
              "      <td>FRESHMAN AGRICULTURAL REPUBLICAN MEMBERS TRUST...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>381</th>\n",
              "      <td>H0FL01120</td>\n",
              "      <td>ROMAGNANO, CHASE ANDERSON ANDY REV.</td>\n",
              "      <td>FL</td>\n",
              "      <td>ANDY ROMAGNANO FOR NORTHWEST FLORIDA</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>382</th>\n",
              "      <td>H0FL01138</td>\n",
              "      <td>HOWARD, CHERYL</td>\n",
              "      <td>FL</td>\n",
              "      <td>CHERYL HOWARD FOR U S HOUSE OF REPRESENTATIVES</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>383</th>\n",
              "      <td>H0FL01146</td>\n",
              "      <td>MERK, GREGORY CHARLES</td>\n",
              "      <td>FL</td>\n",
              "      <td>GREG MERK CAMPAIGN</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>384</th>\n",
              "      <td>H0FL01153</td>\n",
              "      <td>ORAM, ALBERT</td>\n",
              "      <td>FL</td>\n",
              "      <td>ORAM FOR CONGRESS</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>386</th>\n",
              "      <td>H0FL02193</td>\n",
              "      <td>ANDERSON, WILLIE JEFFERSON JR</td>\n",
              "      <td>FL</td>\n",
              "      <td>ANDERSON FOR US CONGRESS</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>387</th>\n",
              "      <td>H0FL02219</td>\n",
              "      <td>THRIPP, KRISTY</td>\n",
              "      <td>FL</td>\n",
              "      <td>KRISTY THRIPP FOR CONGRESS</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>388</th>\n",
              "      <td>H0FL03118</td>\n",
              "      <td>BLACK, DEAN A</td>\n",
              "      <td>FL</td>\n",
              "      <td>DEAN BLACK FOR CONGRESS</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "       CAND_ID  ...                                            CMTE_NM\n",
              "7    H0AL01113  ...                         ANDY FOR NORTHWEST FLORIDA\n",
              "8    H0FL01120  ...                         ANDY FOR NORTHWEST FLORIDA\n",
              "14   H0FL02110  ...  FRESHMAN AGRICULTURAL REPUBLICAN MEMBERS TRUST...\n",
              "381  H0FL01120  ...               ANDY ROMAGNANO FOR NORTHWEST FLORIDA\n",
              "382  H0FL01138  ...     CHERYL HOWARD FOR U S HOUSE OF REPRESENTATIVES\n",
              "383  H0FL01146  ...                                 GREG MERK CAMPAIGN\n",
              "384  H0FL01153  ...                                  ORAM FOR CONGRESS\n",
              "386  H0FL02193  ...                           ANDERSON FOR US CONGRESS\n",
              "387  H0FL02219  ...                         KRISTY THRIPP FOR CONGRESS\n",
              "388  H0FL03118  ...                            DEAN BLACK FOR CONGRESS\n",
              "\n",
              "[10 rows x 4 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 122
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "geYa-uW_34Aq",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 195
        },
        "outputId": "8ec71abb-0aa5-42cf-a6f4-41716b07891e"
      },
      "source": [
        "df_sort_house = pd.merge(df_sort, df_house, on='CAND_ID')\n",
        "df_sort_house.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>CAND_ID</th>\n",
              "      <th>CAND_NAME</th>\n",
              "      <th>CAND_ST</th>\n",
              "      <th>CMTE_NM</th>\n",
              "      <th>CMTE_ID</th>\n",
              "      <th>NAME</th>\n",
              "      <th>STATE</th>\n",
              "      <th>TRANSACTION_AMT</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>H0AL02087</td>\n",
              "      <td>ROBY, MARTHA</td>\n",
              "      <td>NaN</td>\n",
              "      <td>MARTHA ROBY FOR CONGRESS</td>\n",
              "      <td>C00522458</td>\n",
              "      <td>PINPOINT WEBSOLUTIONS</td>\n",
              "      <td>FL</td>\n",
              "      <td>750</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>H0AL02087</td>\n",
              "      <td>ROBY, MARTHA</td>\n",
              "      <td>NaN</td>\n",
              "      <td>FRESHMAN AGRICULTURAL REPUBLICAN MEMBERS TRUST...</td>\n",
              "      <td>C00522458</td>\n",
              "      <td>PINPOINT WEBSOLUTIONS</td>\n",
              "      <td>FL</td>\n",
              "      <td>750</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>H0GA08099</td>\n",
              "      <td>SCOTT, JAMES AUSTIN</td>\n",
              "      <td>GA</td>\n",
              "      <td>FRESHMAN AGRICULTURAL REPUBLICAN MEMBERS TRUST...</td>\n",
              "      <td>C00522458</td>\n",
              "      <td>PINPOINT WEBSOLUTIONS</td>\n",
              "      <td>FL</td>\n",
              "      <td>750</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>H0GA08099</td>\n",
              "      <td>SCOTT, JAMES AUSTIN</td>\n",
              "      <td>GA</td>\n",
              "      <td>AUSTIN SCOTT FOR CONGRESS INC</td>\n",
              "      <td>C00522458</td>\n",
              "      <td>PINPOINT WEBSOLUTIONS</td>\n",
              "      <td>FL</td>\n",
              "      <td>750</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>H0GA08099</td>\n",
              "      <td>SCOTT, JAMES AUSTIN</td>\n",
              "      <td>GA</td>\n",
              "      <td>AUSTIN SCOTT VICTORY FUND</td>\n",
              "      <td>C00522458</td>\n",
              "      <td>PINPOINT WEBSOLUTIONS</td>\n",
              "      <td>FL</td>\n",
              "      <td>750</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "     CAND_ID            CAND_NAME  ... STATE TRANSACTION_AMT\n",
              "0  H0AL02087         ROBY, MARTHA  ...    FL             750\n",
              "1  H0AL02087         ROBY, MARTHA  ...    FL             750\n",
              "2  H0GA08099  SCOTT, JAMES AUSTIN  ...    FL             750\n",
              "3  H0GA08099  SCOTT, JAMES AUSTIN  ...    FL             750\n",
              "4  H0GA08099  SCOTT, JAMES AUSTIN  ...    FL             750\n",
              "\n",
              "[5 rows x 8 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 123
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hRa7Rywa-Rby",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "71f63e33-c35c-413b-c882-c3863e073ba0"
      },
      "source": [
        "df_sort_house['TRANSACTION_AMT'].sum()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "17474528"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 124
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "lJ51BCl5YHBM"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}