{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Analyzing Startup Fundraising Deals from Crunchbase" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To get familiar with the data, let's create a helper funtion that can read from a given csv file, a specified number of rows using the right encoding and print them as a table." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "csvfile = 'crunchbase-investments.csv'\n", "def pdread_csv(filename, numrows, enc='ISO-8859-1'):\n", " rows = pd.read_csv( filename, nrows=numrows, encoding=enc)\n", " return rows\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Read a few rows from the csv file" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_permalinkcompany_namecompany_category_codecompany_country_codecompany_state_codecompany_regioncompany_cityinvestor_permalinkinvestor_nameinvestor_category_codeinvestor_country_codeinvestor_state_codeinvestor_regioninvestor_cityfunding_round_typefunded_atfunded_monthfunded_quarterfunded_yearraised_amount_usd
0/company/advercarAdverCaradvertisingUSACASF BaySan Francisco/company/1-800-flowers-com1-800-FLOWERS.COMNaNUSANYNew YorkNew Yorkseries-a2012-10-302012-102012-Q420122000000
1/company/launchgramLaunchGramnewsUSACASF BayMountain View/company/10xelerator10XeleratorfinanceUSAOHColumbusColumbusother2012-01-232012-012012-Q1201220000
2/company/utapuTaPmessagingUSANaNUnited States - OtherNaN/company/10xelerator10XeleratorfinanceUSAOHColumbusColumbusother2012-01-012012-012012-Q1201220000
3/company/zoopshopZoopShopsoftwareUSAOHColumbuscolumbus/company/10xelerator10XeleratorfinanceUSAOHColumbusColumbusangel2012-02-152012-022012-Q1201220000
4/company/efuneraleFuneralwebUSAOHClevelandCleveland/company/10xelerator10XeleratorfinanceUSAOHColumbusColumbusother2011-09-082011-092011-Q3201120000
\n", "
" ], "text/plain": [ " company_permalink company_name company_category_code \\\n", "0 /company/advercar AdverCar advertising \n", "1 /company/launchgram LaunchGram news \n", "2 /company/utap uTaP messaging \n", "3 /company/zoopshop ZoopShop software \n", "4 /company/efuneral eFuneral web \n", "\n", " company_country_code company_state_code company_region \\\n", "0 USA CA SF Bay \n", "1 USA CA SF Bay \n", "2 USA NaN United States - Other \n", "3 USA OH Columbus \n", "4 USA OH Cleveland \n", "\n", " company_city investor_permalink investor_name \\\n", "0 San Francisco /company/1-800-flowers-com 1-800-FLOWERS.COM \n", "1 Mountain View /company/10xelerator 10Xelerator \n", "2 NaN /company/10xelerator 10Xelerator \n", "3 columbus /company/10xelerator 10Xelerator \n", "4 Cleveland /company/10xelerator 10Xelerator \n", "\n", " investor_category_code investor_country_code investor_state_code \\\n", "0 NaN USA NY \n", "1 finance USA OH \n", "2 finance USA OH \n", "3 finance USA OH \n", "4 finance USA OH \n", "\n", " investor_region investor_city funding_round_type funded_at funded_month \\\n", "0 New York New York series-a 2012-10-30 2012-10 \n", "1 Columbus Columbus other 2012-01-23 2012-01 \n", "2 Columbus Columbus other 2012-01-01 2012-01 \n", "3 Columbus Columbus angel 2012-02-15 2012-02 \n", "4 Columbus Columbus other 2011-09-08 2011-09 \n", "\n", " funded_quarter funded_year raised_amount_usd \n", "0 2012-Q4 2012 2000000 \n", "1 2012-Q1 2012 20000 \n", "2 2012-Q1 2012 20000 \n", "3 2012-Q1 2012 20000 \n", "4 2011-Q3 2011 20000 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdread_csv(csvfile, 5)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Total memory usage (in MB) for 5000 rows" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "5.579195022583008" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pdread_csv(csvfile, 5000).memory_usage(deep=True).sum()/1024**2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Total memory footprint of the dataset\n", "\n", "As memory consumption for 5000 rows is much less than 10 MB, let's read the data into dataframes using 5,000 row chunks to ensure that each chunk consumes much less than 10 MB of memory. We will then calculate total memory footprint of all of the chunks combined.\n", "\n", "Let's define a function for this.\n", "The function will consume a chunk iterator and return the total memory footprint of the full dataset, in MB" ] }, { "cell_type": "code", "execution_count": 48, "metadata": { "collapsed": false }, "outputs": [], "source": [ "def all_chunks_mem(chunk_iter):\n", " total_memory_footprint = 0\n", " for chunk in chunk_iter:\n", " total_memory_footprint += chunk.memory_usage(deep=True).sum()/1024**2\n", " return total_memory_footprint" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "- Create an iterator using pandas.read_csv() that will process chunks of 5000 rows from the csv file at a time.\n", "- Call **all_chunks_mem** function with the iterator to give us the total memory footprint of the dataset." 
] }, { "cell_type": "code", "execution_count": 79, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "56.988484382629395" ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "all_chunks_mem(crunch_chunks)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This dataset is believed to contain over 50,000 rows. Let's find out how many rows and columns there are:\n", "\n", "- For total number of rows in the dataset, add up number of rows across all of the chunks.\n", "- For total number of columns, count number of columns in any one chunk.\n" ] }, { "cell_type": "code", "execution_count": 80, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "52870\n", "20\n" ] } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "total_rows = 0\n", "for chunk in crunch_chunks:\n", " total_rows += chunk.shape[0]\n", "print (total_rows)\n", "print (chunk.shape[1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Memory usage of each column in bytes\n", "\n", "While calculating this, we will exclude the memory usage of the DataFrame’s index.\n", "\n", "- We will chunkify the dataset as before, using 5000 rows at a time\n", "\n", "- We will form a list of column names to iterate over in each chunk\n", "\n", "- We will use a dictionary to count the memory footprint of each column. The dictionary keys will be column names and values will be the memory usage per column\n", "\n", "Sum of them all in MB should match the dataset's total memory footprint( or that of all chunks combined ) calculated earlier." ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "investor_category_code 622424\n", "funded_month 3383584\n", "company_state_code 3106051\n", "company_category_code 3421104\n", "funded_year 422960\n", "investor_permalink 4980548\n", "company_name 3591326\n", "company_city 3505926\n", "funding_round_type 3410707\n", "investor_state_code 2476607\n", "funded_at 3542185\n", "investor_region 3396281\n", "company_permalink 4057788\n", "company_region 3411585\n", "company_country_code 3172176\n", "investor_city 2885083\n", "funded_quarter 3383584\n", "investor_name 3915666\n", "investor_country_code 2647292\n", "raised_amount_usd 422960\n", "\n", " Total memory Usage of All Columns(in MB): 56.988\n" ] } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "col_mem = {}\n", "for chunk in crunch_chunks:\n", " cols = list(chunk.columns)\n", " for col in cols:\n", " if col in col_mem:\n", " col_mem[col] += chunk[col].memory_usage(index=False,deep=True)\n", " else:\n", " col_mem[col] = chunk[col].memory_usage(index=False,deep=True)\n", "\n", "total_mem = 0\n", "for key in col_mem:\n", " print (key,col_mem[key])\n", " total_mem += col_mem[key]\n", " \n", "print ('\\n','Total memory Usage of All Columns(in MB):', round(total_mem/1024**2,3))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Each column's missing value counts\n", "\n", "- We will calculate the sum of pd.isnull() on each chunk to tell us the number of missing values for each column\n", "- We will need to combine them together. Before doing that, we will append each iteration's result to a list. 
\n", "- Once list is ready for all chunks, we will use pd.concat(list) to combine them into a single series object\n", "- Then we group all values associated with each *index* by using groupby() on the above result\n", "- Once grouped by their corresponding indexes, we will use the Series.sum() method to return the final, unique value counts(missing values)\n", "- We finally display the result by sorted_values (in ascending order by default)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "company_country_code 1\n", "company_name 1\n", "company_permalink 1\n", "company_region 1\n", "investor_region 2\n", "investor_permalink 2\n", "investor_name 2\n", "funded_quarter 3\n", "funded_at 3\n", "funded_month 3\n", "funded_year 3\n", "funding_round_type 3\n", "company_state_code 492\n", "company_city 533\n", "company_category_code 643\n", "raised_amount_usd 3599\n", "investor_country_code 12001\n", "investor_city 12480\n", "investor_state_code 16809\n", "investor_category_code 50427\n", "dtype: int64" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "missing = []\n", "for chunk in crunch_chunks:\n", " missing.append(pd.isnull(chunk).sum())\n", "combined = pd.concat(missing)\n", "final = (combined.groupby(combined.index)).sum()\n", "final.sort_values()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Which column(s) can we drop because they aren't useful for analysis?\n", "\n", "- Just by looking at the per column missing values, the column *investor_category_code* has way too many missing i.e 50427 out of 52870 row values, to be useful for analysis.\n", "\n", "- Links to websites are also not useful for analysis - *company_permalink* and *investor_permalink* . We will use df.drop to drop this list of columns, resulting in 17 columns in the dataframe." ] }, { "cell_type": "code", "execution_count": 127, "metadata": { "collapsed": false, "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "17\n", "['company_name', 'company_category_code', 'company_country_code', 'company_state_code', 'company_region', 'company_city', 'investor_name', 'investor_country_code', 'investor_state_code', 'investor_region', 'investor_city', 'funding_round_type', 'funded_at', 'funded_month', 'funded_quarter', 'funded_year', 'raised_amount_usd']\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
company_namecompany_category_codecompany_country_codecompany_state_codecompany_regioncompany_cityinvestor_nameinvestor_country_codeinvestor_state_codeinvestor_regioninvestor_cityfunding_round_typefunded_atfunded_monthfunded_quarterfunded_yearraised_amount_usd
50000NuORDERfashionUSACALos AngelesWest HollywoodMortimer SingerNaNNaNunknownNaNseries-a2012-10-012012-102012-Q420123060000.0
50001ChaChaadvertisingUSAINIndianapolisCarmelMorton MeyersonNaNNaNunknownNaNseries-b2007-10-012007-102007-Q4200712000000.0
50002BinfiresoftwareUSAFLBocat RatonBocat RatonMoshe ArielNaNNaNunknownNaNangel2008-04-182008-042008-Q22008500000.0
50003BinfiresoftwareUSAFLBocat RatonBocat RatonMoshe ArielNaNNaNunknownNaNangel2010-01-012010-012010-Q12010750000.0
50004Unified ColorsoftwareUSACASF BaySouth San FrnaciscoMr. Andrew OungNaNNaNunknownNaNangel2010-01-012010-012010-Q12010NaN
50005HItviewsadvertisingUSANYNew YorkNew York Citymultiple partiesNaNNaNunknownNaNangel2007-11-292007-112007-Q42007485000.0
50006LockerDomesocialUSAMOSaint LouisSt. Louismultiple partiesNaNNaNunknownNaNangel2012-04-172012-042012-Q22012300000.0
50007ThirdLoveecommerceUSACASF BaySan FranciscoMunjal ShahNaNNaNunknownNaNseries-a2012-12-012012-122012-Q420125600000.0
50008HakiasearchUSANaNTBDNaNMurat VargiNaNNaNunknownNaNseries-a2006-11-012006-112006-Q4200616000000.0
50009bookacoachsportsUSAINIndianapolisIndianapolisMyles GroteNaNNaNunknownNaNangel2012-11-012012-112012-Q42012NaN
50010LocalCirclessocialUSACASF BaySanta ClaraNadir GodrejNaNNaNunknownNaNangel2012-09-012012-092012-Q32012NaN
50011GraphdiveanalyticsUSACASF BayMenlo ParkNaguib SawirisNaNNaNunknownNaNangel2012-10-042012-102012-Q420121000000.0
50012RibbonecommerceUSACASF BaySan FranciscoNaguib SawirisNaNNaNunknownNaNseries-a2013-02-052013-022013-Q120131630000.0
50013Dokkankom.comecommerceUSANYNew Yorknew yorkNamek Zu'biNaNNaNunknownNaNangel2011-10-102011-102011-Q4201130000.0
50014LookerywebUSACASF BaySan FranciscoNana ShinNaNNaNunknownNaNangel2008-02-072008-022008-Q12008900000.0
50015TrustDegreeswebUSANYKenmoreKenmoreNancy BarrettNaNNaNunknownNaNangel2011-06-092011-062011-Q220118000.0
50016Altavozgames_videoUSADCWashington DCWashingtonNancy JacobsenNaNNaNunknownNaNangel2012-09-112012-092012-Q32012150000.0
50017EdSurgeeducationUSACASF BayBurlingameNancy PeretsmanNaNNaNunknownNaNangel2012-08-292012-082012-Q32012400000.0
50018FullContactenterpriseUSACODenverDenverNancy PierceNaNNaNunknownNaNseries-b2012-07-092012-072012-Q320127000000.0
50019Rapt MediaenterpriseUSACODenverBoulderNancy PierceNaNNaNunknownNaNseries-a2013-01-232013-012013-Q120132288803.0
50020HumanoidsoftwareUSACASF BaySan FranciscoNat FriedmanNaNNaNunknownNaNangel2010-12-012010-122010-Q420101100000.0
50021RunscopewebUSACASF BaySan FranciscoNat FriedmanNaNNaNunknownNaNangel2013-05-222013-052013-Q220131100000.0
50022AdzerkadvertisingUSANCRaleigh-DurhamDurhamNat TurnerNaNNaNunknownNaNangel2011-07-122011-072011-Q32011650000.0
50023AdaptlyadvertisingUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNseries-a2011-04-182011-042011-Q220112000000.0
50024LoreeducationUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNangel2011-06-272011-062011-Q220111000000.0
50025Tasted MenuhospitalityUSAMABostonBostonNat TurnerNaNNaNunknownNaNangel2011-05-012011-052011-Q22011NaN
50026Lua TechnologiesmobileUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNseries-a2012-08-012012-082012-Q320122500000.0
50027Blue ApronhospitalityUSANYNew YorkBrooklynNat TurnerNaNNaNunknownNaNseries-a2013-02-192013-022013-Q120133000000.0
50028ChatIDmobileUSANYNew YorkNew YorkNat TurnerNaNNaNunknownNaNangel2012-01-012012-012012-Q12012NaN
50029Breakthrough BehavioralhealthUSACASF BayRedwood CityNat TurnerNaNNaNunknownNaNangel2012-08-132012-082012-Q32012900000.0
......................................................
52840MeddikhealthUSANYNew YorkNew YorkZach WeinbergNaNNaNunknownNaNangel2012-05-242012-052012-Q22012750000.0
52841Blue ApronhospitalityUSANYNew YorkBrooklynZach WeinbergNaNNaNunknownNaNseries-a2013-02-192013-022013-Q120133000000.0
52842ChatIDmobileUSANYNew YorkNew YorkZach WeinbergNaNNaNunknownNaNangel2012-01-012012-012012-Q12012NaN
52843Breakthrough BehavioralhealthUSACASF BayRedwood CityZach WeinbergNaNNaNunknownNaNangel2012-08-132012-082012-Q32012900000.0
52844PlaidsoftwareUSACASF BaySan FranciscoZach WeinbergNaNNaNunknownNaNseries-a2013-09-192013-092013-Q320132800000.0
52845PokitDokmobileUSACASF BayMenlo ParkZach ZeitlinNaNNaNunknownNaNangel2012-07-122012-072012-Q320121300000.0
52846FitocracywebUSANYNew YorkNew YorkZachary AaronsNaNNaNunknownNaNangel2011-09-012011-092011-Q32011250000.0
52847SquaremobileUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNseries-a2009-11-012009-112009-Q4200910000000.0
52848MixRankadvertisingUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNseries-a2011-11-182011-112011-Q420111500000.0
52849SocialcammobileUSACASanta Clara CountySanta Clara CountyZachary BogueNaNNaNunknownNaNangel2012-04-302012-042012-Q22012NaN
52850NuzzelnewsUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNventure2012-11-152012-112012-Q420121700000.0
52851ThirdLoveecommerceUSACASF BaySan FranciscoZachary BogueNaNNaNunknownNaNseries-a2012-12-012012-122012-Q420125600000.0
52852MXD3DwebUSACASF BaySan FranciscoZaid AyoubNaNNaNunknownNaNangel2012-01-012012-012012-Q12012300000.0
52853MXD3DwebUSACASF BaySan FranciscoZaid AyoubNaNNaNunknownNaNangel2011-01-012011-012011-Q12011300000.0
52854VeriousmobileUSACASF BaySan CarlosZain KhanNaNNaNunknownNaNangel2011-05-302011-052011-Q22011800000.0
52855IdentifiedanalyticsUSACASF BaySan FranciscoZao YangNaNNaNunknownNaNseries-b2012-06-052012-062012-Q2201221000000.0
52856HaulerDealsfashionUSACALos AngelesLos AngelesZaw ThetNaNNaNunknownNaNangel2012-10-312012-102012-Q420121250000.0
52857When You WishnonprofitUSACALos AngelesMarina Del ReyZelda MarzecNaNNaNunknownNaNseries-a2011-02-012011-022011-Q120111500000.0
52858FarmeronanalyticsUSACASF BayMountain ViewZeljko MataijaNaNNaNunknownNaNangel2010-10-012010-102010-Q4201015000.0
52859Theraclone SciencesbiotechUSAWASeattleSeattleZenyaku KogyoNaNNaNunknownNaNseries-b2013-03-252013-032013-Q120138000000.0
52860SimpleGeoadvertisingUSACASF BaySan FranciscoZiv NavothNaNNaNunknownNaNother2009-11-102009-112009-Q42009195000.0
52861Open MeecommerceUSACALos AngelesLos AngelesZiver BirgNaNNaNunknownNaNangel2013-08-012013-082013-Q32013NaN
52862Comprehend SystemsenterpriseUSACASF BayPalo AltoZod NazemNaNNaNunknownNaNseries-a2013-07-112013-072013-Q320138400000.0
52863PayoneerotherUSANYNew YorkNew YorkZohar GilonNaNNaNunknownNaNseries-a2005-01-012005-012005-Q120052000000.0
52864OutbrainwebUSANYNew YorkNew York CityZohar GilonNaNNaNunknownNaNseries-b2009-02-112009-022009-Q1200912000000.0
52865Garantia DataenterpriseUSACASF BaySanta ClaraZohar GilonNaNNaNunknownNaNseries-a2012-08-082012-082012-Q320123800000.0
52866DudaMobilemobileUSACASF BayPalo AltoZohar GilonNaNNaNunknownNaNseries-c+2013-04-082013-042013-Q2201310300000.0
52867SiteBrainssoftwareUSACASF BaySan Franciscozohar israelNaNNaNunknownNaNangel2010-08-012010-082010-Q32010350000.0
52868Comprehend SystemsenterpriseUSACASF BayPalo AltoZorba LiebermanNaNNaNunknownNaNseries-a2013-07-112013-072013-Q320138400000.0
52869SmartThingsmobileUSADCunknownMinneapolisZorik GordonNaNNaNunknownNaNseries-a2012-12-042012-122012-Q420123000000.0
\n", "

2870 rows × 17 columns

\n", "
" ], "text/plain": [ " company_name company_category_code company_country_code \\\n", "50000 NuORDER fashion USA \n", "50001 ChaCha advertising USA \n", "50002 Binfire software USA \n", "50003 Binfire software USA \n", "50004 Unified Color software USA \n", "50005 HItviews advertising USA \n", "50006 LockerDome social USA \n", "50007 ThirdLove ecommerce USA \n", "50008 Hakia search USA \n", "50009 bookacoach sports USA \n", "50010 LocalCircles social USA \n", "50011 Graphdive analytics USA \n", "50012 Ribbon ecommerce USA \n", "50013 Dokkankom.com ecommerce USA \n", "50014 Lookery web USA \n", "50015 TrustDegrees web USA \n", "50016 Altavoz games_video USA \n", "50017 EdSurge education USA \n", "50018 FullContact enterprise USA \n", "50019 Rapt Media enterprise USA \n", "50020 Humanoid software USA \n", "50021 Runscope web USA \n", "50022 Adzerk advertising USA \n", "50023 Adaptly advertising USA \n", "50024 Lore education USA \n", "50025 Tasted Menu hospitality USA \n", "50026 Lua Technologies mobile USA \n", "50027 Blue Apron hospitality USA \n", "50028 ChatID mobile USA \n", "50029 Breakthrough Behavioral health USA \n", "... ... ... ... \n", "52840 Meddik health USA \n", "52841 Blue Apron hospitality USA \n", "52842 ChatID mobile USA \n", "52843 Breakthrough Behavioral health USA \n", "52844 Plaid software USA \n", "52845 PokitDok mobile USA \n", "52846 Fitocracy web USA \n", "52847 Square mobile USA \n", "52848 MixRank advertising USA \n", "52849 Socialcam mobile USA \n", "52850 Nuzzel news USA \n", "52851 ThirdLove ecommerce USA \n", "52852 MXD3D web USA \n", "52853 MXD3D web USA \n", "52854 Verious mobile USA \n", "52855 Identified analytics USA \n", "52856 HaulerDeals fashion USA \n", "52857 When You Wish nonprofit USA \n", "52858 Farmeron analytics USA \n", "52859 Theraclone Sciences biotech USA \n", "52860 SimpleGeo advertising USA \n", "52861 Open Me ecommerce USA \n", "52862 Comprehend Systems enterprise USA \n", "52863 Payoneer other USA \n", "52864 Outbrain web USA \n", "52865 Garantia Data enterprise USA \n", "52866 DudaMobile mobile USA \n", "52867 SiteBrains software USA \n", "52868 Comprehend Systems enterprise USA \n", "52869 SmartThings mobile USA \n", "\n", " company_state_code company_region company_city \\\n", "50000 CA Los Angeles West Hollywood \n", "50001 IN Indianapolis Carmel \n", "50002 FL Bocat Raton Bocat Raton \n", "50003 FL Bocat Raton Bocat Raton \n", "50004 CA SF Bay South San Frnacisco \n", "50005 NY New York New York City \n", "50006 MO Saint Louis St. Louis \n", "50007 CA SF Bay San Francisco \n", "50008 NaN TBD NaN \n", "50009 IN Indianapolis Indianapolis \n", "50010 CA SF Bay Santa Clara \n", "50011 CA SF Bay Menlo Park \n", "50012 CA SF Bay San Francisco \n", "50013 NY New York new york \n", "50014 CA SF Bay San Francisco \n", "50015 NY Kenmore Kenmore \n", "50016 DC Washington DC Washington \n", "50017 CA SF Bay Burlingame \n", "50018 CO Denver Denver \n", "50019 CO Denver Boulder \n", "50020 CA SF Bay San Francisco \n", "50021 CA SF Bay San Francisco \n", "50022 NC Raleigh-Durham Durham \n", "50023 NY New York New York \n", "50024 NY New York New York \n", "50025 MA Boston Boston \n", "50026 NY New York New York \n", "50027 NY New York Brooklyn \n", "50028 NY New York New York \n", "50029 CA SF Bay Redwood City \n", "... ... ... ... 
\n", "52840 NY New York New York \n", "52841 NY New York Brooklyn \n", "52842 NY New York New York \n", "52843 CA SF Bay Redwood City \n", "52844 CA SF Bay San Francisco \n", "52845 CA SF Bay Menlo Park \n", "52846 NY New York New York \n", "52847 CA SF Bay San Francisco \n", "52848 CA SF Bay San Francisco \n", "52849 CA Santa Clara County Santa Clara County \n", "52850 CA SF Bay San Francisco \n", "52851 CA SF Bay San Francisco \n", "52852 CA SF Bay San Francisco \n", "52853 CA SF Bay San Francisco \n", "52854 CA SF Bay San Carlos \n", "52855 CA SF Bay San Francisco \n", "52856 CA Los Angeles Los Angeles \n", "52857 CA Los Angeles Marina Del Rey \n", "52858 CA SF Bay Mountain View \n", "52859 WA Seattle Seattle \n", "52860 CA SF Bay San Francisco \n", "52861 CA Los Angeles Los Angeles \n", "52862 CA SF Bay Palo Alto \n", "52863 NY New York New York \n", "52864 NY New York New York City \n", "52865 CA SF Bay Santa Clara \n", "52866 CA SF Bay Palo Alto \n", "52867 CA SF Bay San Francisco \n", "52868 CA SF Bay Palo Alto \n", "52869 DC unknown Minneapolis \n", "\n", " investor_name investor_country_code investor_state_code \\\n", "50000 Mortimer Singer NaN NaN \n", "50001 Morton Meyerson NaN NaN \n", "50002 Moshe Ariel NaN NaN \n", "50003 Moshe Ariel NaN NaN \n", "50004 Mr. Andrew Oung NaN NaN \n", "50005 multiple parties NaN NaN \n", "50006 multiple parties NaN NaN \n", "50007 Munjal Shah NaN NaN \n", "50008 Murat Vargi NaN NaN \n", "50009 Myles Grote NaN NaN \n", "50010 Nadir Godrej NaN NaN \n", "50011 Naguib Sawiris NaN NaN \n", "50012 Naguib Sawiris NaN NaN \n", "50013 Namek Zu'bi NaN NaN \n", "50014 Nana Shin NaN NaN \n", "50015 Nancy Barrett NaN NaN \n", "50016 Nancy Jacobsen NaN NaN \n", "50017 Nancy Peretsman NaN NaN \n", "50018 Nancy Pierce NaN NaN \n", "50019 Nancy Pierce NaN NaN \n", "50020 Nat Friedman NaN NaN \n", "50021 Nat Friedman NaN NaN \n", "50022 Nat Turner NaN NaN \n", "50023 Nat Turner NaN NaN \n", "50024 Nat Turner NaN NaN \n", "50025 Nat Turner NaN NaN \n", "50026 Nat Turner NaN NaN \n", "50027 Nat Turner NaN NaN \n", "50028 Nat Turner NaN NaN \n", "50029 Nat Turner NaN NaN \n", "... ... ... ... 
\n", "52840 Zach Weinberg NaN NaN \n", "52841 Zach Weinberg NaN NaN \n", "52842 Zach Weinberg NaN NaN \n", "52843 Zach Weinberg NaN NaN \n", "52844 Zach Weinberg NaN NaN \n", "52845 Zach Zeitlin NaN NaN \n", "52846 Zachary Aarons NaN NaN \n", "52847 Zachary Bogue NaN NaN \n", "52848 Zachary Bogue NaN NaN \n", "52849 Zachary Bogue NaN NaN \n", "52850 Zachary Bogue NaN NaN \n", "52851 Zachary Bogue NaN NaN \n", "52852 Zaid Ayoub NaN NaN \n", "52853 Zaid Ayoub NaN NaN \n", "52854 Zain Khan NaN NaN \n", "52855 Zao Yang NaN NaN \n", "52856 Zaw Thet NaN NaN \n", "52857 Zelda Marzec NaN NaN \n", "52858 Zeljko Mataija NaN NaN \n", "52859 Zenyaku Kogyo NaN NaN \n", "52860 Ziv Navoth NaN NaN \n", "52861 Ziver Birg NaN NaN \n", "52862 Zod Nazem NaN NaN \n", "52863 Zohar Gilon NaN NaN \n", "52864 Zohar Gilon NaN NaN \n", "52865 Zohar Gilon NaN NaN \n", "52866 Zohar Gilon NaN NaN \n", "52867 zohar israel NaN NaN \n", "52868 Zorba Lieberman NaN NaN \n", "52869 Zorik Gordon NaN NaN \n", "\n", " investor_region investor_city funding_round_type funded_at \\\n", "50000 unknown NaN series-a 2012-10-01 \n", "50001 unknown NaN series-b 2007-10-01 \n", "50002 unknown NaN angel 2008-04-18 \n", "50003 unknown NaN angel 2010-01-01 \n", "50004 unknown NaN angel 2010-01-01 \n", "50005 unknown NaN angel 2007-11-29 \n", "50006 unknown NaN angel 2012-04-17 \n", "50007 unknown NaN series-a 2012-12-01 \n", "50008 unknown NaN series-a 2006-11-01 \n", "50009 unknown NaN angel 2012-11-01 \n", "50010 unknown NaN angel 2012-09-01 \n", "50011 unknown NaN angel 2012-10-04 \n", "50012 unknown NaN series-a 2013-02-05 \n", "50013 unknown NaN angel 2011-10-10 \n", "50014 unknown NaN angel 2008-02-07 \n", "50015 unknown NaN angel 2011-06-09 \n", "50016 unknown NaN angel 2012-09-11 \n", "50017 unknown NaN angel 2012-08-29 \n", "50018 unknown NaN series-b 2012-07-09 \n", "50019 unknown NaN series-a 2013-01-23 \n", "50020 unknown NaN angel 2010-12-01 \n", "50021 unknown NaN angel 2013-05-22 \n", "50022 unknown NaN angel 2011-07-12 \n", "50023 unknown NaN series-a 2011-04-18 \n", "50024 unknown NaN angel 2011-06-27 \n", "50025 unknown NaN angel 2011-05-01 \n", "50026 unknown NaN series-a 2012-08-01 \n", "50027 unknown NaN series-a 2013-02-19 \n", "50028 unknown NaN angel 2012-01-01 \n", "50029 unknown NaN angel 2012-08-13 \n", "... ... ... ... ... 
\n", "52840 unknown NaN angel 2012-05-24 \n", "52841 unknown NaN series-a 2013-02-19 \n", "52842 unknown NaN angel 2012-01-01 \n", "52843 unknown NaN angel 2012-08-13 \n", "52844 unknown NaN series-a 2013-09-19 \n", "52845 unknown NaN angel 2012-07-12 \n", "52846 unknown NaN angel 2011-09-01 \n", "52847 unknown NaN series-a 2009-11-01 \n", "52848 unknown NaN series-a 2011-11-18 \n", "52849 unknown NaN angel 2012-04-30 \n", "52850 unknown NaN venture 2012-11-15 \n", "52851 unknown NaN series-a 2012-12-01 \n", "52852 unknown NaN angel 2012-01-01 \n", "52853 unknown NaN angel 2011-01-01 \n", "52854 unknown NaN angel 2011-05-30 \n", "52855 unknown NaN series-b 2012-06-05 \n", "52856 unknown NaN angel 2012-10-31 \n", "52857 unknown NaN series-a 2011-02-01 \n", "52858 unknown NaN angel 2010-10-01 \n", "52859 unknown NaN series-b 2013-03-25 \n", "52860 unknown NaN other 2009-11-10 \n", "52861 unknown NaN angel 2013-08-01 \n", "52862 unknown NaN series-a 2013-07-11 \n", "52863 unknown NaN series-a 2005-01-01 \n", "52864 unknown NaN series-b 2009-02-11 \n", "52865 unknown NaN series-a 2012-08-08 \n", "52866 unknown NaN series-c+ 2013-04-08 \n", "52867 unknown NaN angel 2010-08-01 \n", "52868 unknown NaN series-a 2013-07-11 \n", "52869 unknown NaN series-a 2012-12-04 \n", "\n", " funded_month funded_quarter funded_year raised_amount_usd \n", "50000 2012-10 2012-Q4 2012 3060000.0 \n", "50001 2007-10 2007-Q4 2007 12000000.0 \n", "50002 2008-04 2008-Q2 2008 500000.0 \n", "50003 2010-01 2010-Q1 2010 750000.0 \n", "50004 2010-01 2010-Q1 2010 NaN \n", "50005 2007-11 2007-Q4 2007 485000.0 \n", "50006 2012-04 2012-Q2 2012 300000.0 \n", "50007 2012-12 2012-Q4 2012 5600000.0 \n", "50008 2006-11 2006-Q4 2006 16000000.0 \n", "50009 2012-11 2012-Q4 2012 NaN \n", "50010 2012-09 2012-Q3 2012 NaN \n", "50011 2012-10 2012-Q4 2012 1000000.0 \n", "50012 2013-02 2013-Q1 2013 1630000.0 \n", "50013 2011-10 2011-Q4 2011 30000.0 \n", "50014 2008-02 2008-Q1 2008 900000.0 \n", "50015 2011-06 2011-Q2 2011 8000.0 \n", "50016 2012-09 2012-Q3 2012 150000.0 \n", "50017 2012-08 2012-Q3 2012 400000.0 \n", "50018 2012-07 2012-Q3 2012 7000000.0 \n", "50019 2013-01 2013-Q1 2013 2288803.0 \n", "50020 2010-12 2010-Q4 2010 1100000.0 \n", "50021 2013-05 2013-Q2 2013 1100000.0 \n", "50022 2011-07 2011-Q3 2011 650000.0 \n", "50023 2011-04 2011-Q2 2011 2000000.0 \n", "50024 2011-06 2011-Q2 2011 1000000.0 \n", "50025 2011-05 2011-Q2 2011 NaN \n", "50026 2012-08 2012-Q3 2012 2500000.0 \n", "50027 2013-02 2013-Q1 2013 3000000.0 \n", "50028 2012-01 2012-Q1 2012 NaN \n", "50029 2012-08 2012-Q3 2012 900000.0 \n", "... ... ... ... ... 
\n", "52840 2012-05 2012-Q2 2012 750000.0 \n", "52841 2013-02 2013-Q1 2013 3000000.0 \n", "52842 2012-01 2012-Q1 2012 NaN \n", "52843 2012-08 2012-Q3 2012 900000.0 \n", "52844 2013-09 2013-Q3 2013 2800000.0 \n", "52845 2012-07 2012-Q3 2012 1300000.0 \n", "52846 2011-09 2011-Q3 2011 250000.0 \n", "52847 2009-11 2009-Q4 2009 10000000.0 \n", "52848 2011-11 2011-Q4 2011 1500000.0 \n", "52849 2012-04 2012-Q2 2012 NaN \n", "52850 2012-11 2012-Q4 2012 1700000.0 \n", "52851 2012-12 2012-Q4 2012 5600000.0 \n", "52852 2012-01 2012-Q1 2012 300000.0 \n", "52853 2011-01 2011-Q1 2011 300000.0 \n", "52854 2011-05 2011-Q2 2011 800000.0 \n", "52855 2012-06 2012-Q2 2012 21000000.0 \n", "52856 2012-10 2012-Q4 2012 1250000.0 \n", "52857 2011-02 2011-Q1 2011 1500000.0 \n", "52858 2010-10 2010-Q4 2010 15000.0 \n", "52859 2013-03 2013-Q1 2013 8000000.0 \n", "52860 2009-11 2009-Q4 2009 195000.0 \n", "52861 2013-08 2013-Q3 2013 NaN \n", "52862 2013-07 2013-Q3 2013 8400000.0 \n", "52863 2005-01 2005-Q1 2005 2000000.0 \n", "52864 2009-02 2009-Q1 2009 12000000.0 \n", "52865 2012-08 2012-Q3 2012 3800000.0 \n", "52866 2013-04 2013-Q2 2013 10300000.0 \n", "52867 2010-08 2010-Q3 2010 350000.0 \n", "52868 2013-07 2013-Q3 2013 8400000.0 \n", "52869 2012-12 2012-Q4 2012 3000000.0 \n", "\n", "[2870 rows x 17 columns]" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "col_mem = {}\n", "for chunk in crunch_chunks:\n", " dropped_chunk=chunk.drop(['investor_category_code', 'company_permalink', 'investor_permalink'], axis=1)\n", " cols = list(dropped_chunk.columns)\n", "\n", "print (dropped_chunk.shape[1])\n", "print (cols)\n", "dropped_chunk" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Identify the types for each column\n", "\n", "- We could just simply get column dtypes from df.dtypes as follows. Let's see for information sake\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "company_name object\n", "company_category_code object\n", "company_country_code object\n", "company_state_code object\n", "company_region object\n", "company_city object\n", "investor_name object\n", "investor_country_code float64\n", "investor_state_code float64\n", "investor_region object\n", "investor_city float64\n", "funding_round_type object\n", "funded_at object\n", "funded_month object\n", "funded_quarter object\n", "funded_year int64\n", "raised_amount_usd float64\n", "dtype: object" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dropped_chunk.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's apply a different approach to this:\n", "\n", "- For each chunk, and for each column within a chunk, form a dictionary of columns as keys and types of each column as list of values. This allows us to find all data types in case some columns may have mixed types. \n", "\n", "- Remove duplicate/recurring data types from the dictionary values list for each key \n", "\n", "- From the non-duplicated values of the dictionary, we can see if there's a difference this approach shows compared to the last result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's write a function for removing duplicate values from a dictionary, where the key values are in a List. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Identify the types for each column\n", "\n", "- We could simply read the column dtypes off df.dtypes, as below. Note that this inspects only the last chunk.\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 128, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "company_name object\n", "company_category_code object\n", "company_country_code object\n", "company_state_code object\n", "company_region object\n", "company_city object\n", "investor_name object\n", "investor_country_code float64\n", "investor_state_code float64\n", "investor_region object\n", "investor_city float64\n", "funding_round_type object\n", "funded_at object\n", "funded_month object\n", "funded_quarter object\n", "funded_year int64\n", "raised_amount_usd float64\n", "dtype: object" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dropped_chunk.dtypes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now, let's apply a different approach:\n", "\n", "- For each chunk, and for each column within a chunk, build a dictionary whose keys are column names and whose values are lists of the observed data types. This lets us find all data types in case some columns have mixed types across chunks. \n", "\n", "- Remove duplicate/recurring data types from the dictionary's value list for each key \n", "\n", "- From the deduplicated values we can see how this all-chunks view differs from the last-chunk result above" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's write a function that removes duplicate values from a dictionary whose values are lists. It will return the dictionary with each value list reduced to unique entries only." ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def dedup_dictionary(dict_name):\n", "    for key in dict_name:\n", "        # dict.fromkeys drops repeated entries\n", "        dict_name[key] = list(dict.fromkeys(dict_name[key]))\n", "    return dict_name" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "{'company_category_code': ['object'],\n", " 'company_city': ['object'],\n", " 'company_country_code': ['object'],\n", " 'company_name': ['object'],\n", " 'company_region': ['object'],\n", " 'company_state_code': ['object'],\n", " 'funded_at': ['object'],\n", " 'funded_month': ['object'],\n", " 'funded_quarter': ['object'],\n", " 'funded_year': ['int64', 'float64'],\n", " 'funding_round_type': ['object'],\n", " 'investor_city': ['object', 'float64'],\n", " 'investor_country_code': ['object', 'float64'],\n", " 'investor_name': ['object'],\n", " 'investor_region': ['object'],\n", " 'investor_state_code': ['object', 'float64'],\n", " 'raised_amount_usd': ['float64']}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "col_types = {}\n", "for chunk in crunch_chunks:\n", "    dropped_chunk = chunk.drop(['investor_category_code', 'company_permalink', 'investor_permalink'], axis=1)\n", "    cols = list(dropped_chunk.columns)\n", "    for col in cols:\n", "        if col in col_types:\n", "            col_types[col].append(str(dropped_chunk.dtypes[col]))\n", "        else:\n", "            col_types[col] = [str(dropped_chunk.dtypes[col])]\n", "\n", "dedup_dictionary(col_types)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Identify the numeric columns we can represent using more space-efficient types:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "['investor_country_code',\n", " 'investor_state_code',\n", " 'investor_city',\n", " 'funded_year',\n", " 'raised_amount_usd']" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np\n", "list(dropped_chunk.select_dtypes(include=[np.number]).columns)" ] },
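{ "cell_type": "markdown", "metadata": {}, "source": [ "Of these, only *funded_year* and *raised_amount_usd* are genuinely numeric; the three investor location columns show up here because they are entirely missing in the last chunk, and an all-NaN column is parsed as float64. As a minimal sketch of what downcasting could look like (assuming a pandas version that supports the *downcast* parameter of pandas.to_numeric; this cell was not part of the original run):" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# sketch: downcast the truly numeric columns on one 5,000-row chunk\n", "chunk = pdread_csv(csvfile, 5000)\n", "# years around 2000 fit in int16; columns containing NaN stay float\n", "chunk['funded_year'] = pd.to_numeric(chunk['funded_year'], downcast='integer')\n", "chunk['raised_amount_usd'] = pd.to_numeric(chunk['raised_amount_usd'], downcast='float')\n", "print(chunk[['funded_year', 'raised_amount_usd']].dtypes)\n" ] },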
{ "cell_type": "markdown", "metadata": {}, "source": [ "For the text columns:\n", "\n", "Analyze the unique value counts across all of the chunks to see which columns are good candidates for a more space-efficient representation (such as the category type)\n", "\n", "- Get the list of text columns\n", "- For each chunk, get the data frame containing only text columns, using chunk.select_dtypes(include=['object'])\n", "- For each column, get the chunk's value counts \n", "- Build a dictionary keyed by column name, appending each chunk's value counts to that key's list\n", "- Once the dictionary is ready, for each column, combine the values for each key using pd.concat(dictionary[key])\n", "- Then group all values associated with each index by calling *groupby()* on the result, and use the Series.sum() method to get the final, combined value counts, stored in another dictionary keyed by column\n", "\n", "Display each column and its number of unique values\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['company_name', 'company_category_code', 'company_country_code', 'company_state_code', 'company_region', 'company_city', 'investor_name', 'investor_region', 'funding_round_type', 'funded_at', 'funded_month', 'funded_quarter']\n", "company_region 546\n", "funded_month 192\n", "company_name 11573\n", "funding_round_type 9\n", "funded_quarter 72\n", "company_city 1229\n", "investor_name 10465\n", "company_category_code 43\n", "investor_region 585\n", "company_country_code 2\n", "company_state_code 50\n", "funded_at 2808\n" ] } ], "source": [ "crunch_chunks = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "vc_dict = {}\n", "# note: columns that are all-NaN in the last chunk (e.g. investor_city) are excluded here\n", "obj_cols = list(dropped_chunk.select_dtypes(include=['object']).columns)\n", "print (obj_cols)\n", "for chunk in crunch_chunks:\n", "    text_chunk = chunk.select_dtypes(include=['object'])\n", "    for col in obj_cols:\n", "        tchunk_vc = text_chunk[col].value_counts()\n", "        if col in vc_dict:\n", "            vc_dict[col].append(tchunk_vc)\n", "        else:\n", "            vc_dict[col] = [tchunk_vc]\n", "\n", "final_dict = {}\n", "for col in vc_dict:\n", "    combined_vc = pd.concat(vc_dict[col])\n", "    final_vc = combined_vc.groupby(combined_vc.index).sum()\n", "    final_dict[col] = final_vc\n", "    print (col, len(final_vc))" ] },
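{ "cell_type": "markdown", "metadata": {}, "source": [ "Several of these columns have very few unique values relative to the 52,870 rows: company_country_code (2), funding_round_type (9), company_category_code (43) and company_state_code (50). That makes them natural candidates for the category dtype. A minimal sketch on one 5,000-row chunk (an illustration, not part of the original run; the exact savings will vary):" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# sketch: convert low-cardinality text columns to category and compare memory\n", "chunk = pdread_csv(csvfile, 5000)\n", "before = chunk.memory_usage(deep=True).sum()/1024**2\n", "for col in ['company_country_code', 'funding_round_type', 'company_category_code', 'company_state_code']:\n", "    chunk[col] = chunk[col].astype('category')\n", "after = chunk.memory_usage(deep=True).sum()/1024**2\n", "print(round(before, 2), 'MB ->', round(after, 2), 'MB')\n" ] },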
\n", "We will use query **PRAGMA table_info(table_name)** to return information on the table's column data types.\n", "\n" ] }, { "cell_type": "code", "execution_count": 135, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " cid name type notnull dflt_value pk\n", "0 0 company_permalink TEXT 0 None 0\n", "1 1 company_name TEXT 0 None 0\n", "2 2 company_category_code TEXT 0 None 0\n", "3 3 company_country_code TEXT 0 None 0\n", "4 4 company_state_code TEXT 0 None 0\n", "5 5 company_region TEXT 0 None 0\n", "6 6 company_city TEXT 0 None 0\n", "7 7 investor_permalink TEXT 0 None 0\n", "8 8 investor_name TEXT 0 None 0\n", "9 9 investor_category_code TEXT 0 None 0\n", "10 10 investor_country_code TEXT 0 None 0\n", "11 11 investor_state_code TEXT 0 None 0\n", "12 12 investor_region TEXT 0 None 0\n", "13 13 investor_city TEXT 0 None 0\n", "14 14 funding_round_type TEXT 0 None 0\n", "15 15 funded_at TEXT 0 None 0\n", "16 16 funded_month TEXT 0 None 0\n", "17 17 funded_quarter TEXT 0 None 0\n", "18 18 funded_year INTEGER 0 None 0\n", "19 19 raised_amount_usd REAL 0 None 0\n" ] } ], "source": [ "import sqlite3\n", "\n", "conn = sqlite3.connect('crunchbase.db')\n", "crunch_iter = pd.read_csv(csvfile, encoding='ISO-8859-1', chunksize=5000)\n", "for chunk in crunch_iter:\n", " chunk.to_sql('ventures', conn, if_exists='append', index=False)\n", "results_df = pd.read_sql('PRAGMA table_info(\"ventures\")', conn)\n", "print (results_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Column dtypes match with results in the previous step where \n", "\n", "*funded_year* was int64 \n", "\n", "*raised_amount_usd* was float64 \n", "\n", "It looks *Text* type are either Object or mixed Object&Float\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's find the size of *crunchbase.db* database file\n", "1. Using *ls -l* command\n", "2. Using wc command " ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-rw-r--r-- 1 dq root 32329728 Jun 24 21:19 crunchbase.db\r\n" ] } ], "source": [ "!ls -l crunchbase.db" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "32329728 crunchbase.db\r\n" ] } ], "source": [ "!wc -c crunchbase.db" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The database file is 32MB" ] } ], "metadata": { "anaconda-cloud": {}, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.4.3" } }, "nbformat": 4, "nbformat_minor": 2 }