{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "code", "collapsed": false, "input": [ "#load pandas\n", "import pandas as pd\n", "from IPython.parallel import Client\n", "from httplib import HTTPConnection\n", "\n", "\n", "cred, api_key, ep, url = None, None, None, None\n", "\n", "#path = '/Users/gk/Desktop/data_sci'\n", "path='/home/gkandlikar'\n", "\n", "def load_essentials():\n", " #load credentials, api key, url, end points, etc\n", " global cred, api_key, url, ep, path\n", " cred = pd.read_csv(''.join((path,'/credentials/credentials.csv')))\n", " url = 'api.crunchbase.com'\n", " ep = ['/v/1/companies.js?','/v/1/company/']\n", " with open(''.join((path,'/crunchbase/key')), 'r') as f:\n", " api_key = f.read().strip()\n", " return\n", "\n", "\n", "def gen_s3_obj(credentials, bucket=''):\n", " from boto.s3.connection import S3Connection\n", " s3conn=S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])\n", " if bucket != '':\n", " s3bucket = s3conn.get_bucket(bucket)\n", " return s3conn, s3bucket\n", "\n", "def fetch_and_file_comp_db(key = False):\n", " if not key:\n", " cb_conn=HTTPConnection(url)\n", " \n", " #initiate connection with crunchbase\n", " query = ''.join((ep[0],'api_key=',api_key))\n", " cb_conn.request(\"GET\",query)\n", " data=cb_conn.getresponse().read()\n", " \n", " #make a query for the file, read the request\n", " s3conn, s3bucket = gen_s3_obj(cred)\n", " compfile=s3bucket.new_key('crunchbase/companies.json')\n", " compfile.set_contents_from_string(data)\n", " s3conn.close()\n", " \n", " #create key on S3, set its contents\n", " return json.loads(data, strict = False)\n", " \n", " else:\n", " return json.loads(key.get_contents_as_string(), strict = False)\n", "\n", "def fetch_company(company):\n", " #establish connections \n", " cb_conn=HTTPConnection(url)\n", " s3conn, s3bucket=gen_s3_obj(cred, bucket='kandlikards')\n", " \n", " 
keyname=''.join(('crunchbase/',company['permalink'],'.json'))\n", " key = s3bucket.get_key(keyname)\n", " \n", " if ((not key) or ('Developer Over Qps' in key.get_contents_as_string())):\n", " #retrieve file from crunchbase\n", " #build query:'/v/1/company/company-name.js?api_key='\n", " query=''.join((ep[1],company['permalink'],'.js?api_key=',api_key))\n", " \n", " #initiate and read connection\n", " cb_conn.request(\"GET\",query)\n", " data=cb_conn.getresponse().read()\n", " \n", " while 'Developer Over Qps' in data:\n", " time.sleep(5)\n", " cb_conn.request(\"GET\",query)\n", " data=cb_conn.getresponse().read()\n", " \n", " key=s3bucket.new_key(keyname)\n", " key.set_contents_from_string(data)\n", " \n", " else :\n", " data=key.get_contents_as_string()\n", " \n", " try :\n", " json_data=json.loads(data, strict=False)\n", " except :\n", " json_data={'error': 'Load Error during json.loads()'}\n", " \n", " if 'error' in json_data.keys():\n", " retval=None\n", " else :\n", " try :\n", " retval = (company['permalink'],company['name'], company['category_code'], json_data['overview'], json_data['description'], json_data['tag_list'], None, None)\n", " except :\n", " retval = (company['permalink'],company['name'], company['category_code'], None, None, None, 'error', json_data)\n", " \n", " #close connections after we're done! 
\n", " cb_conn.close()\n", " s3conn.close()\n", " #sleep for 1/6th of a second\n", " time.sleep(0.16)\n", " return retval\n", " \n", "#get the essentials out of the way\n", "load_essentials()\n", "\n", "#load client\n", "client = Client()\n", "directview = client.direct_view()\n", "directview.push(dict(cred=cred, api_key=api_key, ep=ep, url=url, gen_s3_obj=gen_s3_obj))\n", "\n", "#make sure the necessary things are imported across clients\n", "with directview.sync_imports():\n", " import time, json\n", " from boto.s3.connection import S3Connection\n", " from httplib import HTTPConnection\n", "\n", "#initiate connection with S3\n", "#get s3bucket\n", "s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')\n", "\n", "#check if there is a json file of companies in S3\n", "#retrieve a json document with the companies\n", "companies_key='crunchbase/companies.json'\n", "if not s3bucket.get_key(companies_key):\n", " companies = fetch_and_file_comp_db()\n", "else :\n", " companies = fetch_and_file_comp_db(s3bucket.get_key(companies_key))\n", "\n", "results = directview.map_async(fetch_company, companies[:5000])\n", "\n", "s3conn.close()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "#connect to companies endpoint\n", "#getting information about every single company and filing it into companies.db\n", "import json\n", "import httplib\n", "import pprint as pp\n", "import sqlite3\n", "import codecs\n", "\n", "API_KEY = open('/Users/gk/Desktop/github/keys/cb', 'r').read().strip()\n", "OUTPUTDIR='/Users/gk/Desktop/data_sci/crunchbase/data'\n", "URL = 'api.crunchbase.com'\n", "COMPANIES_EL = '/v/1/companies.js?'\n", "COMPANY_EI = '/v/1/company/'\n", "EXCEPTION_FILE= ''.join((OUTPUTDIR,'/exceptions.txt'))\n", "\n", "\n", "def initiate_connection(url):\n", "\treturn httplib.HTTPConnection(url)\n", "\n", "def generate_description_string(company_detail):\n", " return '\\n'.join(filter(None, [company_detail[key] 
for key in ['overview', 'description', 'tag_list']]))\n", "\n", "def get_company_overview(cb_conn,permalink):\n", " global COMPANY_EI, API_KEY, EXCEPTION_FILE\n", " cb_conn.request(\"GET\",''.join((COMPANY_EI,permalink,'.js?api_key=',API_KEY)))\n", " return generate_description_string(json.loads(cb_conn.getresponse().read()))\n", "\n", "def db_vals(conn, company):\n", " code = lambda x: x if x != '' else None\n", " name = company['name']\n", " permalink = company['permalink']\n", " cat = code(company['category_code'])\n", " descr = get_company_overview(conn, permalink)\n", " return permalink, name, cat, descr\n", " \n", "\n", "def send_companies_query(conn, query, sql_connection):\n", " global EXCEPTION_FILE\n", " count = 0\n", " conn.request(\"GET\", query)\n", " val = conn.getresponse().read().replace('][',',')\n", " response = json.loads(val)\n", " for company in response:\n", " try :\n", " sql_connection.execute('''INSERT INTO companies VALUES (?, ?, ?, ?)''', db_vals(conn,company))\n", " count += 1\n", " if count % 100 == 0 :\n", " sql_connection.commit()\n", " print \"Count: %d\" % count\n", " except :\n", " with codecs.open(EXCEPTION_FILE, 'a') as f:\n", " f.write('\\n')\n", " f.write ('=======ERROR=======\\n')\n", " f.write(str(company))\n", " sql_connection.commit()\n", "\n", "def setup_companies_table():\n", " global OUTPUTDIR\n", " sql_connection = sqlite3.connect(''.join((OUTPUTDIR,\"/companies.db\")))\n", " sql_connection.cursor()\n", " sql_connection.execute('''DROP TABLE IF EXISTS companies''')\n", " sql_connection.execute('''CREATE TABLE companies (permalink text primary key, name text, cat text, descr text)''')\n", " sql_connection.commit()\n", " return sql_connection\n", " \n", "def companies():\n", " global URL, COMPANIES_EL, API_KEY\n", " cb_connection=initiate_connection(URL)\n", " sql_connection = setup_companies_table()\n", " query = \"%sapi_key=%s\" % (COMPANIES_EL, API_KEY)\n", " send_companies_query (cb_connection, query, 
sql_connection)\n", " cb_connection.close()\n", " sql_connection.close()\n", "\n", "if __name__ == '__main__':\n", "\tcompanies()" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "'''Write to files'''" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "#connect to companies endpoint\n", "#getting information about every single company and filing it into companies.db\n", "import json\n", "import httplib\n", "import sqlite3\n", "import codecs\n", "import sys\n", "\n", "PATH = '/Users/gk/Desktop/data_sci/crunchbase'\n", "API_KEY = open(''.join((PATH,'/keys/cb')), 'r').read().strip()\n", "OUTPUTDIR=''.join((PATH,'/data'))\n", "URL = 'api.crunchbase.com'\n", "COMPANIES_EL = '/v/1/companies.js?'\n", "COMPANY_EI = '/v/1/company/'\n", "EXCEPTION_FILE= ''.join((OUTPUTDIR,'/exceptions.txt'))\n", "\n", "\n", "def initiate_connection(url):\n", "\treturn httplib.HTTPConnection(url)\n", "\n", "def fetch_company_file(cb_conn, company):\n", " #fetch one company's detail json from crunchbase and write it to disk if not already present\n", " global COMPANY_EI, API_KEY, EXCEPTION_FILE, OUTPUTDIR\n", " query=''.join((COMPANY_EI,company['permalink'],'.js?api_key=',API_KEY))\n", " company_file=''.join((OUTPUTDIR,'/',company['permalink'],'.json'))\n", " cb_conn.request(\"GET\",query)\n", " data=cb_conn.getresponse().read()\n", " #fix: mode 'r+' raises IOError when the file does not exist yet; 'a+' creates it,\n", " #and the seek(0) keeps the original read-before-write (only fill empty files) behavior\n", " with open(company_file, 'a+') as f:\n", " f.seek(0)\n", " if len(f.read()) < 1:\n", " f.write(data)\n", "\n", " \n", "def fetch_companies(cb_conn, companies_file):\n", " #walk the companies list from the saved offset, fetching up to 200 per run\n", " global EXCEPTION_FILE\n", " with open(''.join((PATH,'/data/count.txt'))) as count_file:\n", " count = int(count_file.read().strip())\n", " with open(companies_file) as f:\n", " companies = json.loads(f.read(), strict=False)\n", " for company in companies[count:]:\n", " if count > 200:\n", " break\n", " #fix: the except clause below had no matching try, a SyntaxError\n", " try :\n", " fetch_company_file(cb_conn, company)\n", " except :\n", " extype, value = sys.exc_info()[:2]\n", " with open(EXCEPTION_FILE, 'a') as g:\n", " g.write('\n=======ERROR: 
FETCH_COMPANIES=======')\n", " g.write('\nTYPE: %s \t VALUE: %s' % (extype, value))\n", " g.write('\n'+str(company))\n", " count += 1\n", " with open(''.join((PATH,'/data/count.txt')),'w') as count_file:\n", " count_file.write(str(count))\n", "\n", "def fetch_companies_file(cb_conn):\n", " #ensure companies.json exists locally, downloading it on first run; returns its path\n", " global COMPANIES_EL, API_KEY, OUTPUTDIR, EXCEPTION_FILE\n", " json_file=''.join((OUTPUTDIR,'/companies.json'))\n", " #fix: mode 'r+' raises IOError when companies.json does not exist yet; 'a+' creates it,\n", " #and the seek(0) keeps the original only-download-when-empty behavior\n", " with open(json_file, 'a+') as f:\n", " f.seek(0)\n", " if len(f.read()) < 1:\n", " query = ''.join((COMPANIES_EL,\"api_key=\",API_KEY))\n", " cb_conn.request(\"GET\", query)\n", " f.write(cb_conn.getresponse().read().replace('][',',')) \n", " return json_file\n", "\n", "def companies():\n", " global URL\n", " cb_connection=initiate_connection(URL)\n", " companies_file = fetch_companies_file(cb_connection)\n", " fetch_companies(cb_connection, companies_file)\n", " cb_connection.close()\n", "\n", "if __name__ == '__main__':\n", "\tcompanies()\n", "\n" ], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "import IPython, json\n", "import pandas as pd\n", "from IPython.parallel import Client\n", "\n", "def get_cb_apikey(path):\n", " retval=''\n", " with open(''.join((path,'/crunchbase/key')), 'r') as f:\n", " retval = f.read().strip()\n", " return retval\n", "\n", "def get_s3_bucket(credentials):\n", " from boto.s3.connection import S3Connection\n", " s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])\n", " s3bucket = s3conn.get_bucket('kandlikards')\n", " return s3conn, s3bucket\n", "\n", "def get_cb_conn():\n", " from httplib import HTTPConnection\n", " return HTTPConnection('api.crunchbase.com')\n", "\n", " \n", "def fetch_comp_json(s3bucket):\n", " import json\n", " \n", " #load the companies file as a json object\n", " #return the json object\n", " \n", " companies_key=s3bucket.get_key('crunchbase/companies.json')\n", " return json.loads(companies_key.get_contents_as_string(), 
strict=False)\n", "\n", "\n", "\n", "#local paths\n", "path = '/Users/gk/Desktop/data_sci'\n", "#path='/home/gkandlikar'\n", "\n", "#crunchbase stuff\n", "apikey = get_cb_apikey(path)\n", "url = 'api.crunchbase.com'\n", "ep = '/v/1/company/'\n", "\n", "#S3 stuff\n", "#credentials = pd.read_csv(''.join((PATH,'/credentials.csv')))\n", "cred = pd.read_csv(''.join((path, '/credentials/credentials.csv')))\n", "s3conn, s3bucket =get_s3_bucket(cred)\n", "\n", "#get the json object that holds references to all the companies\n", "#companies = fetch_comp_json(s3bucket)\n", "\n", "existing_companies=s3bucket.list('crunchbase/data')\n", "\n", "#cluster stuff\n", "c=Client()\n", "dview = c.direct_view()\n", "\n", "#make it non-blocking so it frees us up to do something else with this notebook.\n", "dview.block=False\n", "\n", "#spread out the companies among the nodes in the cluster\n", "dview.scatter('existing_companies', [company for company in existing_companies], dist='r')\n", " \n", "#pass in the values which are going to be used to fetch each file\n", "#notice that I am not using %%px, so I my function is only being defined\n", "#locally. 
This is why I have to pass functions like get_s3_bucket and get_cb_conn\n", "\n", "#My other option was to pass them in as arguments for the\n", "# fetch_company_files function, but I thought this was a better way of going about it.\n", "\n", "dview.push(dict(apikey=apikey, url=url, ep=ep, cred=cred, get_cb_conn=get_cb_conn, get_s3_bucket=get_s3_bucket, fetch_company_files=fetch_company_files))\n", "\n", "try:\n", " %px fetch_company_files()\n", " val = reduce(lambda x, y: x+y, result)\n", " print ('Retrieved %d companies' % val)\n", "\n", "except :\n", " print 'fail'\n", " pass\n", "\n", "\n", "#close yo connections before you wreck yo connections\n", "s3conn.close()" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "fail\n" ] } ], "prompt_number": 21 }, { "cell_type": "code", "collapsed": false, "input": [ "lsmagic" ], "language": "python", "metadata": {}, "outputs": [ { "json": [ "{\"cell\": {\"prun\": \"ExecutionMagics\", \"file\": \"Other\", \"!\": \"OSMagics\", \"capture\": \"ExecutionMagics\", \"timeit\": \"ExecutionMagics\", \"script\": \"ScriptMagics\", \"ruby\": \"Other\", \"px\": \"ParallelMagics\", \"system\": \"OSMagics\", \"perl\": \"Other\", \"HTML\": \"Other\", \"bash\": \"Other\", \"python\": \"Other\", \"SVG\": \"Other\", \"javascript\": \"DisplayMagics\", \"writefile\": \"OSMagics\", \"pypy\": \"Other\", \"python3\": \"Other\", \"latex\": \"DisplayMagics\", \"sx\": \"OSMagics\", \"svg\": \"DisplayMagics\", \"html\": \"DisplayMagics\", \"sh\": \"Other\", \"time\": \"ExecutionMagics\", \"debug\": \"ExecutionMagics\"}, \"line\": {\"psource\": \"NamespaceMagics\", \"logstart\": \"LoggingMagics\", \"popd\": \"OSMagics\", \"loadpy\": \"CodeMagics\", \"install_ext\": \"ExtensionMagics\", \"colors\": \"BasicMagics\", \"who_ls\": \"NamespaceMagics\", \"install_profiles\": \"DeprecatedMagics\", \"pprint\": \"BasicMagics\", \"save\": \"CodeMagics\", \"tb\": \"ExecutionMagics\", \"pylab\": 
\"PylabMagics\", \"killbgscripts\": \"ScriptMagics\", \"quickref\": \"BasicMagics\", \"magic\": \"BasicMagics\", \"dhist\": \"OSMagics\", \"edit\": \"KernelMagics\", \"logstop\": \"LoggingMagics\", \"gui\": \"BasicMagics\", \"alias_magic\": \"BasicMagics\", \"debug\": \"ExecutionMagics\", \"page\": \"BasicMagics\", \"logstate\": \"LoggingMagics\", \"ed\": \"Other\", \"pushd\": \"OSMagics\", \"timeit\": \"ExecutionMagics\", \"rehashx\": \"OSMagics\", \"hist\": \"Other\", \"qtconsole\": \"KernelMagics\", \"autopx\": \"ParallelMagics\", \"dirs\": \"OSMagics\", \"run\": \"ExecutionMagics\", \"reset_selective\": \"NamespaceMagics\", \"pinfo2\": \"NamespaceMagics\", \"matplotlib\": \"PylabMagics\", \"unload_ext\": \"ExtensionMagics\", \"doctest_mode\": \"KernelMagics\", \"logoff\": \"LoggingMagics\", \"reload_ext\": \"ExtensionMagics\", \"history\": \"HistoryMagics\", \"pdb\": \"ExecutionMagics\", \"load\": \"CodeMagics\", \"lsmagic\": \"BasicMagics\", \"autosave\": \"KernelMagics\", \"cd\": \"OSMagics\", \"pastebin\": \"CodeMagics\", \"prun\": \"ExecutionMagics\", \"pxresult\": \"ParallelMagics\", \"autocall\": \"AutoMagics\", \"bookmark\": \"OSMagics\", \"connect_info\": \"KernelMagics\", \"px\": \"ParallelMagics\", \"system\": \"OSMagics\", \"whos\": \"NamespaceMagics\", \"automagic\": \"AutoMagics\", \"store\": \"StoreMagics\", \"more\": \"KernelMagics\", \"pdef\": \"NamespaceMagics\", \"precision\": \"BasicMagics\", \"pinfo\": \"NamespaceMagics\", \"pwd\": \"OSMagics\", \"psearch\": \"NamespaceMagics\", \"reset\": \"NamespaceMagics\", \"recall\": \"HistoryMagics\", \"xdel\": \"NamespaceMagics\", \"xmode\": \"BasicMagics\", \"rerun\": \"HistoryMagics\", \"logon\": \"LoggingMagics\", \"result\": \"ParallelMagics\", \"pycat\": \"OSMagics\", \"pxconfig\": \"ParallelMagics\", \"unalias\": \"OSMagics\", \"install_default_config\": \"DeprecatedMagics\", \"env\": \"OSMagics\", \"load_ext\": \"ExtensionMagics\", \"config\": \"ConfigMagics\", \"profile\": \"BasicMagics\", 
\"pfile\": \"NamespaceMagics\", \"less\": \"KernelMagics\", \"who\": \"NamespaceMagics\", \"notebook\": \"BasicMagics\", \"man\": \"KernelMagics\", \"sx\": \"OSMagics\", \"macro\": \"ExecutionMagics\", \"clear\": \"KernelMagics\", \"alias\": \"OSMagics\", \"time\": \"ExecutionMagics\", \"sc\": \"OSMagics\", \"rep\": \"Other\", \"pdoc\": \"NamespaceMagics\"}}" ], "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "Available line magics:\n", "%alias %alias_magic %autocall %automagic %autopx %autosave %bookmark %cd %clear %colors %config %connect_info %debug %dhist %dirs %doctest_mode %ed %edit %env %gui %hist %history %install_default_config %install_ext %install_profiles %killbgscripts %less %load %load_ext %loadpy %logoff %logon %logstart %logstate %logstop %lsmagic %macro %magic %man %matplotlib %more %notebook %page %pastebin %pdb %pdef %pdoc %pfile %pinfo %pinfo2 %popd %pprint %precision %profile %prun %psearch %psource %pushd %pwd %px %pxconfig %pxresult %pycat %pylab %qtconsole %quickref %recall %rehashx %reload_ext %rep %rerun %reset %reset_selective %result %run %save %sc %store %sx %system %tb %time %timeit %unalias %unload_ext %who %who_ls %whos %xdel %xmode\n", "\n", "Available cell magics:\n", "%%! %%HTML %%SVG %%bash %%capture %%debug %%file %%html %%javascript %%latex %%perl %%prun %%px %%pypy %%python %%python3 %%ruby %%script %%sh %%svg %%sx %%system %%time %%timeit %%writefile\n", "\n", "Automagic is ON, % prefix IS NOT needed for line magics." ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }