{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# IMDB datasets\n", "This notebook contains the Python-3 script to get IMDB movie data into a database from their downloadable datasets. It assumes you have psycopg2, which you can install from conda. I recommend making a **new** conda environment, so your _root_ environment won't get corrupted. For example, the following command in a conda shell would create a new Python-3.6 environment with psycopg2.\n", "\n", " $ conda create -n py36-imdb python=3.6.3 psycopg2\n", "\n", "[psycopg2](http://initd.org/psycopg/) is a binding to the [PostgreSQL](https://www.postgresql.org/) C-API that uses the [Python DB-API 2.0 specification](https://www.python.org/dev/peps/pep-0249/). This means it uses the same syntax to connect and execute queries as the [builtin `sqlite3` Python module](https://docs.python.org/3.6/library/sqlite3.html)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# imports\n", "import csv\n", "import gzip\n", "import os\n", "import urllib\n", "import psycopg2, psycopg2.extras\n", "import logging\n", "\n", "# logging\n", "logging.basicConfig()\n", "LOGGER = logging.getLogger(__name__)\n", "LOGGER.setLevel(logging.DEBUG)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# meta data about the URL, the data tarballs, and their schema\n", "IMDB_datasets = 'https://datasets.imdbws.com/'\n", "datasets = {\n", " 'title_basics': 'title.basics.tsv.gz', # information for titles\n", " 'title_crew': 'title.crew.tsv.gz', # the director and writer information for all the titles in IMDb\n", " 'title_episode': 'title.episode.tsv.gz', # the tv episode information\n", " 'title_principals': 'title.principals.tsv.gz',\n", " 'title_ratings': 'title.ratings.tsv.gz',\n", " 'name_basics': 'name.basics.tsv.gz'\n", "}\n", "DETAILS = {\n", " 'title_basics': [\n", " ('tconst', 'TEXT PRIMARY KEY'), ('titleType', 'TEXT'), ('primaryTitle', 'TEXT'),\n", " ('originalTitle', 'TEXT'), ('isAdult', 'INT'), ('startYear', 'INT'),\n", " ('endYear', 'INT'), ('runtimeMinutes', 'INT'), ('genres', 'TEXT')\n", " ],\n", "}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# info to connect to an online database set up for this tutorial\n", "DBNAME = 'breaking-bytes_imdb'\n", "HOST = 'postgresql-breaking-bytes.alwaysdata.net'\n", "PORT = 5432\n", "USER = os.getenv('USER')\n", "PASSWORD = os.getenv('PASSWORD')\n", "\n", "# connect to the database and get a Python DB-API 2.0 \"cursor\"\n", "conn = psycopg2.connect(dbname=DBNAME, user=USER, password=PASSWORD, host=HOST, port=PORT)\n", "cur = conn.cursor()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# create title.basic table\n", "EXPR = ('CREATE TABLE title_basics(%s, %s, %s, %s, %s, %s, %s, %s, %s);'\n", " % tuple(' '.join(kv) for kv in DETAILS['title_basics']))\n", "cur.execute(EXPR)\n", "conn.commit()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# get the 'title.basics' dataset\n", "with urllib.request.urlopen(IMDB_datasets + datasets['title_basics']) as http_response:\n", " tsv = gzip.decompress(http_response.read())" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:__main__:['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear', 'endYear', 'runtimeMinutes', 'genres']\n" ] } ], "source": [ "# split the lines into rows\n", "rows = tsv.decode('utf-8').split('\\n')\n", "\n", "LOGGER.debug(rows[0].split('\\t')) # take a look at the headers\n", "\n", "# skip the first row with the headers and the last row with only a newline\n", "rows = rows[1:-1]\n", "\n", "# split the rows into columns\n", "records = [row.split('\\t') for row in rows]\n", "\n", "# replace IMDB's NULL, \"\\\\N\", which None \n", "records = [tuple(None if r == '\\\\N' else r for r in record)\n", " for record in records]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "DEBUG:__main__:loop: 0, start: 0, stop: 48137\n", "DEBUG:__main__:loop: 1, start: 48137, stop: 96274\n", "DEBUG:__main__:loop: 2, start: 96274, stop: 144411\n", "DEBUG:__main__:loop: 3, start: 144411, stop: 192548\n", "DEBUG:__main__:loop: 4, start: 192548, stop: 240685\n", "DEBUG:__main__:loop: 5, start: 240685, stop: 288822\n", "DEBUG:__main__:loop: 6, start: 288822, stop: 336959\n", "DEBUG:__main__:loop: 7, start: 336959, stop: 385096\n", "DEBUG:__main__:loop: 8, start: 385096, stop: 433233\n", "DEBUG:__main__:loop: 9, start: 433233, stop: 481370\n", "DEBUG:__main__:loop: 10, start: 481370, stop: 529507\n", "DEBUG:__main__:loop: 11, start: 529507, stop: 577644\n", "DEBUG:__main__:loop: 12, start: 577644, stop: 625781\n", "DEBUG:__main__:loop: 13, start: 625781, stop: 673918\n", "DEBUG:__main__:loop: 14, start: 673918, stop: 722055\n", "DEBUG:__main__:loop: 15, start: 722055, stop: 770192\n", "DEBUG:__main__:loop: 16, start: 770192, stop: 818329\n", "DEBUG:__main__:loop: 17, start: 818329, stop: 866466\n", "DEBUG:__main__:loop: 18, start: 866466, stop: 914603\n", "DEBUG:__main__:loop: 19, start: 914603, stop: 962740\n", "DEBUG:__main__:loop: 20, start: 962740, stop: 1010877\n", "DEBUG:__main__:loop: 21, start: 1010877, stop: 1059014\n", "DEBUG:__main__:loop: 22, start: 1059014, stop: 1107151\n", "DEBUG:__main__:loop: 23, start: 1107151, stop: 1155288\n", "DEBUG:__main__:loop: 24, start: 1155288, stop: 1203425\n", "DEBUG:__main__:loop: 25, start: 1203425, stop: 1251562\n", "DEBUG:__main__:loop: 26, start: 1251562, stop: 1299699\n", "DEBUG:__main__:loop: 27, start: 1299699, stop: 1347836\n", "DEBUG:__main__:loop: 28, start: 1347836, stop: 1395973\n", "DEBUG:__main__:loop: 29, start: 1395973, stop: 1444110\n", "DEBUG:__main__:loop: 30, start: 1444110, stop: 1492247\n", "DEBUG:__main__:loop: 31, start: 1492247, stop: 1540384\n", "DEBUG:__main__:loop: 32, start: 1540384, stop: 1588521\n", "DEBUG:__main__:loop: 33, start: 1588521, stop: 1636658\n", "DEBUG:__main__:loop: 34, start: 1636658, stop: 1684795\n", "DEBUG:__main__:loop: 35, start: 1684795, stop: 1732932\n", "DEBUG:__main__:loop: 36, start: 1732932, stop: 1781069\n", "DEBUG:__main__:loop: 37, start: 1781069, stop: 1829206\n", "DEBUG:__main__:loop: 38, start: 1829206, stop: 1877343\n", "DEBUG:__main__:loop: 39, start: 1877343, stop: 1925480\n", "DEBUG:__main__:loop: 40, start: 1925480, stop: 1973617\n", "DEBUG:__main__:loop: 41, start: 1973617, stop: 2021754\n", "DEBUG:__main__:loop: 42, start: 2021754, stop: 2069891\n", "DEBUG:__main__:loop: 43, start: 2069891, stop: 2118028\n", "DEBUG:__main__:loop: 44, start: 2118028, stop: 2166165\n", "DEBUG:__main__:loop: 45, start: 2166165, stop: 2214302\n", "DEBUG:__main__:loop: 46, start: 2214302, stop: 2262439\n", "DEBUG:__main__:loop: 47, start: 2262439, stop: 2310576\n", "DEBUG:__main__:loop: 48, start: 2310576, stop: 2358713\n", "DEBUG:__main__:loop: 49, start: 2358713, stop: 2406850\n", "DEBUG:__main__:loop: 50, start: 2406850, stop: 2454987\n", "DEBUG:__main__:loop: 51, start: 2454987, stop: 2503124\n", "DEBUG:__main__:loop: 52, start: 2503124, stop: 2551261\n", "DEBUG:__main__:loop: 53, start: 2551261, stop: 2599398\n", "DEBUG:__main__:loop: 54, start: 2599398, stop: 2647535\n", "DEBUG:__main__:loop: 55, start: 2647535, stop: 2695672\n", "DEBUG:__main__:loop: 56, start: 2695672, stop: 2743809\n", "DEBUG:__main__:loop: 57, start: 2743809, stop: 2791946\n", "DEBUG:__main__:loop: 58, start: 2791946, stop: 2840083\n", "DEBUG:__main__:loop: 59, start: 2840083, stop: 2888220\n", "DEBUG:__main__:loop: 60, start: 2888220, stop: 2936357\n", "DEBUG:__main__:loop: 61, start: 2936357, stop: 2984494\n", "DEBUG:__main__:loop: 62, start: 2984494, stop: 3032631\n", "DEBUG:__main__:loop: 63, start: 3032631, stop: 3080768\n", "DEBUG:__main__:loop: 64, start: 3080768, stop: 3128905\n", "DEBUG:__main__:loop: 65, start: 3128905, stop: 3177042\n", "DEBUG:__main__:loop: 66, start: 3177042, stop: 3225179\n", "DEBUG:__main__:loop: 67, start: 3225179, stop: 3273316\n", "DEBUG:__main__:loop: 68, start: 3273316, stop: 3321453\n", "DEBUG:__main__:loop: 69, start: 3321453, stop: 3369590\n", "DEBUG:__main__:loop: 70, start: 3369590, stop: 3417727\n", "DEBUG:__main__:loop: 71, start: 3417727, stop: 3465864\n", "DEBUG:__main__:loop: 72, start: 3465864, stop: 3514001\n", "DEBUG:__main__:loop: 73, start: 3514001, stop: 3562138\n", "DEBUG:__main__:loop: 74, start: 3562138, stop: 3610275\n", "DEBUG:__main__:loop: 75, start: 3610275, stop: 3658412\n", "DEBUG:__main__:loop: 76, start: 3658412, stop: 3706549\n", "DEBUG:__main__:loop: 77, start: 3706549, stop: 3754686\n", "DEBUG:__main__:loop: 78, start: 3754686, stop: 3802823\n", "DEBUG:__main__:loop: 79, start: 3802823, stop: 3850960\n", "DEBUG:__main__:loop: 80, start: 3850960, stop: 3899097\n", "DEBUG:__main__:loop: 81, start: 3899097, stop: 3947234\n", "DEBUG:__main__:loop: 82, start: 3947234, stop: 3995371\n", "DEBUG:__main__:loop: 83, start: 3995371, stop: 4043508\n", "DEBUG:__main__:loop: 84, start: 4043508, stop: 4091645\n", "DEBUG:__main__:loop: 85, start: 4091645, stop: 4139782\n", "DEBUG:__main__:loop: 86, start: 4139782, stop: 4187919\n", "DEBUG:__main__:loop: 87, start: 4187919, stop: 4236056\n", "DEBUG:__main__:loop: 88, start: 4236056, stop: 4284193\n", "DEBUG:__main__:loop: 89, start: 4284193, stop: 4332330\n", "DEBUG:__main__:loop: 90, start: 4332330, stop: 4380467\n", "DEBUG:__main__:loop: 91, start: 4380467, stop: 4428604\n", "DEBUG:__main__:loop: 92, start: 4428604, stop: 4476741\n", "DEBUG:__main__:loop: 93, start: 4476741, stop: 4524878\n", "DEBUG:__main__:loop: 94, start: 4524878, stop: 4573015\n", "DEBUG:__main__:loop: 95, start: 4573015, stop: 4621152\n", "DEBUG:__main__:loop: 96, start: 4621152, stop: 4669289\n", "DEBUG:__main__:loop: 97, start: 4669289, stop: 4717426\n", "DEBUG:__main__:loop: 98, start: 4717426, stop: 4765563\n", "DEBUG:__main__:loop: 99, start: 4765563, stop: 4813700\n" ] } ], "source": [ "# copy this data to the database\n", "COUNT = len(records)\n", "LOOPS = 100\n", "CHUNKS = COUNT // LOOPS\n", "\n", "idx = 0\n", "for n in range(LOOPS):\n", " jdx = idx + CHUNKS\n", " LOGGER.debug('loop: %d, start: %d, stop: %d', n, idx, jdx)\n", " psycopg2.extras.execute_values(cur, 'INSERT INTO title_basics VALUES %s;', records[idx:jdx])\n", " conn.commit()\n", " idx += CHUNKS" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "cur.close()\n", "conn.close()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4813797\n" ] } ], "source": [ "print(COUNT)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.3" } }, "nbformat": 4, "nbformat_minor": 2 }