{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Conver the original data\n", "\n", "obtained from https://www.transtats.bts.gov/DL_SelectFields.asp\n", "\n", "on Sunday, 8-Sept-2019" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Converting the original airlines data to `hdf5`" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T20:16:31.517614Z", "start_time": "2019-09-19T20:16:29.445382Z" } }, "outputs": [], "source": [ "import os \n", "import glob\n", "from zipfile import ZipFile\n", "\n", "import numpy as np\n", "\n", "import pandas as pd\n", "pd.set_option('display.max_columns', 500)\n", "\n", "import vaex\n", "\n", "from tqdm import tqdm_notebook as tqdm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Settings" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T10:09:42.429415Z", "start_time": "2019-09-19T10:09:42.422859Z" } }, "outputs": [], "source": [ "# Columns to read\n", "columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', \n", " 'Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number', \n", " 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',\n", " 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', \n", " 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']\n", "\n", "# Force dtypes for these columns - some choices are made to optimize disk space\n", "dtypes = {\n", "'Year': np.int16,\n", "'Month': np.int8,\n", "'DayofMonth': np.int16,\n", "'DayOfWeek': np.int8,\n", "'DepTime': 'Int16',\n", "'CRSDepTime': 'Int16',\n", "'ArrTime': 'Int16',\n", "'CRSArrTime': 'Int16',\n", "'Reporting_Airline': np.object,\n", "'Flight_Number_Reporting_Airline': np.int32,\n", "'Tail_Number': np.object,\n", "'ActualElapsedTime': 'Int32',\n", "'CRSElapsedTime': 'Int32',\n", "'AirTime': 'Int32',\n", "'ArrDelay': 'Int32',\n", "'DepDelay': 'Int32',\n", "'Origin': np.object,\n", "'Dest': np.object,\n", "'Distance': 'Int32',\n", "'TaxiIn': 'Int32',\n", "'TaxiOut': 'Int32',\n", "'Cancelled': 'Int8',\n", "'CancellationCode': np.object,\n", "'Diverted': 'Int8',\n", "'CarrierDelay': 'Int32',\n", "'WeatherDelay': 'Int32',\n", "'NASDelay': 'Int32',\n", "'SecurityDelay': 'Int32',\n", "'LateAircraftDelay': 'Int32',\n", "}\n", "\n", "# Set up a renaming dictionary, in order to make the column names to match the well known data from \n", "# http://stat-computing.org/dataexpo/2009/the-data.html\n", "rename_dict = {\n", " 'DayofMonth': 'DayOfMonth',\n", " 'Reporting_Airline': 'UniqueCarrier',\n", " 'Flight_Number_Reporting_Airline': 'FlightNum',\n", " 'Tail_Number': 'TailNum'\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Conversion" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T10:09:44.179421Z", "start_time": "2019-09-19T10:09:44.172912Z" } }, "outputs": [], "source": [ "# Set up the list of zip files to be opened and converted\n", "zip_list = np.sort(np.array(glob.glob('./airlines-us-original/raw/*.zip')))[::-1]\n", "\n", "# The output directory\n", "output_dir = './airlines-us-original/hdf5/'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T11:42:14.186704Z", "start_time": "2019-09-19T10:09:52.752002Z" } }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Converting to hdf5...', max=372, style=ProgressStyle(descript…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\r" ] } ], "source": [ "# The magic happens heree:\n", "for file in tqdm(zip_list, leave=False, desc='Converting to hdf5...'):\n", " # Setting up the files, and directories\n", " zip_file = ZipFile(file)\n", " output_file = file.split('/')[-1][:-3]+'hdf5'\n", " output = output_dir + output_file\n", " \n", " # Check if a converted file already exists: if it does skip it, otherwise read in the raw csv and convert it\n", " if (os.path.exists(output) and os.path.isfile(output)):\n", " pass\n", " else:\n", " # Importing the data into pandas \n", " pandas_df = [pd.read_csv(zip_file.open(text_file.filename), \n", " encoding='latin',\n", " usecols=columns,\n", " dtype=dtypes,)\n", " for text_file in zip_file.infolist()\n", " if text_file.filename.endswith('.csv')][0]\n", " # Rename some columns to match the more well known dataset from \n", " # http://stat-computing.org/dataexpo/2009/the-data.html\n", " pandas_df.rename(columns=rename_dict, inplace=True)\n", "\n", " # Importing the data from pandas to vaex\n", " vaex_df = vaex.from_pandas(pandas_df, copy_index=False)\n", " \n", " # Export the data with vaex to hdf5\n", " vaex_df.export_hdf5(path=output, progress=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notes:\n", " - number of columns in the Data expo version: 29\n", "\n", "Here we go:\n", "1. Year -> Year\n", "- Month -> Month\n", "- DayOfMonth -> DayofMonth\n", "- DayOfWeek -> DayOfWeek\n", "- DepTime -> DepTime\n", "- CRSDepTime -> CRSDepTime\n", "- ArrTime -> ArrTime \n", "- CRSArrTime -> CRSArrTime \n", "- Reporting_Airline -> UniqueCarrier\n", "- Flight_Number_Reporting_Airline -> FlightNum\n", "- Tail_Number -> TailNum\n", "- ActualElapsedTime -> ActualElapsedTime\n", "- CRSElapsedTime -> CRSElapsedTime\n", "- AirTime -> AirTime\n", "- ArrDelay -> ArrDelay\n", "- DepDelay -> DepDelay\n", "- Origin -> Origin\n", "- Dest -> Dest\n", "- Distance -> Distance\n", "- TaxiIn -> TaxiIn\n", "- TaxiOut -> TaxiOut\n", "- Cancelled -> Cancelled\n", "- CancellationCode -> CancellationCode\n", "- Diverted -> Diverted\n", "- CarrierDelay -> CarrierDelay\n", "- WeatherDelay -> WeatherDelay \n", "- NASDelay -> NASDelay \n", "- SecurityDelay -> SecurityDelay\n", "- LateAircraftDelay -> LateAircraftDelay" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now merge all the little files together\n", "\n", "Note that macOS has a rather low limit on the number of files one can open at one time. To circumvent this issue, in a terminal run:\n", "\n", "`>ulimit -n 9999`\n", "\n", "It should be the same terminal from which later the jupyter server is started.\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T11:52:13.445615Z", "start_time": "2019-09-19T11:52:11.931233Z" } }, "outputs": [], "source": [ "import re\n", "import glob\n", "import vaex\n", "import numpy as np\n", "\n", "def tryint(s):\n", " try:\n", " return int(s)\n", " except:\n", " return s\n", "\n", "def alphanum_key(s):\n", " \"\"\" Turn a string into a list of string and number chunks.\n", " \"z23a\" -> [\"z\", 23, \"a\"]\n", " \"\"\"\n", " return [ tryint(c) for c in re.split('([0-9]+)', s) ]" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T11:52:13.454973Z", "start_time": "2019-09-19T11:52:13.446990Z" } }, "outputs": [], "source": [ "hdf5_list = glob.glob('./airlines-us-original/hdf5/*.hdf5')\n", "hdf5_list.sort(key=alphanum_key)\n", "hdf5_list = np.array(hdf5_list)\n", "\n", "assert len(hdf5_list) == 372, \"Incorrect number of files\"" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T12:07:34.862383Z", "start_time": "2019-09-19T11:58:04.846356Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[########################################]: 100.00% estimated time: 0s = 0.0m = 0.0h " ] } ], "source": [ "# This is an important step\n", "master_df = vaex.open_many(hdf5_list)\n", "\n", "# exporting\n", "master_df.export_hdf5(path='./airline_data_1988_2018.hd5', progress=True)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T12:19:53.818836Z", "start_time": "2019-09-19T12:19:53.771996Z" } }, "outputs": [ { "data": { "text/html": [ "
# | Year | Month | DayOfMonth | DayOfWeek | UniqueCarrier | TailNum | FlightNum | Origin | Dest | CRSDepTime | DepTime | DepDelay | TaxiOut | TaxiIn | CRSArrTime | ArrTime | ArrDelay | Cancelled | CancellationCode | Diverted | CRSElapsedTime | ActualElapsedTime | AirTime | Distance | CarrierDelay | WeatherDelay | NASDelay | SecurityDelay | LateAircraftDelay |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1988 | 1 | 8 | 5 | PI | None | 930 | BGM | ITH | 1525 | 1532 | 7 | -- | -- | 1545 | 1555 | 10 | 0 | None | 0 | 20 | 23 | -- | 32 | -- | -- | -- | -- | -- |
1 | 1988 | 1 | 9 | 6 | PI | None | 930 | BGM | ITH | 1525 | 1522 | -3 | -- | -- | 1545 | 1535 | -10 | 0 | None | 0 | 20 | 13 | -- | 32 | -- | -- | -- | -- | -- |
2 | 1988 | 1 | 10 | 7 | PI | None | 930 | BGM | ITH | 1525 | 1522 | -3 | -- | -- | 1545 | 1534 | -11 | 0 | None | 0 | 20 | 12 | -- | 32 | -- | -- | -- | -- | -- |
3 | 1988 | 1 | 11 | 1 | PI | None | 930 | BGM | ITH | 1525 | -- | -- | -- | -- | 1545 | -- | -- | 1 | None | 0 | 20 | -- | -- | 32 | -- | -- | -- | -- | -- |
4 | 1988 | 1 | 12 | 2 | PI | None | 930 | BGM | ITH | 1525 | 1524 | -1 | -- | -- | 1545 | 1540 | -5 | 0 | None | 0 | 20 | 16 | -- | 32 | -- | -- | -- | -- | -- |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
183,821,921 | 2018 | 12 | 27 | 4 | DL | N705TW | 864 | JFK | SLC | 935 | 929 | -6 | 28 | 9 | 1311 | 1230 | -41 | 0 | None | 0 | 336 | 301 | 264 | 1990 | -- | -- | -- | -- | -- |
183,821,922 | 2018 | 12 | 27 | 4 | DL | N336NB | 865 | MSP | SLC | 1035 | 1030 | -5 | 14 | 10 | 1240 | 1214 | -26 | 0 | None | 0 | 185 | 164 | 140 | 991 | -- | -- | -- | -- | -- |
183,821,923 | 2018 | 12 | 27 | 4 | DL | N945DN | 866 | DEN | MSP | 1054 | 1100 | 6 | 10 | 5 | 1353 | 1350 | -3 | 0 | None | 0 | 119 | 110 | 95 | 680 | -- | -- | -- | -- | -- |
183,821,924 | 2018 | 12 | 27 | 4 | DL | N945DN | 866 | MSP | DEN | 850 | 850 | 0 | 15 | 8 | 1010 | 951 | -19 | 0 | None | 0 | 140 | 121 | 98 | 680 | -- | -- | -- | -- | -- |
183,821,925 | 2018 | 12 | 27 | 4 | DL | N901DE | 867 | ATL | SYR | 1455 | 1500 | 5 | 19 | 3 | 1703 | 1703 | 0 | 0 | None | 0 | 128 | 123 | 101 | 794 | -- | -- | -- | -- | -- |