{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Conver the original data\n", "\n", "obtained from https://www.transtats.bts.gov/DL_SelectFields.asp\n", "\n", "on Sunday, 8-Sept-2019" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Converting the original airlines data to `hdf5`" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T20:16:31.517614Z", "start_time": "2019-09-19T20:16:29.445382Z" } }, "outputs": [], "source": [ "import os \n", "import glob\n", "from zipfile import ZipFile\n", "\n", "import numpy as np\n", "\n", "import pandas as pd\n", "pd.set_option('display.max_columns', 500)\n", "\n", "import vaex\n", "\n", "from tqdm import tqdm_notebook as tqdm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Settings" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T10:09:42.429415Z", "start_time": "2019-09-19T10:09:42.422859Z" } }, "outputs": [], "source": [ "# Columns to read\n", "columns = ['Year', 'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', \n", " 'Reporting_Airline', 'Flight_Number_Reporting_Airline', 'Tail_Number', \n", " 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',\n", " 'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', \n", " 'Diverted', 'CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']\n", "\n", "# Force dtypes for these columns - some choices are made to optimize disk space\n", "dtypes = {\n", "'Year': np.int16,\n", "'Month': np.int8,\n", "'DayofMonth': np.int16,\n", "'DayOfWeek': np.int8,\n", "'DepTime': 'Int16',\n", "'CRSDepTime': 'Int16',\n", "'ArrTime': 'Int16',\n", "'CRSArrTime': 'Int16',\n", "'Reporting_Airline': np.object,\n", "'Flight_Number_Reporting_Airline': np.int32,\n", "'Tail_Number': np.object,\n", "'ActualElapsedTime': 'Int32',\n", "'CRSElapsedTime': 'Int32',\n", "'AirTime': 'Int32',\n", "'ArrDelay': 'Int32',\n", "'DepDelay': 'Int32',\n", "'Origin': np.object,\n", "'Dest': np.object,\n", "'Distance': 'Int32',\n", "'TaxiIn': 'Int32',\n", "'TaxiOut': 'Int32',\n", "'Cancelled': 'Int8',\n", "'CancellationCode': np.object,\n", "'Diverted': 'Int8',\n", "'CarrierDelay': 'Int32',\n", "'WeatherDelay': 'Int32',\n", "'NASDelay': 'Int32',\n", "'SecurityDelay': 'Int32',\n", "'LateAircraftDelay': 'Int32',\n", "}\n", "\n", "# Set up a renaming dictionary, in order to make the column names to match the well known data from \n", "# http://stat-computing.org/dataexpo/2009/the-data.html\n", "rename_dict = {\n", " 'DayofMonth': 'DayOfMonth',\n", " 'Reporting_Airline': 'UniqueCarrier',\n", " 'Flight_Number_Reporting_Airline': 'FlightNum',\n", " 'Tail_Number': 'TailNum'\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Conversion" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T10:09:44.179421Z", "start_time": "2019-09-19T10:09:44.172912Z" } }, "outputs": [], "source": [ "# Set up the list of zip files to be opened and converted\n", "zip_list = np.sort(np.array(glob.glob('./airlines-us-original/raw/*.zip')))[::-1]\n", "\n", "# The output directory\n", "output_dir = './airlines-us-original/hdf5/'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T11:42:14.186704Z", "start_time": "2019-09-19T10:09:52.752002Z" } }, "outputs": [ { "data": { 
"application/vnd.jupyter.widget-view+json": { "model_id": "", "version_major": 2, "version_minor": 0 }, "text/plain": [ "HBox(children=(IntProgress(value=0, description='Converting to hdf5...', max=372, style=ProgressStyle(descript…" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\r" ] } ], "source": [ "# The magic happens heree:\n", "for file in tqdm(zip_list, leave=False, desc='Converting to hdf5...'):\n", " # Setting up the files, and directories\n", " zip_file = ZipFile(file)\n", " output_file = file.split('/')[-1][:-3]+'hdf5'\n", " output = output_dir + output_file\n", " \n", " # Check if a converted file already exists: if it does skip it, otherwise read in the raw csv and convert it\n", " if (os.path.exists(output) and os.path.isfile(output)):\n", " pass\n", " else:\n", " # Importing the data into pandas \n", " pandas_df = [pd.read_csv(zip_file.open(text_file.filename), \n", " encoding='latin',\n", " usecols=columns,\n", " dtype=dtypes,)\n", " for text_file in zip_file.infolist()\n", " if text_file.filename.endswith('.csv')][0]\n", " # Rename some columns to match the more well known dataset from \n", " # http://stat-computing.org/dataexpo/2009/the-data.html\n", " pandas_df.rename(columns=rename_dict, inplace=True)\n", "\n", " # Importing the data from pandas to vaex\n", " vaex_df = vaex.from_pandas(pandas_df, copy_index=False)\n", " \n", " # Export the data with vaex to hdf5\n", " vaex_df.export_hdf5(path=output, progress=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notes:\n", " - number of columns in the Data expo version: 29\n", "\n", "Here we go:\n", "1. Year -> Year\n", "- Month -> Month\n", "- DayOfMonth -> DayofMonth\n", "- DayOfWeek -> DayOfWeek\n", "- DepTime -> DepTime\n", "- CRSDepTime -> CRSDepTime\n", "- ArrTime -> ArrTime \n", "- CRSArrTime -> CRSArrTime \n", "- Reporting_Airline -> UniqueCarrier\n", "- Flight_Number_Reporting_Airline -> FlightNum\n", "- Tail_Number -> TailNum\n", "- ActualElapsedTime -> ActualElapsedTime\n", "- CRSElapsedTime -> CRSElapsedTime\n", "- AirTime -> AirTime\n", "- ArrDelay -> ArrDelay\n", "- DepDelay -> DepDelay\n", "- Origin -> Origin\n", "- Dest -> Dest\n", "- Distance -> Distance\n", "- TaxiIn -> TaxiIn\n", "- TaxiOut -> TaxiOut\n", "- Cancelled -> Cancelled\n", "- CancellationCode -> CancellationCode\n", "- Diverted -> Diverted\n", "- CarrierDelay -> CarrierDelay\n", "- WeatherDelay -> WeatherDelay \n", "- NASDelay -> NASDelay \n", "- SecurityDelay -> SecurityDelay\n", "- LateAircraftDelay -> LateAircraftDelay" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Now merge all the little files together\n", "\n", "Note that macOS has a rather low limit on the number of files one can open at one time. 
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Now merge all the little files together\n", "\n", "Note that macOS has a rather low limit on the number of files a process can have open at once. To work around this, run\n", "\n", "`ulimit -n 9999`\n", "\n", "in a terminal, and then start the Jupyter server from that same terminal.\n" ] },
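{ "cell_type": "markdown", "metadata": {}, "source": [ "The limit can also be inspected, and often raised, from Python via the standard-library `resource` module (Unix-only). The cell below is only a convenience sketch; the target of 9999 mirrors the `ulimit` command above." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Check the per-process open-file limit: vaex.open_many below keeps all the\n", "# monthly hdf5 files open at the same time, so the soft limit must exceed 372.\n", "import resource\n", "\n", "soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)\n", "print(f'open-file limit: soft={soft}, hard={hard}')\n", "\n", "# Try to raise the soft limit towards 9999, staying within the hard limit\n", "target = 9999 if hard == resource.RLIM_INFINITY else min(9999, hard)\n", "if soft != resource.RLIM_INFINITY and soft < target:\n", "    try:\n", "        resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))\n", "    except (ValueError, OSError):\n", "        print('Could not raise the limit from Python; use `ulimit -n 9999` in the shell.')" ] },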
{ "cell_type": "code", "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T11:52:13.445615Z", "start_time": "2019-09-19T11:52:11.931233Z" } }, "outputs": [], "source": [ "import re\n", "import glob\n", "import vaex\n", "import numpy as np\n", "\n", "def tryint(s):\n", "    try:\n", "        return int(s)\n", "    except ValueError:\n", "        return s\n", "\n", "def alphanum_key(s):\n", "    \"\"\"Turn a string into a list of string and number chunks,\n", "    e.g. \"z23a\" -> [\"z\", 23, \"a\"], so that file names sort in natural order.\n", "    \"\"\"\n", "    return [tryint(c) for c in re.split('([0-9]+)', s)]" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T11:52:13.454973Z", "start_time": "2019-09-19T11:52:13.446990Z" } }, "outputs": [], "source": [ "hdf5_list = glob.glob('./airlines-us-original/hdf5/*.hdf5')\n", "hdf5_list.sort(key=alphanum_key)\n", "hdf5_list = np.array(hdf5_list)\n", "\n", "# 31 years (1988-2018) x 12 months = 372 monthly files\n", "assert len(hdf5_list) == 372, \"Incorrect number of files\"" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T12:07:34.862383Z", "start_time": "2019-09-19T11:58:04.846356Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[########################################]: 100.00% estimated time: 0s = 0.0m = 0.0h " ] } ], "source": [ "# Open all the monthly files as one (virtually concatenated) DataFrame\n", "master_df = vaex.open_many(hdf5_list)\n", "\n", "# Export the combined data to a single file\n", "master_df.export_hdf5(path='./airline_data_1988_2018.hd5', progress=True)" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2019-09-19T12:19:53.818836Z", "start_time": "2019-09-19T12:19:53.771996Z" } }, "outputs": [ { "data": { "text/html": [ "HTML preview of the combined DataFrame (same content as the text preview below).
" ], "text/plain": [ "# Year Month DayOfMonth DayOfWeek UniqueCarrier TailNum FlightNum Origin Dest CRSDepTime DepTime DepDelay TaxiOut TaxiIn CRSArrTime ArrTime ArrDelay Cancelled CancellationCode Diverted CRSElapsedTime ActualElapsedTime AirTime Distance CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay\n", "0 1988 1 8 5 PI None 930 BGM ITH 1525 1532 7 -- -- 1545 1555 10 0 None 0 20 23 -- 32 -- -- -- -- --\n", "1 1988 1 9 6 PI None 930 BGM ITH 1525 1522 -3 -- -- 1545 1535 -10 0 None 0 20 13 -- 32 -- -- -- -- --\n", "2 1988 1 10 7 PI None 930 BGM ITH 1525 1522 -3 -- -- 1545 1534 -11 0 None 0 20 12 -- 32 -- -- -- -- --\n", "3 1988 1 11 1 PI None 930 BGM ITH 1525 -- -- -- -- 1545 -- -- 1 None 0 20 -- -- 32 -- -- -- -- --\n", "4 1988 1 12 2 PI None 930 BGM ITH 1525 1524 -1 -- -- 1545 1540 -5 0 None 0 20 16 -- 32 -- -- -- -- --\n", "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", "183,821,921 2018 12 27 4 DL N705TW 864 JFK SLC 935 929 -6 28 9 1311 1230 -41 0 None 0 336 301 264 1990 -- -- -- -- --\n", "183,821,922 2018 12 27 4 DL N336NB 865 MSP SLC 1035 1030 -5 14 10 1240 1214 -26 0 None 0 185 164 140 991 -- -- -- -- --\n", "183,821,923 2018 12 27 4 DL N945DN 866 DEN MSP 1054 1100 6 10 5 1353 1350 -3 0 None 0 119 110 95 680 -- -- -- -- --\n", "183,821,924 2018 12 27 4 DL N945DN 866 MSP DEN 850 850 0 15 8 1010 951 -19 0 None 0 140 121 98 680 -- -- -- -- --\n", "183,821,925 2018 12 27 4 DL N901DE 867 ATL SYR 1455 1500 5 19 3 1703 1703 0 0 None 0 128 123 101 794 -- -- -- -- --" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check how the single file looks like:\n", "df = vaex.open('./airline_data_1988_2018.hd5')\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }