{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c616b990-dd1a-422e-89e2-3860b859d64d",
   "metadata": {},
   "source": [
    "# Decode AIS raw data and save to HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "49d38792-6bd0-4a77-a222-c29ea442d99a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyais.stream import FileReaderStream\n",
    "from pathlib import Path\n",
    "import os\n",
    "\n",
    "import h5py\n",
    "import vaex\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import yaml\n",
    "\n",
    "import pooch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "715d6631-fc78-4b62-b1ba-3d6d56464455",
   "metadata": {},
   "outputs": [],
   "source": [
    "vaex.multithreading.thread_count_default = 4"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a8d74654-6eac-4e1b-a4c5-b282778edcc8",
   "metadata": {},
   "source": [
    "## Define constants"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "84f0e06a-87df-4551-9a40-1bb65c103ee3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to meters from radians\n",
    "EARTH_RADIUS_IN_M: int = 6371000"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7426b74a-1df5-4a26-abb3-327ab09ed542",
   "metadata": {},
   "outputs": [],
   "source": [
    "MAX_NUMBER_ROWS = 10000000"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9aae97f3-ce3f-424e-8f1d-3bbc0b9daa3a",
   "metadata": {},
   "source": [
    "## Define paths for input and output files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "92bdeff8-413a-42bb-8ff4-9dc340a4b3bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_folder = \"./data\"\n",
    "input_folder = os.path.join(data_folder, \"input\")\n",
    "output_folder = os.path.join(data_folder, \"decoded\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6e86187-5c90-4091-aa04-86a641bb10e9",
   "metadata": {},
   "source": [
    "### Create folders"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e25ec3b5-20aa-4e5b-ae54-87b455b42e19",
   "metadata": {},
   "outputs": [],
   "source": [
    "for folder in [data_folder, input_folder, output_folder]:\n",
    "    if not os.path.isdir(folder):\n",
    "        os.mkdir(folder)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c7a91979-8509-466a-b4f6-37a1f3020a60",
   "metadata": {},
   "source": [
    "## Get input data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "541cdcc8-3775-489c-acd4-72e6700e121a",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_url = \"https://raw.githubusercontent.com/aduvenhage/ais-decoder/master/data/nmea-sample.txt\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3a1508e3-5b0c-4d8d-9ff4-f172988b11b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "pooch.retrieve(\n",
    "    url=input_url,\n",
    "    known_hash=None,\n",
    "    path=input_folder,\n",
    "    fname=input_url.split(\"/\")[-1]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73c8335e-a174-4706-b3b9-277a760834f4",
   "metadata": {},
   "source": [
    "## Define column names, types and attributes\n",
    "\n",
    "- Read from XML template file containing information about data group, variables and their associated attributes such as types, missing values, bounds, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc1abca0-7421-404d-b15b-92a49ce43912",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Load parameters\n",
    "h5template_yaml: Path = \"./ais_metadata.yaml\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4745e8c9-b187-44aa-b463-58dc719c15bb",
   "metadata": {},
   "source": [
    "## Analyse, decode and save to HDF5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5515ee77-5f3c-460c-b85e-a9bdc6e4ee4a",
   "metadata": {},
   "outputs": [],
   "source": [
    "class H5AIS:\n",
    "    def __init__(self, ofilename, h5template=None, MAX_NUMBER_ROWS = 2000000):\n",
    "        self.output_filename = ofilename\n",
    "        self.columns = []\n",
    "        self.coltypes = []\n",
    "        self.colmissings = []\n",
    "        self.colsizes = []\n",
    "        self.colattrs = []\n",
    "        self.buffers = []\n",
    "        self.h5file = h5py.File(ofilename, \"w\")\n",
    "        if h5template is None:\n",
    "            self.h5group = \"data\"\n",
    "        else:\n",
    "            self.get_from_template(h5template)\n",
    "        self.MAX_NUMBER_ROWS = MAX_NUMBER_ROWS\n",
    "        self.h5columns = self.h5file.create_group(self.h5group) # vaex reads all datasets in the columns group\n",
    "        \n",
    "    \n",
    "    def get_from_template(self, h5template):\n",
    "        yaml_file = open(f\"{h5template}\")\n",
    "        data_info = yaml.load(yaml_file, Loader=yaml.FullLoader)[\"group\"]\n",
    "        self.h5group = data_info[\"group_name\"]\n",
    "        self.h5template = data_info\n",
    "        i = 0\n",
    "        for var in self.h5template[\"variables\"].keys():\n",
    "            v = self.h5template[\"variables\"][var]\n",
    "            self.columns.append(var)\n",
    "            attrs = { }      \n",
    "            for vk in v.keys():\n",
    "                if \"type\" == vk:\n",
    "                    self.coltypes.append(v[vk])\n",
    "                elif \"missing_value\" == vk:\n",
    "                    self.colmissings.append(v[vk])\n",
    "                else:\n",
    "                    attrs[vk] = v[vk]\n",
    "            self.colattrs.append(attrs)\n",
    "            self.buffers.append([])\n",
    "            i+=1\n",
    "        \n",
    "    def _key2value(self, msg, key):\n",
    "        idx = self.columns.index(key)\n",
    "        attrs = self.colattrs[idx]\n",
    "        if key == \"status\" and \"status\" in msg.asdict():\n",
    "            colval = msg.asdict()[key].value\n",
    "        elif key in msg.asdict():\n",
    "            if \"flag_meanings\" in attrs:\n",
    "                if type(msg.asdict()[key]) == bool:\n",
    "                    colval = int(msg.asdict()[key])\n",
    "                else:\n",
    "                    idx_flag = attrs[\"flag_meanings\"].index(msg.asdict()[key])\n",
    "                    colval = attrs[\"flag_values\"][idx_flag]\n",
    "            elif self.coltypes[idx] == \"str\":\n",
    "                colval = str(msg.asdict()[key])\n",
    "            else:\n",
    "                colval = msg.asdict()[key]\n",
    "        else:\n",
    "            colval = self.colmissings[idx]\n",
    "        \n",
    "        if \"scaling_factor\" in self.colattrs[idx]:\n",
    "            scaling = attrs[\"scaling_factor\"]\n",
    "            colval = scaling * colval\n",
    "        return colval\n",
    "    \n",
    "    def create_column(self, colname, coltype, colmissing, colsize, colattr):\n",
    "        idx = len(self.columns)\n",
    "        self.columns.append(colname)\n",
    "        self.coltypes[colname] = coltype\n",
    "        self.colmissings[colname] = colmissing\n",
    "        self.colsizes.append(colsize)\n",
    "        self.colattrs.append(colattr)\n",
    "        self.buffers.append([])\n",
    "            \n",
    "    def create_columns(self, colnames, coltypes, colmissings, colsizes, colattrs):\n",
    "        print(coltypes)\n",
    "        for cname, ctype, cmiss, csize, cattr in zip(colnames, list(coltypes.values()), list(colmissings.values()), colsizes, colattrs):\n",
    "            self.create_column(cname, ctype, cmiss, csize, cattr)\n",
    "        \n",
    "    def decode_message(self, msg):\n",
    "        try:\n",
    "            decoded_message = msg.decode()\n",
    "            if msg.tag_block is not None:\n",
    "                msg.tag_block.init()\n",
    "                tags = msg.tag_block\n",
    "        except:\n",
    "            print(\" Message \", msg, \" ignored\")\n",
    "        else:\n",
    "            for cname in self.columns:\n",
    "                idx = self.columns.index(cname)\n",
    "                if \"tag_block\" in self.colattrs[idx]:\n",
    "                    if msg.tag_block is not None:\n",
    "                        val = self._key2value(tags, cname)\n",
    "                    else:\n",
    "                        val = self.colmissings[idx]\n",
    "                else:\n",
    "                    val = self._key2value(decoded_message, cname)\n",
    "                self.buffers[idx].append(val) \n",
    "            \n",
    "              \n",
    "    def write(self, colsize=None):\n",
    "        for idx, colname in zip(range(len(self.columns)), self.columns):\n",
    "            if colsize is None:\n",
    "                colsize = self.MAX_NUMBER_ROWS\n",
    "            coltype = self.coltypes[idx]\n",
    "            print(\"Writing Column \", colname, coltype, self.colmissings[idx])\n",
    "            if coltype == \"str\": \n",
    "                coltype = h5py.string_dtype(encoding='utf-8')\n",
    "                ds = self.h5columns.create_dataset(colname, data=np.array(self.buffers[idx], dtype='S'))\n",
    "            else:\n",
    "                ds = self.h5columns.create_dataset(colname, data=np.array(self.buffers[idx], dtype=coltype), dtype=coltype)\n",
    "            attrs = self.colattrs[idx]\n",
    "            for attr in self.colattrs[idx].keys():\n",
    "                ds.attrs[attr] = attrs[attr]\n",
    "\n",
    "    def close(self):\n",
    "        self.h5file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90d83047-1a77-4f5a-86d0-07aff7963b67",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "%%time\n",
    "\n",
    "nb_file = 1\n",
    "\n",
    "input_filename = os.path.join(input_folder, input_url.split(\"/\")[-1])\n",
    "output_filename = os.path.join(output_folder, \"ais_\" + f\"{nb_file:02}\" + \".h5\")\n",
    "\n",
    "list_filenames = []\n",
    "if os.path.isfile(output_filename):\n",
    "    print(input_filename, \" has been already decoded and \", output_filename, \" have been generated\")\n",
    "else:\n",
    "    \n",
    "    print(\"Start...\")\n",
    "    ofile = H5AIS(output_filename, h5template=h5template_yaml, MAX_NUMBER_ROWS=MAX_NUMBER_ROWS)\n",
    "    nb_msg = 0\n",
    "    for msg in FileReaderStream(input_filename):\n",
    "        ofile.decode_message(msg)\n",
    "        nb_msg += 1\n",
    "        if len(ofile.buffers[0]) >= ofile.MAX_NUMBER_ROWS:\n",
    "            print(\"Writing buffer to \", output_filename)\n",
    "            nb_msg = 0\n",
    "            ofile.write()\n",
    "            ofile.close()\n",
    "            list_filenames.append(output_filename)\n",
    "            nb_file += 1\n",
    "            output_filename = output_prefix + \"/ais_\" + f\"{nb_file:02}\" + \".h5\"\n",
    "            ofile = H5AIS(output_filename, h5template=h5template_yaml, MAX_NUMBER_ROWS=MAX_NUMBER_ROWS)\n",
    "            \n",
    "    # Need to write last part\n",
    "    print(\"Writing last buffer to \", output_filename)\n",
    "    ofile.write(nb_msg)   \n",
    "    ofile.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "af285bea-1d3f-44cc-9e04-b3114f17af75",
   "metadata": {},
   "source": [
    "## Open decoded AIS data "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23d8dfc0-eb40-4a55-94a8-a3868ec96767",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "dset = vaex.open(os.path.join(output_folder, 'ais_*.h5'))\n",
    "dset"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae42e6f9-01f2-4aec-a5ff-7adb181911b6",
   "metadata": {},
   "source": [
    "## Visualization\n",
    "- Simple interactive visualization with plotly to visualize vessels' locations;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7d0ec7bc-4622-4683-9e11-81a79737060f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = dset.to_pandas_df()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e51d14f-913d-474d-9fa0-062a8e1369e4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# visualization\n",
    "import plotly.express as px"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "875034b9-88fd-429b-89bd-1c5d5c02b069",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig = px.scatter_mapbox(df, lat=\"lat\", lon=\"lon\", color=\"mmsi\",  zoom=1, height=500)\n",
    "fig.update_layout(\n",
    "    mapbox_style=\"white-bg\",\n",
    "    mapbox_layers=[\n",
    "        {\n",
    "            \"below\": 'traces',\n",
    "            \"sourcetype\": \"raster\",\n",
    "            \"sourceattribution\": \"United States Geological Survey\",\n",
    "            \"source\": [\n",
    "                \"https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}\"\n",
    "            ]\n",
    "        }\n",
    "      ])\n",
    "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f59a6cd-f296-4d31-8aa1-f392872da053",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}