{ "cells": [ { "cell_type": "markdown", "id": "c616b990-dd1a-422e-89e2-3860b859d64d", "metadata": {}, "source": [ "# Decode AIS raw data and save to HDF5" ] }, { "cell_type": "code", "execution_count": null, "id": "49d38792-6bd0-4a77-a222-c29ea442d99a", "metadata": {}, "outputs": [], "source": [ "from pyais.stream import FileReaderStream\n", "from pathlib import Path\n", "import os\n", "\n", "import h5py\n", "import vaex\n", "\n", "import numpy as np\n", "\n", "import yaml\n", "\n", "import pooch" ] }, { "cell_type": "code", "execution_count": null, "id": "715d6631-fc78-4b62-b1ba-3d6d56464455", "metadata": {}, "outputs": [], "source": [ "vaex.multithreading.thread_count_default = 4" ] }, { "cell_type": "markdown", "id": "a8d74654-6eac-4e1b-a4c5-b282778edcc8", "metadata": {}, "source": [ "## Define constants" ] }, { "cell_type": "code", "execution_count": null, "id": "84f0e06a-87df-4551-9a40-1bb65c103ee3", "metadata": {}, "outputs": [], "source": [ "# Convert to meters from radians\n", "EARTH_RADIUS_IN_M: int = 6371000" ] }, { "cell_type": "code", "execution_count": null, "id": "7426b74a-1df5-4a26-abb3-327ab09ed542", "metadata": {}, "outputs": [], "source": [ "MAX_NUMBER_ROWS = 10000000" ] }, { "cell_type": "markdown", "id": "9aae97f3-ce3f-424e-8f1d-3bbc0b9daa3a", "metadata": {}, "source": [ "## Define paths for input and output files" ] }, { "cell_type": "code", "execution_count": null, "id": "92bdeff8-413a-42bb-8ff4-9dc340a4b3bf", "metadata": {}, "outputs": [], "source": [ "data_folder = \"./data\"\n", "input_folder = os.path.join(data_folder, \"input\")\n", "output_folder = os.path.join(data_folder, \"decoded\")" ] }, { "cell_type": "markdown", "id": "a6e86187-5c90-4091-aa04-86a641bb10e9", "metadata": {}, "source": [ "### Create folders" ] }, { "cell_type": "code", "execution_count": null, "id": "e25ec3b5-20aa-4e5b-ae54-87b455b42e19", "metadata": {}, "outputs": [], "source": [ "for folder in [data_folder, input_folder, output_folder]:\n", " if not os.path.isdir(folder):\n", " os.mkdir(folder)" ] }, { "cell_type": "markdown", "id": "c7a91979-8509-466a-b4f6-37a1f3020a60", "metadata": {}, "source": [ "## Get input data" ] }, { "cell_type": "code", "execution_count": null, "id": "541cdcc8-3775-489c-acd4-72e6700e121a", "metadata": {}, "outputs": [], "source": [ "input_url = \"https://raw.githubusercontent.com/aduvenhage/ais-decoder/master/data/nmea-sample.txt\"" ] }, { "cell_type": "code", "execution_count": null, "id": "3a1508e3-5b0c-4d8d-9ff4-f172988b11b9", "metadata": {}, "outputs": [], "source": [ "pooch.retrieve(\n", " url=input_url,\n", " known_hash=None,\n", " path=input_folder,\n", " fname=input_url.split(\"/\")[-1]\n", ")" ] }, { "cell_type": "markdown", "id": "73c8335e-a174-4706-b3b9-277a760834f4", "metadata": {}, "source": [ "## Define column names, types and attributes\n", "\n", "- Read from XML template file containing information about data group, variables and their associated attributes such as types, missing values, bounds, etc." ] }, { "cell_type": "code", "execution_count": null, "id": "dc1abca0-7421-404d-b15b-92a49ce43912", "metadata": { "tags": [] }, "outputs": [], "source": [ "# Load parameters\n", "h5template_yaml: Path = \"./ais_metadata.yaml\"" ] }, { "cell_type": "markdown", "id": "4745e8c9-b187-44aa-b463-58dc719c15bb", "metadata": {}, "source": [ "## Analyse, decode and save to HDF5" ] }, { "cell_type": "code", "execution_count": null, "id": "5515ee77-5f3c-460c-b85e-a9bdc6e4ee4a", "metadata": {}, "outputs": [], "source": [ "class H5AIS:\n", " def __init__(self, ofilename, h5template=None, MAX_NUMBER_ROWS = 2000000):\n", " self.output_filename = ofilename\n", " self.columns = []\n", " self.coltypes = []\n", " self.colmissings = []\n", " self.colsizes = []\n", " self.colattrs = []\n", " self.buffers = []\n", " self.h5file = h5py.File(ofilename, \"w\")\n", " if h5template is None:\n", " self.h5group = \"data\"\n", " else:\n", " self.get_from_template(h5template)\n", " self.MAX_NUMBER_ROWS = MAX_NUMBER_ROWS\n", " self.h5columns = self.h5file.create_group(self.h5group) # vaex reads all datasets in the columns group\n", " \n", " \n", " def get_from_template(self, h5template):\n", " yaml_file = open(f\"{h5template}\")\n", " data_info = yaml.load(yaml_file, Loader=yaml.FullLoader)[\"group\"]\n", " self.h5group = data_info[\"group_name\"]\n", " self.h5template = data_info\n", " i = 0\n", " for var in self.h5template[\"variables\"].keys():\n", " v = self.h5template[\"variables\"][var]\n", " self.columns.append(var)\n", " attrs = { } \n", " for vk in v.keys():\n", " if \"type\" == vk:\n", " self.coltypes.append(v[vk])\n", " elif \"missing_value\" == vk:\n", " self.colmissings.append(v[vk])\n", " else:\n", " attrs[vk] = v[vk]\n", " self.colattrs.append(attrs)\n", " self.buffers.append([])\n", " i+=1\n", " \n", " def _key2value(self, msg, key):\n", " idx = self.columns.index(key)\n", " attrs = self.colattrs[idx]\n", " if key == \"status\" and \"status\" in msg.asdict():\n", " colval = msg.asdict()[key].value\n", " elif key in msg.asdict():\n", " if \"flag_meanings\" in attrs:\n", " if type(msg.asdict()[key]) == bool:\n", " colval = int(msg.asdict()[key])\n", " else:\n", " idx_flag = attrs[\"flag_meanings\"].index(msg.asdict()[key])\n", " colval = attrs[\"flag_values\"][idx_flag]\n", " elif self.coltypes[idx] == \"str\":\n", " colval = str(msg.asdict()[key])\n", " else:\n", " colval = msg.asdict()[key]\n", " else:\n", " colval = self.colmissings[idx]\n", " \n", " if \"scaling_factor\" in self.colattrs[idx]:\n", " scaling = attrs[\"scaling_factor\"]\n", " colval = scaling * colval\n", " return colval\n", " \n", " def create_column(self, colname, coltype, colmissing, colsize, colattr):\n", " idx = len(self.columns)\n", " self.columns.append(colname)\n", " self.coltypes[colname] = coltype\n", " self.colmissings[colname] = colmissing\n", " self.colsizes.append(colsize)\n", " self.colattrs.append(colattr)\n", " self.buffers.append([])\n", " \n", " def create_columns(self, colnames, coltypes, colmissings, colsizes, colattrs):\n", " print(coltypes)\n", " for cname, ctype, cmiss, csize, cattr in zip(colnames, list(coltypes.values()), list(colmissings.values()), colsizes, colattrs):\n", " self.create_column(cname, ctype, cmiss, csize, cattr)\n", " \n", " def decode_message(self, msg):\n", " try:\n", " decoded_message = msg.decode()\n", " if msg.tag_block is not None:\n", " msg.tag_block.init()\n", " tags = msg.tag_block\n", " except:\n", " print(\" Message \", msg, \" ignored\")\n", " else:\n", " for cname in self.columns:\n", " idx = self.columns.index(cname)\n", " if \"tag_block\" in self.colattrs[idx]:\n", " if msg.tag_block is not None:\n", " val = self._key2value(tags, cname)\n", " else:\n", " val = self.colmissings[idx]\n", " else:\n", " val = self._key2value(decoded_message, cname)\n", " self.buffers[idx].append(val) \n", " \n", " \n", " def write(self, colsize=None):\n", " for idx, colname in zip(range(len(self.columns)), self.columns):\n", " if colsize is None:\n", " colsize = self.MAX_NUMBER_ROWS\n", " coltype = self.coltypes[idx]\n", " print(\"Writing Column \", colname, coltype, self.colmissings[idx])\n", " if coltype == \"str\": \n", " coltype = h5py.string_dtype(encoding='utf-8')\n", " ds = self.h5columns.create_dataset(colname, data=np.array(self.buffers[idx], dtype='S'))\n", " else:\n", " ds = self.h5columns.create_dataset(colname, data=np.array(self.buffers[idx], dtype=coltype), dtype=coltype)\n", " attrs = self.colattrs[idx]\n", " for attr in self.colattrs[idx].keys():\n", " ds.attrs[attr] = attrs[attr]\n", "\n", " def close(self):\n", " self.h5file.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "90d83047-1a77-4f5a-86d0-07aff7963b67", "metadata": { "tags": [] }, "outputs": [], "source": [ "%%time\n", "\n", "nb_file = 1\n", "\n", "input_filename = os.path.join(input_folder, input_url.split(\"/\")[-1])\n", "output_filename = os.path.join(output_folder, \"ais_\" + f\"{nb_file:02}\" + \".h5\")\n", "\n", "list_filenames = []\n", "if os.path.isfile(output_filename):\n", " print(input_filename, \" has been already decoded and \", output_filename, \" have been generated\")\n", "else:\n", " \n", " print(\"Start...\")\n", " ofile = H5AIS(output_filename, h5template=h5template_yaml, MAX_NUMBER_ROWS=MAX_NUMBER_ROWS)\n", " nb_msg = 0\n", " for msg in FileReaderStream(input_filename):\n", " ofile.decode_message(msg)\n", " nb_msg += 1\n", " if len(ofile.buffers[0]) >= ofile.MAX_NUMBER_ROWS:\n", " print(\"Writing buffer to \", output_filename)\n", " nb_msg = 0\n", " ofile.write()\n", " ofile.close()\n", " list_filenames.append(output_filename)\n", " nb_file += 1\n", " output_filename = output_prefix + \"/ais_\" + f\"{nb_file:02}\" + \".h5\"\n", " ofile = H5AIS(output_filename, h5template=h5template_yaml, MAX_NUMBER_ROWS=MAX_NUMBER_ROWS)\n", " \n", " # Need to write last part\n", " print(\"Writing last buffer to \", output_filename)\n", " ofile.write(nb_msg) \n", " ofile.close()" ] }, { "cell_type": "markdown", "id": "af285bea-1d3f-44cc-9e04-b3114f17af75", "metadata": {}, "source": [ "## Open decoded AIS data " ] }, { "cell_type": "code", "execution_count": null, "id": "23d8dfc0-eb40-4a55-94a8-a3868ec96767", "metadata": { "tags": [] }, "outputs": [], "source": [ "dset = vaex.open(os.path.join(output_folder, 'ais_*.h5'))\n", "dset" ] }, { "cell_type": "markdown", "id": "ae42e6f9-01f2-4aec-a5ff-7adb181911b6", "metadata": {}, "source": [ "## Visualization\n", "- Simple interactive visualization with plotly to visualize vessels' locations;" ] }, { "cell_type": "code", "execution_count": null, "id": "7d0ec7bc-4622-4683-9e11-81a79737060f", "metadata": {}, "outputs": [], "source": [ "df = dset.to_pandas_df()" ] }, { "cell_type": "code", "execution_count": null, "id": "3e51d14f-913d-474d-9fa0-062a8e1369e4", "metadata": {}, "outputs": [], "source": [ "# visualization\n", "import plotly.express as px" ] }, { "cell_type": "code", "execution_count": null, "id": "875034b9-88fd-429b-89bd-1c5d5c02b069", "metadata": {}, "outputs": [], "source": [ "fig = px.scatter_mapbox(df, lat=\"lat\", lon=\"lon\", color=\"mmsi\", zoom=1, height=500)\n", "fig.update_layout(\n", " mapbox_style=\"white-bg\",\n", " mapbox_layers=[\n", " {\n", " \"below\": 'traces',\n", " \"sourcetype\": \"raster\",\n", " \"sourceattribution\": \"United States Geological Survey\",\n", " \"source\": [\n", " \"https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}\"\n", " ]\n", " }\n", " ])\n", "fig.update_layout(margin={\"r\":0,\"t\":0,\"l\":0,\"b\":0})\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "3f59a6cd-f296-4d31-8aa1-f392872da053", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.8" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 5 }