{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Neural network training example" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
"# Install dependencies with the same interpreter that runs this notebook (sys.executable).\n",
"import sys\n",
"\n",
"# torch is resolved against the explicit --index-url; the other packages use pip's default index.\n",
"!{sys.executable} -m pip install \"torch>=1.10\" --index-url https://download.pytorch.org/whl/cu118\n",
"!{sys.executable} -m pip install cesnet-datazoo cesnet-models tqdm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Prepare data transformations for the model." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
"from cesnet_models.transforms import (\n",
"    ClipAndScaleFlowstats,\n",
"    ClipAndScalePPI,\n",
"    NormalizeHistograms,\n",
"    ScalerEnum,\n",
")\n",
"\n",
"# PPI transform: STANDARD scaler for both the `psizes` and the `ipt` inputs.\n",
"ppi_transform = ClipAndScalePPI(\n",
"    psizes_scaler_enum=ScalerEnum.STANDARD,\n",
"    ipt_scaler_enum=ScalerEnum.STANDARD,\n",
")\n",
"\n",
"# Flow statistics transform: ROBUST scaler combined with quantile_clip=0.99.\n",
"flowstats_transform = ClipAndScaleFlowstats(\n",
"    flowstats_scaler_enum=ScalerEnum.ROBUST,\n",
"    quantile_clip=0.99,\n",
")\n",
"\n",
"# Packet histograms transform, constructed with its default arguments.\n",
"packet_histograms_transform = NormalizeHistograms()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Initialize the dataset class and prepare its configuration.\n", "\n", "* Define train and test periods from which the train and test sets will be built\n", "* Split the train set - use 20% of its samples as the validation set\n", "* We use all available applications for a closed-world classification task\n", "* Set data transforms" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[2024-04-08 17:40:19,224][cesnet_datazoo.pytables_data.indices_setup][INFO] - Processing train indices\n", "[2024-04-08 17:40:19,774][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221114 took 0.51 seconds\n", "[2024-04-08 17:40:20,281][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221115 took 0.51 seconds\n", "[2024-04-08 
17:40:20,696][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221116 took 0.42 seconds\n", "[2024-04-08 17:40:20,870][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221117 took 0.17 seconds\n", "[2024-04-08 17:40:21,101][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221118 took 0.23 seconds\n", "[2024-04-08 17:40:21,236][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221119 took 0.13 seconds\n", "[2024-04-08 17:40:21,413][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221120 took 0.18 seconds\n", "[2024-04-08 17:40:21,431][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Found applications with less than 100 train samples: ['livescore']. Disabling these applications\n", "[2024-04-08 17:40:21,442][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Selected 101 known applications and 0 unknown applications\n", "[2024-04-08 17:40:23,261][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Processing indices took 1.85 seconds\n", "[2024-04-08 17:40:27,834][cesnet_datazoo.pytables_data.data_scalers][INFO] - Reading data and fitting scalers took 3.68 seconds\n" ] } ], "source": [
"import logging\n",
"from cesnet_datazoo.config import AppSelection, DatasetConfig, ValidationApproach\n",
"from cesnet_datazoo.datasets import CESNET_QUIC22\n",
"\n",
"# Show INFO-level log records, formatted as [time][logger][level] - message.\n",
"logging.basicConfig(\n",
"    level=logging.INFO,\n",
"    format=\"[%(asctime)s][%(name)s][%(levelname)s] - %(message)s\",\n",
")\n",
"\n",
"# Dataset size variant passed to the dataset class below.\n",
"DATASET_SIZE = \"XS\"\n",
"dataset = CESNET_QUIC22(\n",
"    data_root=\"data/CESNET-QUIC22\",\n",
"    size=DATASET_SIZE,\n",
")\n",
"\n",
"dataset_config = DatasetConfig(\n",
"    dataset=dataset,\n",
"    # Periods from which the train and test sets are built.\n",
"    train_period_name=\"W-2022-46\",\n",
"    test_period_name=\"W-2022-47\",\n",
"    # train_size=500_000,  # Uncomment to limit the number of training samples to speed up this example\n",
"    # The validation set is split off the train set using the fraction below.\n",
"    val_approach=ValidationApproach.SPLIT_FROM_TRAIN,\n",
"    train_val_split_fraction=0.2,\n",
"    # Select every known application.\n",
"    apps_selection=AppSelection.ALL_KNOWN,\n",
"    return_tensors=True,\n",
"    use_packet_histograms=True,\n",
"    # Transform objects that must already exist in the notebook namespace.\n",
"    ppi_transform=ppi_transform,\n",
"    flowstats_transform=flowstats_transform,\n",
"    flowstats_phist_transform=packet_histograms_transform,\n",
")\n",
"\n",
"# Hand the configuration to the dataset; the recorded stderr of this cell shows\n",
"# train indices being processed and scalers being fitted during this call.\n",
"dataset.set_dataset_config_and_initialize(dataset_config)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Show dataset classes in the current configuration, together with train, validation, and test counts." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | Train | \n", "Validation | \n", "Test | \n", "
---|---|---|---|
google-www | \n", "121836 | \n", "30459 | \n", "205010 | \n", "
google-ads | \n", "116419 | \n", "29105 | \n", "195979 | \n", "
google-services | \n", "109998 | \n", "27499 | \n", "177295 | \n", "
google-play | \n", "97905 | \n", "24476 | \n", "161546 | \n", "
google-gstatic | \n", "92789 | \n", "23197 | \n", "150633 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
toggl | \n", "150 | \n", "37 | \n", "247 | \n", "
ebay-kleinanzeigen | \n", "150 | \n", "38 | \n", "176 | \n", "
alza-identity | \n", "130 | \n", "32 | \n", "215 | \n", "
bitdefender-nimbus | \n", "118 | \n", "29 | \n", "204 | \n", "
uber | \n", "87 | \n", "22 | \n", "118 | \n", "
101 rows × 3 columns
\n", "