# Neural network training example

In [1]:
import sys
# Invoke pip through the kernel's own interpreter (sys.executable) so the
# packages land in the environment this notebook is running in.
# PyTorch is fetched from the wheel index given by --index-url (the "cu118" path).
!{sys.executable} -m pip install "torch>=1.10" --index-url https://download.pytorch.org/whl/cu118
# Dataset, model, and progress-bar packages imported by the later cells.
!{sys.executable} -m pip install cesnet-datazoo cesnet-models tqdm

Prepare data transformations for the model.

In [2]:
from cesnet_models.transforms import ClipAndScaleFlowstats, ClipAndScalePPI, NormalizeHistograms, ScalerEnum

# PPI input: both the psizes and the ipt scalers use the STANDARD scaler.
ppi_transform = ClipAndScalePPI(
    psizes_scaler_enum=ScalerEnum.STANDARD,
    ipt_scaler_enum=ScalerEnum.STANDARD,
)

# Flow statistics: ROBUST scaler with quantile_clip set to 0.99.
flowstats_transform = ClipAndScaleFlowstats(
    flowstats_scaler_enum=ScalerEnum.ROBUST,
    quantile_clip=0.99,
)

# Packet histograms: normalization with the transform's default settings.
packet_histograms_transform = NormalizeHistograms()

Initialize the dataset class and prepare its configuration.

* Define train and test periods from which the train and test sets will be built
* Split the train set - use 20% of its samples as the validation set
* We use all available applications for a closed-world classification task
* Set data transforms

In [3]:
import logging
from cesnet_datazoo.config import AppSelection, DatasetConfig, ValidationApproach
from cesnet_datazoo.datasets import CESNET_QUIC22

# Show INFO-level log records (timestamp, logger name, level, message) in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s][%(name)s][%(levelname)s] - %(message)s",
)

DATASET_SIZE = "XS"
dataset = CESNET_QUIC22(
    data_root="data/CESNET-QUIC22",
    size=DATASET_SIZE,
)

dataset_config = DatasetConfig(
    dataset=dataset,
    # Periods from which the train and test sets are built.
    train_period_name="W-2022-46",
    test_period_name="W-2022-47",
    # train_size=500_000, # Uncomment to limit the number of training samples to speed up this example
    # Validation set: a 0.2 fraction split off from the train set.
    val_approach=ValidationApproach.SPLIT_FROM_TRAIN,
    train_val_split_fraction=0.2,
    # Closed-world task: use all known applications.
    apps_selection=AppSelection.ALL_KNOWN,
    return_tensors=True,
    use_packet_histograms=True,
    # Data transforms prepared in the previous cell.
    ppi_transform=ppi_transform,
    flowstats_transform=flowstats_transform,
    flowstats_phist_transform=packet_histograms_transform,
)

# Attach the configuration to the dataset and initialize it.
dataset.set_dataset_config_and_initialize(dataset_config)

[2024-04-08 17:40:19,224][cesnet_datazoo.pytables_data.indices_setup][INFO] - Processing train indices
[2024-04-08 17:40:19,774][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221114 took 0.51 seconds
[2024-04-08 17:40:20,281][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221115 took 0.51 seconds
[2024-04-08 17:40:20,696][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221116 took 0.42 seconds
[2024-04-08 17:40:20,870][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221117 took 0.17 seconds
[2024-04-08 17:40:21,101][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221118 took 0.23 seconds
[2024-04-08 17:40:21,236][cesnet_datazoo.pytables_data.pytables_dataset][INFO] - Reading app column for table /flows/D20221119 took 0.13 seconds
[2024-04-08 17:40:21,413][c

Show dataset classes in the current configuration, together with train, validation, and test counts.

In [4]:
# Per-application sample counts, ordered by the "Train" column, largest first.
dataset.known_app_counts.sort_values(by="Train", ascending=False)

Unnamed: 0,Train,Validation,Test
google-www,121836,30459,205010
google-ads,116419,29105,195979
google-services,109998,27499,177295
google-play,97905,24476,161546
google-gstatic,92789,23197,150633
...,...,...,...
toggl,150,37,247
ebay-kleinanzeigen,150,38,176
alza-identity,130,32,215
bitdefender-nimbus,118,29,204


Reuse neural network architecture from the `cesnet-models` package without using pre-trained weights, i.e., start the training from scratch.

In [5]:
from cesnet_models.models import mm_cesnet_v2

# weights=None: no pre-trained weights are loaded, so training starts from scratch.
# The class count and input sizes are taken from the dataset and its configuration.
model = mm_cesnet_v2(
    weights=None,
    num_classes=dataset.get_num_classes(),
    ppi_input_channels=len(dataset_config.get_ppi_channels()),
    flowstats_input_size=dataset_config.get_flowstats_features_len(),
)
print(model)

Multimodal_CESNET(
  (cnn_ppi): Sequential(
    (0): Conv1d(3, 200, kernel_size=(7,), stride=(1,), padding=(3,))
    (1): ReLU()
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Sequential(
      (0): Conv1d(200, 200, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (4): Sequential(
      (0): Conv1d(200, 200, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (5): Sequential(
      (0): Conv1d(200, 200, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (6): Conv1d(200, 300, kernel_size=(5,), stride=(1,))
    (7): ReLU()
    (8): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_

## Training loop
Train the model with a standard training loop using the cross-entropy loss, the AdamW optimizer, and the OneCycleLR learning rate scheduler.

The number of epochs is set to five, and the model is validated after each epoch.

In [6]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

def train_one_epoch(model: nn.Module, train_dataloader: DataLoader, optimizer: optim.Optimizer, scheduler, loss_fn, device) -> None:
    """Run a single training pass over ``train_dataloader``.

    Every batch is unpacked as a 4-tuple; its first item is ignored and the
    remaining three (PPI, flow statistics, labels) are moved to ``device``.
    The model is called with the ``(ppi, flowstats)`` pair, the loss is
    backpropagated, and both the optimizer and the scheduler are stepped
    once per batch.

    Args:
        model: Network to train; switched to training mode before the loop.
        train_dataloader: Iterable yielding the 4-tuple batches.
        optimizer: Optimizer updating the model parameters.
        scheduler: Object with a ``step()`` method, called after every batch.
        loss_fn: Callable taking ``(model_output, labels)`` and returning the loss.
        device: Device the batch tensors are moved to.
    """
    model.train()
    for _, ppi, flowstats, labels in train_dataloader:
        ppi = ppi.to(device)
        flowstats = flowstats.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        logits = model((ppi, flowstats))
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()
        # The scheduler advances per batch, not per epoch.
        scheduler.step()

def test(model: nn.Module, dataloader: DataLoader, device, progress: bool = False) -> float:
    """Compute classification accuracy of ``model`` over ``dataloader``.

    The model is put into evaluation mode (and left in it) and gradients are
    disabled while iterating. Every batch is unpacked as a 4-tuple whose first
    item is ignored; the predicted class is the argmax over dimension 1 of the
    model output.

    Args:
        model: Network to evaluate.
        dataloader: Sized iterable yielding the 4-tuple batches.
        device: Device the batch tensors are moved to.
        progress: When True, a tqdm progress bar is displayed.

    Returns:
        Fraction of samples whose predicted class equals the true label.
    """
    model.eval()
    collected_labels = []
    collected_preds = []
    with torch.no_grad():
        batches = tqdm(dataloader, total=len(dataloader), disable=not progress)
        for _, ppi, flowstats, labels in batches:
            ppi = ppi.to(device)
            flowstats = flowstats.to(device)
            labels = labels.to(device)
            logits = model((ppi, flowstats))
            predicted = logits.argmax(dim=1)
            collected_labels.append(labels)
            collected_preds.append(predicted)
    # Concatenate once at the end and compare on the host as NumPy arrays.
    labels_np = torch.cat(collected_labels).cpu().numpy()
    preds_np = torch.cat(collected_preds).cpu().numpy()
    matches = labels_np == preds_np
    return matches.mean()

EPOCHS = 5

# Loaders for the train and validation splits of the configured dataset.
train_dataloader = dataset.get_train_dataloader()
val_dataloader = dataset.get_val_dataloader()

# AdamW with default settings; OneCycleLR spans every batch of every epoch.
optimizer = optim.AdamW(model.parameters())
scheduler = optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=1e-3,
    steps_per_epoch=len(train_dataloader),
    epochs=EPOCHS,
)
loss_fn = nn.CrossEntropyLoss()

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Train for EPOCHS epochs and report validation accuracy after each one.
for i in tqdm(range(1, EPOCHS + 1), total=EPOCHS, file=sys.stdout):
    train_one_epoch(
        model,
        train_dataloader,
        optimizer,
        scheduler,
        loss_fn,
        device,
    )
    validation_accuracy = test(model, val_dataloader, device)
    tqdm.write(f"Epoch {i}, validation accuracy: {validation_accuracy:.4f}")


Epoch 1, validation accuracy: 0.8280 
Epoch 2, validation accuracy: 0.8897          
Epoch 3, validation accuracy: 0.8751          
Epoch 4, validation accuracy: 0.9321          
Epoch 5, validation accuracy: 0.9413          
100%|██████████| 5/5 [09:41<00:00, 116.35s/it]


Evaluate the trained model on the test set.

In [8]:
# Loader for the test period of the configured dataset.
test_dataloader = dataset.get_test_dataloader()

print("Computing model predictions on the test set.")
# progress=True displays a progress bar while iterating over the test batches.
test_accuracy = test(
    model,
    dataloader=test_dataloader,
    device=device,
    progress=True,
)
print(f"The trained model achieved an accuracy of {test_accuracy:.5f} on the test period {dataset_config.test_period_name} of the {dataset.name} dataset.")

Computing model predictions on the test set.


100%|██████████| 1289/1289 [00:38<00:00, 33.07it/s]

The trained model achieved an accuracy of 0.93962 on the test period W-2022-47 of the CESNET-QUIC22-XS dataset.



