{ "cells": [ { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import cv2\n", "import os\n", "from tqdm import tqdm,trange\n", "import sklearn.metrics\n", "\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "from pytorchcv.model_provider import get_model as ptcv_get_model\n", "\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "!mkdir api_dir" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Initializing a project" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Project name: deepfake-shield-mayukh\n", "Artifacts path: /media/mayukh/Data/storage/repositories/repos/deep-shield-temp/notebooks/artifact_dir\n" ] } ], "source": [ "from os import path\n", "import mlrun\n", "\n", "# Set the base project name\n", "project_name_base = 'deepfake-shield'\n", "# Initialize the MLRun environment and save the project name and artifacts path\n", "project_name, artifact_path = mlrun.set_environment(project=project_name_base,\n", " user_project=True, api_path = './api_dir', artifact_path = './artifact_dir')\n", " \n", "# Display the current project name and artifacts path\n", "print(f'Project name: {project_name}')\n", "print(f'Artifacts path: {artifact_path}')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## First, let's load up our training data \n", "\n", "We've used a [modified version](https://www.kaggle.com/unkownhihi/deepfake) of the [deepfake-detection-challenge](https://www.kaggle.com/c/deepfake-detection-challenge) dataset from kaggle which was a part of a competition over a year ago. 
\n", "\n", "This dataset aims to help train models which can determine whether a given face is a deepfake (`1`) or not (`0`). \n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "filenames = pd.read_csv('filenames_train.csv').filenames.values\n", "filenames_val = pd.read_csv('filenames_val.csv').filenames.values" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 47/47 [00:17<00:00, 2.72it/s]\n", "100%|██████████| 3/3 [00:01<00:00, 2.56it/s]\n" ] } ], "source": [ "df_trains = [pd.read_json(filenames[i]) for i in tqdm(range(len(filenames)))]\n", "df_vals = [pd.read_json(filenames_val[i]) for i in tqdm(range(len(filenames_val)))]" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "tags": [] }, "outputs": [], "source": [ "nums = list(range(len(df_trains)+1))\n", "LABELS = ['REAL','FAKE']\n", "val_nums=[47, 48, 49]\n", "\n", "def get_path(num,x):\n", " num=str(num)\n", " if len(num)==2:\n", " path='training_data/archive/DeepFake'+num+'/DeepFake'+num+'/' + x.replace('.mp4', '') + '.jpg'\n", " else:\n", " path='training_data/archive/DeepFake0'+num+'/DeepFake0'+num+'/' + x.replace('.mp4', '') + '.jpg'\n", " if not os.path.exists(path):\n", " raise Exception\n", " return path" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 47/47 [00:34<00:00, 1.34it/s]\n" ] } ], "source": [ "paths=[]\n", "y=[]\n", "for df_train,num in tqdm(zip(df_trains,nums),total=len(df_trains)):\n", " images = list(df_train.columns.values)\n", " for x in images:\n", " try:\n", " paths.append(get_path(num,x))\n", " y.append(LABELS.index(df_train[x]['label']))\n", " except Exception as err:\n", " #print(err)\n", " pass" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "tags": [] }, "outputs": [ { "name": "stderr", 
"output_type": "stream", "text": [ "100%|██████████| 3/3 [00:01<00:00, 2.73it/s]\n" ] } ], "source": [ "val_paths=[]\n", "val_y=[]\n", "for df_val,num in tqdm(zip(df_vals,val_nums),total=len(df_vals)):\n", " images = list(df_val.columns.values)\n", " for x in images:\n", " try:\n", " # print(x)\n", " val_paths.append(get_path(num,x))\n", " val_y.append(LABELS.index(df_val[x]['label']))\n", " except Exception as err:\n", " #print(err)\n", " pass\n" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "df_train = pd.DataFrame({\n", " 'paths': paths,\n", " 'labels': y,\n", "})\n", "\n", "df_val = pd.DataFrame({\n", " 'paths': val_paths,\n", " 'labels': val_y\n", "})\n", " \n", "df_train.to_csv('df_train.csv', index = False)\n", "df_val.to_csv('df_val.csv', index = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Preparing training data with `mlrun`\n", "\n", "As seen below, the number of images belonging to class `FAKE` is much higher than that of `REAL`. 
But a quick way to fix that would be to use another famous face dataset to help increase the number of `REAL` images" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# mlrun: start-code" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "from mlrun.artifacts import PlotArtifact\n", "\n", "# https://github.com/mlrun/mlrun/blob/2f707f068f058794f2cbec2e802766a09e483e91/mlrun/datastore/base.py#L219\n", "from mlrun.datastore import DataItem \n", "\n", "def prep_data(context, source_train, source_val):\n", " \n", " '''\n", " prep training data\n", " '''\n", " \n", " # Convert the DataItem to a pandas DataFrame\n", " df_train = source_train.as_df() \n", " df_train['labels'] = df_train['labels'].astype('category').cat.codes \n", "\n", " # Record the DataFrame length after the run\n", " context.log_result('num_rows_in_training_set', df_train.shape[0])\n", " context.log_dataset('deepfake_dataset_train', df=df_train, format='csv', index=False)\n", " \n", " '''\n", " prep validation data\n", " '''\n", " \n", " # Convert the DataItem to a pandas DataFrame\n", " df_val = source_val.as_df() \n", " df_val['labels'] = df_val['labels'].astype('category').cat.codes \n", "\n", " # Record the DataFrame length after the run\n", " context.log_result('num_rows_in_validation_set', df_val.shape[0])\n", " context.log_dataset('deepfake_dataset_val', df=df_val, format='csv', index=False)\n", " \n", " '''\n", " prep data vis\n", " '''\n", " \n", " fig, ax = plt.subplots(nrows = 1, ncols = 2, figsize = (8,4))\n", " fig.suptitle('Visualizing training + validation data')\n", " \n", " unique, counts = np.unique(df_train['labels'].values, return_counts=True)\n", " \n", " ax[0].bar(['REAL', 'FAKE'], counts, alpha = 0.7, color = 'g')\n", " ax[0].grid()\n", " \n", " unique, counts = np.unique(df_val['labels'].values, return_counts=True)\n", " 
ax[1].bar(['REAL', 'FAKE'], counts, alpha = 0.7, color = 'y')\n", " ax[1].grid()\n", " \n", " context.log_artifact(PlotArtifact('data_vis', body=fig))\n", " \n", " print('complete :)')" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# mlrun: end-code" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# Convert the local prep_data function to an MLRun project function\n", "data_prep_func = mlrun.code_to_function(\n", " name='prep_data', \n", " kind='job', \n", " image='mlrun/mlrun'\n", ")" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "config = {\n", " 'source_train': 'df_train.csv',\n", " 'source_val': 'df_val.csv'\n", "}" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "> 2021-07-03 12:23:10,420 [info] starting run prep_data uid=c59753f84f774967b99652a9dc13eb86 DB=./api_dir\n", "complete :)\n" ] }, { "data": { "text/html": [ "\n", "
project | \n", "uid | \n", "iter | \n", "start | \n", "state | \n", "name | \n", "labels | \n", "inputs | \n", "parameters | \n", "results | \n", "artifacts | \n", "
---|---|---|---|---|---|---|---|---|---|---|
deepfake-shield-mayukh | \n", "...13eb86 | \n",
" 0 | \n", "Jul 03 06:53:10 | \n", "completed | \n", "prep_data | \n", "kind= owner=mayukh host=leopard | \n",
" source_train source_val | \n",
" \n", " | num_rows_in_training_set=104890 num_rows_in_validation_set=7366 | \n",
" deepfake_dataset_train deepfake_dataset_val data_vis | \n",
"
project | \n", "uid | \n", "iter | \n", "start | \n", "state | \n", "name | \n", "labels | \n", "inputs | \n", "parameters | \n", "results | \n", "artifacts | \n", "
---|---|---|---|---|---|---|---|---|---|---|
deepfake-shield-mayukh | \n", "...cfc762 | \n",
" 0 | \n", "Jul 03 08:09:05 | \n", "completed | \n", "deep-shield-grid-search-hyperparams | \n", "kind=handler owner=mayukh | \n",
" \n", " | \n", " | best_iteration=6 loss=7.21016263961792 | \n",
" iteration_results | \n",
"