{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from moviepy.editor import VideoFileClip, concatenate_videoclips"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_dir = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\Videos_raw\"\n",
    "clip1 = VideoFileClip(os.path.join(base_dir,\"1005b_part1.mp4\"))\n",
    "clip2 = VideoFileClip(os.path.join(base_dir,\"1005b_part2.mp4\"))\n",
    "final_clip = concatenate_videoclips([clip1,clip2])\n",
    "final_clip.write_videofile(\"1005b.mp4\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_file = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\\1105e\\FullDataset\\Final_data_1105e.pkl\"\n",
    "data = pd.read_pickle(data_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def windows(data, size,factor=2):\n",
    "    start = 0\n",
    "    while start + (size / factor) < len(data):\n",
    "        yield int(start), int(start + size)\n",
    "        start += (size / factor)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def to_seconds(time,factor = 1000):\n",
    "    seconds = time/factor\n",
    "    return round(seconds,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_pictures_dataset(data,user,window_size,factor =1):\n",
    "    df_pictures = {\"id\":[],\"target\":[],\"start\":[],\"end\":[]}\n",
    " \n",
    "    for idx,(start, end) in enumerate(windows(data.index, window_size,factor)):\n",
    "        df_pictures[\"id\"].append(f\"{user}_{idx}\")\n",
    "        df_pictures[\"target\"].append(data.loc[start:end,\"picture\"].mode()[0])\n",
    "        df_pictures[\"start\"].append(to_seconds(data.loc[start,\"Recording timestamp\"]))\n",
    "        df_pictures[\"end\"].append(to_seconds(data.loc[end,\"Recording timestamp\"]))\n",
    "        \n",
    "    return pd.DataFrame.from_dict(df_pictures).set_index(\"id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "user = \"1105e\"\n",
    "test = get_pictures_dataset(data,user,500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_nulls(df):\n",
    "    #get samples of majoritary class different than null\n",
    "    samples_max=max(test[test[\"target\"]!=\"Null\"]\n",
    "                    .target \\\n",
    "                    .value_counts() ) \n",
    "    #get id for null and rest of labels\n",
    "    null_idx=df[df[\"target\"]==\"Null\"].index\n",
    "    samples_null = len(null_idx)\n",
    "    if(samples_null>samples_max):\n",
    "        #perform random undersampling\n",
    "        rest_idx = df[df[\"target\"]!=\"Null\"].index\n",
    "        # sample a subset of id from null samples to match majoritary class samples\n",
    "        selected_idx = np.random.choice(range(samples_null), size=samples_max, replace=False)\n",
    "        null_idx=null_idx[selected_idx]\n",
    "        #filter df \n",
    "        df = df.loc[null_idx|rest_idx]\n",
    "\n",
    "    return df\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df= filter_nulls(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Video"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from moviepy.editor import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_video = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\Videos_raw\\1105e.mp4\"\n",
    "video = VideoFileClip(path_video).resize((224,224))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "video_dataset = {\n",
    "    \"video_path\": [],\n",
    "    \"label\": [],\n",
    "    \"frames\": []\n",
    "}\n",
    "root_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\"\n",
    "video_folder = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Participants\\1105e\\Clips\"\n",
    "participant =\"1105e\"\n",
    "if not(os.path.isdir(video_folder)):\n",
    "    os.makedirs(video_folder)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for idx in tqdm(df.index):\n",
    "    #get clip\n",
    "    clip = video.subclip(df.loc[idx,\"start\"],df.loc[idx,\"end\"])\n",
    "    n_frames  = clip.duration * clip.fps\n",
    "    label  = df.loc[idx,\"target\"]\n",
    "    # Write the result to a file \n",
    "    clip_id = df.loc[idx,\"id\"]\n",
    "    video_rel_path = os.path.join(participant,\"Clips\",f\"{clip_id}.mp4\")\n",
    "    clip_file= os.path.join(root_path,video_rel_path)\n",
    "    clip.write_videofile(clip_file,audio=False,logger = None)\n",
    "    #update dataset\n",
    "    video_dataset[\"video_path\"].append(video_rel_path)\n",
    "    video_dataset[\"label\"].append(label)\n",
    "    video_dataset[\"frames\"].append(n_frames)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "video_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "media_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\ProjectData\\Media\"\n",
    "participants_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\ProjectData\\Participants\"\n",
    "recordings_path = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\ProjectData\\Recordings\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "def get_media_keys(xml_file):\n",
    "    tree = ET.parse(xml_file)\n",
    "    root = tree.getroot()\n",
    "    key = root.find(\"Key\").text\n",
    "    filename = root.find(\"TargetFileName\").text\n",
    "    return key,filename\n",
    "\n",
    "def get_participant_keys(xml_file):\n",
    "    tree = ET.parse(xml_file)\n",
    "    root = tree.getroot()\n",
    "    key = root.find(\"Key\").text\n",
    "    participant = root.find(\"Name\").text\n",
    "    return key,participant\n",
    "\n",
    "def get_recording_keys(xml_file):\n",
    "    tree = ET.parse(xml_file)\n",
    "    participant_id = next(elem.text for elem in tree.iter() if \"ParticipantId\" in elem.tag)\n",
    "    media_id  = next(elem.text for elem in tree.iter() if \"guid\" in elem.tag)\n",
    "    return participant_id, media_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\ProjectData\\Recordings\\BqQksdKnyUK4EYy9bga5Yg.rec\"\n",
    "tree  = ET.parse(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "dcd22fbf-4ea2-49cf-90da-463d9e49e9ae\n"
     ]
    }
   ],
   "source": [
    "for el in tree.iter():\n",
    "    if(\"guid\" in el.tag):\n",
    "        print(el.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "#define key mappings\n",
    "recording_files =  [os.path.join(recordings_path,file) for file in os.listdir(recordings_path) if \"rec\" in file]\n",
    "recordings = {}\n",
    "for idx,file in enumerate(recording_files):\n",
    "    participant_id, media_id = get_recording_keys(file)\n",
    "    recordings[idx]  = {participant_id: \"\" , media_id: \"\"}\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "x = recordings[1].keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['2872c273-20e0-4ed3-a3bf-1908a208a760', 'd0dcbb3f-3360-4233-b1e8-9da4a85c3257'])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'2872c273-20e0-4ed3-a3bf-1908a208a760' in x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "for file in os.listdir(participants_path):\n",
    "    file = os.path.join(participants_path,file)\n",
    "    # get participant key\n",
    "    participant_key,participant_name = get_participant_keys(file)\n",
    "    for key in recordings.keys():\n",
    "        if (participant_key in recordings[key].keys()):\n",
    "            recordings[key][participant_key] = participant_name\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "media_files = [os.path.join(media_path,file) for file in os.listdir(media_path)\n",
    "                                              if \"xml\" in file]\n",
    "for file in media_files:\n",
    "    # get participant key\n",
    "    media_key,recording_filename = get_media_keys(file)\n",
    "    for key in recordings.keys():\n",
    "        if (media_key in recordings[key].keys()):\n",
    "            recordings[key][media_key] = recording_filename\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "recordings_metadata_file = r\"C:\\Users\\jeuux\\Desktop\\Carrera\\MoAI\\TFM\\AnnotatedData\\FinalDatasets\\Datasets\\Video_Dataset_meta\\recordings_meta.json\"\n",
    "with open(recordings_metadata_file,\"w\") as f:\n",
    "    json.dump(recordings,f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# participant_keys = [get_hash_root(os.path.join(xml_dir,xml_file)) for xml_file in os.listdir(participants_path)]\n",
    "media_keys = [get_hash_root(os.path.join(media_path,xml_file))\n",
    "                                                  for xml_file in os.listdir(media_path)\n",
    "                                                     if \"xml\" in xml_file]\n",
    "participant_keys = {}\n",
    "\n",
    "for xml_file in os.listdir(participants_path):\n",
    "    if(\"rec\" in xml_file):\n",
    "        key,participant  = get_key_name(os.path.join(participants_path,xml_file))\n",
    "        participant_keys[key] = participant\n",
    "        \n",
    "participant_ids = [get_participant_id(os.path.join(recordings_path,xml_file))for xml_file in os.listdir(recordings_path)\n",
    "                                                  if \"rec\" in xml_file]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Hi\n"
     ]
    }
   ],
   "source": [
    "ext_rec_id  = \"b27628c4-2e7f-4e8e-8dd3-d933d165f04e\"\n",
    "key = \"b124a406-a7d2-42c9-b811-8cbd6e06b962\"\n",
    "media_dp1 = \"dcd22fbf-4ea2-49cf-90da-463d9e49e9ae\"\n",
    "for keys in media_keys:\n",
    "    if(media_dp1==keys):\n",
    "        print(\"Hi\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "media_keys.values()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "xml_file = os.path.join(participants_path,xml_file)\n",
    "tree = ET.parse(xml_file)\n",
    "root = tree.getroot()\n",
    "key = root.find(\"Key\").text\n",
    "participant = root.find(\"Name\").text\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "key"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for e in elem.iter():\n",
    "    print(e)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tree.getroot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for keys in participant_ids:\n",
    "    try:\n",
    "        next(media_key for media_key in media_keys if media_key==key)\n",
    "        print (\"Match!\")\n",
    "    except:\n",
    "        print(\"there is not any match\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(participant_keys)==len(media_keys)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(test_key in participant_keys)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def find_recording(recordings_path,hash_root):\n",
    "    for file in os.listdir(recordings_path):\n",
    "        if(\"mp4\" in file):\n",
    "            root = file.split(\"==\")[0]\n",
    "            if(root==hash_root):\n",
    "                print(root,hash_root)\n",
    "                return True "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for xml_file in os.listdir(xml_dir):\n",
    "    hash_root = get_hash_root(os.path.join(xml_dir,xml_file))\n",
    "    find_recording(recordings_path,hash_root)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_hash = \"4KpMREZBURFy2YS6VpeYYw\"\n",
    "for file in os.listdir(xml_dir):\n",
    "    print(get_file_root(file) in test_hash)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_file_root(path):\n",
    "    base=os.path.basename(path)\n",
    "    return os.path.splitext(base)[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}