{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from datetime import datetime, timedelta\n",
    "\n",
    "import polars as pl\n",
    "\n",
    "# NOTE(review): ROOT_PATH, EMT_DATA_PATH, INFORMO_DATA_PATH and get_type_day are used\n",
    "# below but are not defined in this notebook. Presumably they come from a project\n",
    "# module: import or define them here so Restart & Run All works (TODO confirm)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# EMT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generación de los dataset para cada día"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "EMT_TRAIN_PATH = os.path.join(ROOT_PATH, 'data', 'train', 'emt', '2024', '03')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_emt_final_dataset(sample_data):\n",
    "    \"\"\"Build the cleaned EMT dataset for one day.\n",
    "\n",
    "    Steps: add the 'PK' key, drop rows with 'estimateArrive' >= 888888, keep one\n",
    "    row per PK (column-wise minimum), fix column types, fill null 'dayType'\n",
    "    values from the date and drop the columns that are not needed.\n",
    "\n",
    "    Args:\n",
    "        sample_data: polars LazyFrame with the raw EMT records of one day.\n",
    "\n",
    "    Returns:\n",
    "        polars DataFrame (the lazy query is collected before returning).\n",
    "    \"\"\"\n",
    "    # PK = <datetime>_B<bus>_L<line>_S<stop>\n",
    "    primary_key = (\n",
    "        pl.col('datetime').cast(pl.String)\n",
    "        + '_B' + pl.col('bus').cast(pl.String)\n",
    "        + '_L' + pl.col('line').cast(pl.String)\n",
    "        + '_S' + pl.col('stop').cast(pl.String)\n",
    "    )\n",
    "    sample_data = sample_data.with_columns(primary_key.alias('PK'))\n",
    "\n",
    "    # Keep rows whose estimateArrive is below 888888.\n",
    "    # NOTE(review): the original comment said 'ETA <2400', which does not match\n",
    "    # this threshold - confirm the intended cut-off.\n",
    "    sample_data = sample_data.filter(pl.col('estimateArrive') < 888888)\n",
    "    # One row per PK; every other column is reduced to its minimum.\n",
    "    sample_data = sample_data.group_by('PK').min()\n",
    "\n",
    "    sample_data = sample_data.with_columns(\n",
    "        pl.col('date').cast(pl.Date),\n",
    "        pl.col('isHead').cast(pl.UInt8),\n",
    "    )\n",
    "\n",
    "    # Parse the timestamp strings into datetimes.\n",
    "    sample_data = sample_data.with_columns(\n",
    "        pl.col('datetime').map_elements(\n",
    "            lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S.%f'),\n",
    "            return_dtype=pl.Datetime,\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Fill null dayType values from the date.\n",
    "    # map_elements is the current name of the deprecated Expr.apply.\n",
    "    sample_data = sample_data.with_columns(\n",
    "        pl.when(pl.col('dayType').is_null())\n",
    "        .then(pl.col('date').map_elements(get_type_day))\n",
    "        .otherwise(pl.col('dayType'))\n",
    "        .alias('dayType')\n",
    "    )\n",
    "\n",
    "    # Drop the columns that are not needed.\n",
    "    sample_data = sample_data.drop(\n",
    "        'positionTypeBus', 'deviation', 'MaximumFrequency', 'StartTime', 'StopTime', 'strike'\n",
    "    )\n",
    "\n",
    "    return sample_data.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset auxiliar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calculate_predict_arrival_date(date_datetime, second):\n",
    "    \"\"\"Return date_datetime shifted forward by `second` seconds.\"\"\"\n",
    "    return date_datetime + timedelta(seconds=second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_interval_time(date_datetime):\n",
    "    \"\"\"Return [hour - 1, hour + 1] for the hour of date_datetime.\n",
    "\n",
    "    The bounds are not wrapped around midnight: hour 0 yields -1 as lower bound\n",
    "    and hour 23 yields 24 as upper bound.\n",
    "    \"\"\"\n",
    "    hour = int(date_datetime.hour)\n",
    "\n",
    "    return [hour - 1, hour + 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_auxiliar_dataset(sample_data):\n",
    "    \"\"\"Build the auxiliary EMT dataset with predicted and reliable arrival times.\n",
    "\n",
    "    Adds 'predict_arrival_date' (datetime + estimateArrive seconds) and\n",
    "    'interval_time' (see get_interval_time). 'reliable_arrival_date' is the\n",
    "    minimum 'predict_arrival_date' among the rows of the same\n",
    "    (bus, line, stop, destination, date, interval_time) group whose\n",
    "    'estimateArrive' is <= 60. Rows without such a group are dropped.\n",
    "\n",
    "    Args:\n",
    "        sample_data: frame returned by create_emt_final_dataset.\n",
    "\n",
    "    Returns:\n",
    "        Frame with the columns PK, reliable_arrival_date, predict_arrival_date,\n",
    "        interval_time and estimateArrive.\n",
    "    \"\"\"\n",
    "    group_keys = ['bus', 'line', 'stop', 'destination', 'date', 'interval_time']\n",
    "\n",
    "    sample_data_aux = sample_data.with_columns(\n",
    "        pl.struct('datetime', 'estimateArrive')\n",
    "        .map_elements(\n",
    "            lambda row: calculate_predict_arrival_date(row['datetime'], row['estimateArrive']),\n",
    "            return_dtype=pl.Datetime,\n",
    "        )\n",
    "        .alias('predict_arrival_date')\n",
    "    )\n",
    "\n",
    "    # map_elements is the current name of the deprecated Expr.apply.\n",
    "    sample_data_aux = sample_data_aux.with_columns(\n",
    "        pl.col('datetime')\n",
    "        .map_elements(get_interval_time, return_dtype=pl.List(pl.Int64))\n",
    "        .alias('interval_time')\n",
    "    )\n",
    "\n",
    "    # Rows with estimateArrive <= 60, reduced to one row per group.\n",
    "    small_sample_data = (\n",
    "        sample_data_aux.filter(pl.col('estimateArrive') <= 60)\n",
    "        .group_by(group_keys)\n",
    "        .min()\n",
    "        .with_columns(pl.col('predict_arrival_date').alias('reliable_arrival_date'))\n",
    "    )\n",
    "\n",
    "    final_sample_data = sample_data_aux.join(small_sample_data, on=group_keys, how='left')\n",
    "\n",
    "    # Rows whose group has no estimateArrive <= 60 row get a null here and are removed.\n",
    "    final_sample_data = final_sample_data.filter(pl.col('reliable_arrival_date').is_not_null())\n",
    "\n",
    "    return final_sample_data.select(\n",
    "        'PK', 'reliable_arrival_date', 'predict_arrival_date', 'interval_time', 'estimateArrive'\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emt_data = pl.scan_csv(os.path.join(EMT_DATA_PATH, '2024', '03', 'emt_202403.csv'))\n",
    "list_day = ['02','03','04','05','06','07','08','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for day in list_day:\n",
    "    print(day)\n",
    "    path_folder = os.path.join(EMT_TRAIN_PATH, day)\n",
    "    # makedirs also creates missing parent folders and does nothing if the folder exists.\n",
    "    os.makedirs(path_folder, exist_ok=True)\n",
    "\n",
    "    sample_data = emt_data.filter(pl.col('date') == f'2024-03-{day}')\n",
    "    sample_data = create_emt_final_dataset(sample_data)\n",
    "    sample_data.to_pandas().to_csv(os.path.join(path_folder, f'emt_202403{day}.csv'))\n",
    "\n",
    "    sample_data_aux = create_auxiliar_dataset(sample_data)\n",
    "    sample_data_aux.to_pandas().to_csv(os.path.join(path_folder, f'emt_202403{day}_aux.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Informo"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Dataset final"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INFORMO_TRAIN_PATH = os.path.join(ROOT_PATH, 'data', 'train', 'informo', '2024', '03')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_informo_final_dataset(sample_data):\n",
    "    \"\"\"Build the cleaned Informo dataset for one day.\n",
    "\n",
    "    Keeps rows with a non-null 'idelem' and error == 'N', parses the date\n",
    "    columns, adds the 'PK' key, keeps rows with nivelServicio >= 0 and a\n",
    "    non-null 'intensidadSat', and drops the columns that are not needed.\n",
    "\n",
    "    Args:\n",
    "        sample_data: polars LazyFrame with the raw Informo records of one day.\n",
    "\n",
    "    Returns:\n",
    "        polars DataFrame (the lazy query is collected before returning).\n",
    "    \"\"\"\n",
    "    # Preliminary filtering\n",
    "    sample_data = sample_data.filter(\n",
    "        pl.col('idelem').is_not_null(),\n",
    "        pl.col('error') == 'N',\n",
    "    ).drop('velocidad', 'error')\n",
    "\n",
    "    # Column types\n",
    "    sample_data = sample_data.with_columns(\n",
    "        pl.col('datetime').map_elements(\n",
    "            lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'),\n",
    "            return_dtype=pl.Datetime,\n",
    "        ),\n",
    "        pl.col('date').cast(pl.Date),\n",
    "    )\n",
    "\n",
    "    # PK = <datetime>_I<idelem>_S<subarea>\n",
    "    primary_key = (\n",
    "        pl.col('datetime').cast(pl.String)\n",
    "        + '_I' + pl.col('idelem').cast(pl.String)\n",
    "        + '_S' + pl.col('subarea').cast(pl.String)\n",
    "    )\n",
    "    sample_data = sample_data.with_columns(primary_key.alias('PK'))\n",
    "\n",
    "    # Service level: keep non-negative values only\n",
    "    sample_data = sample_data.filter(pl.col('nivelServicio') >= 0)\n",
    "\n",
    "    # Saturation intensity: keep non-null values only\n",
    "    sample_data = sample_data.filter(pl.col('intensidadSat').is_not_null())\n",
    "\n",
    "    # Associated access: not needed\n",
    "    sample_data = sample_data.drop('accesoAsociado')\n",
    "\n",
    "    return sample_data.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "informo_data = pl.scan_csv(os.path.join(INFORMO_DATA_PATH, '2024', '03', 'informo_202403.csv'), ignore_errors=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "informo_data.head().collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_day = ['02','03','04','05','06','07','08','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "for day in list_day:\n",
    "    print(day)\n",
    "    path_folder = os.path.join(INFORMO_TRAIN_PATH, day)\n",
    "    # makedirs also creates missing parent folders and does nothing if the folder exists.\n",
    "    os.makedirs(path_folder, exist_ok=True)\n",
    "\n",
    "    sample_data = informo_data.filter(pl.col('date') == f'2024-03-{day}')\n",
    "    sample_data = create_informo_final_dataset(sample_data)\n",
    "    # NOTE(review): the file is named 'emt_...' although it holds Informo data. This looks\n",
    "    # like a copy-paste slip; the name is kept because downstream readers may depend on it.\n",
    "    sample_data.to_pandas().to_csv(os.path.join(path_folder, f'emt_202403{day}.csv'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}