# EMT

# Generación de los dataset para cada día

In [None]:
# Base directory, under ROOT_PATH, for the EMT training data of 2024-03.
EMT_TRAIN_PATH = os.path.join(
    ROOT_PATH, "data", "train", "emt", "2024", "03"
)

## Dataset final

In [None]:
def create_final_dataset(sample_data):
    """Clean a slice of raw EMT arrival estimates and materialise it.

    Steps, in order: build the ``PK`` key, discard rows whose
    ``estimateArrive`` is 888888 or more, keep the column-wise minimum per
    ``PK``, fix column types, fill missing ``dayType`` values and drop the
    columns that are not needed afterwards.

    Args:
        sample_data: Lazy polars frame (``.collect()`` is called on it).
            ``datetime`` must hold strings formatted as
            ``%Y-%m-%d %H:%M:%S.%f``.

    Returns:
        The collected polars ``DataFrame``.
    """
    # PK = "<datetime>_B<bus>_L<line>_S<stop>", built from the raw (string)
    # datetime before it is parsed below.
    pk_expr = (
        pl.col('datetime').cast(pl.String)
        + "_B" + pl.col('bus').cast(pl.String)
        + "_L" + pl.col('line').cast(pl.String)
        + "_S" + pl.col('stop').cast(pl.String)
    ).alias('PK')
    sample_data = sample_data.with_columns(pk_expr)

    # Keep only rows with estimateArrive below 888888.
    # NOTE(review): 888888 is presumably a sentinel for "no usable
    # estimate" -- confirm against the EMT API documentation.
    sample_data = sample_data.filter(pl.col('estimateArrive') < 888888)

    # Collapse duplicated keys, keeping the minimum of every column.
    sample_data = sample_data.group_by('PK').min()

    sample_data = sample_data.with_columns(
        pl.col("date").cast(pl.Date),
        pl.col('isHead').cast(pl.UInt8),
    )

    # Parse the datetime strings. return_dtype is given explicitly so the
    # lazy schema is known without running the Python function.
    sample_data = sample_data.with_columns(
        pl.col('datetime').map_elements(
            lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f"),
            return_dtype=pl.Datetime,
        )
    )

    # Fill null dayType values from the date. map_elements replaces the
    # deprecated Expr.apply; return_dtype is left unset because the return
    # type of get_type_day is not visible here.
    sample_data = sample_data.with_columns(
        pl.when(pl.col('dayType').is_null())
        .then(pl.col('date').map_elements(get_type_day))
        .otherwise(pl.col('dayType'))
        .alias('dayType')
    )

    # Drop the columns that are not used afterwards.
    sample_data = sample_data.drop(
        'positionTypeBus', 'deviation', 'MaximumFrequency',
        'StartTime', 'StopTime', 'strike',
    )

    return sample_data.collect()

## Dataset auxiliar

In [None]:
def calculate_predict_arrival_date(date_datetime, second):
    """Return ``date_datetime`` shifted forward by ``second`` seconds.

    Args:
        date_datetime: Starting ``datetime``.
        second: Number of seconds to add.

    Returns:
        The shifted ``datetime``.
    """
    offset = timedelta(seconds=second)
    return date_datetime + offset

In [None]:
def get_interval_time(date_datetime):
    """Return the hour window ``[hour - 1, hour + 1]`` around a timestamp.

    The bounds are not wrapped to the 0-23 range: hour 0 yields ``[-1, 1]``
    and hour 23 yields ``[22, 24]``.

    Args:
        date_datetime: Object exposing an ``hour`` attribute.

    Returns:
        A two-element list ``[hour - 1, hour + 1]``.
    """
    hour = int(date_datetime.hour)
    return [hour - 1, hour + 1]

In [None]:
def create_auxiliar_dataset(sample_data):
    """Build the auxiliary table pairing each estimate with a reliable arrival.

    For every row, ``predict_arrival_date`` is ``datetime`` plus
    ``estimateArrive`` seconds and ``interval_time`` is the hour window of
    ``datetime``. Rows with ``estimateArrive <= 60`` are grouped by
    (bus, line, stop, destination, date, interval_time); the minimum
    ``predict_arrival_date`` of each group becomes ``reliable_arrival_date``.
    Rows without a matching group are discarded.

    Args:
        sample_data: Polars frame containing, at least, ``PK``, ``datetime``,
            ``estimateArrive``, ``bus``, ``line``, ``stop``, ``destination``
            and ``date``.

    Returns:
        A frame with the columns ``PK``, ``reliable_arrival_date``,
        ``predict_arrival_date``, ``interval_time`` and ``estimateArrive``.
    """
    # Columns identifying one bus approaching one stop within an hour window.
    join_keys = ['bus', 'line', 'stop', 'destination', 'date', 'interval_time']

    # Predicted arrival = observation time + estimated seconds to arrival.
    # The struct fields are named by the keyword arguments.
    sample_data_aux = sample_data.with_columns(
        pl.struct(
            datetime=pl.col('datetime'),
            estimateArrive=pl.col('estimateArrive'),
        )
        .map_elements(
            lambda row: calculate_predict_arrival_date(
                row['datetime'], row['estimateArrive']
            )
        )
        .alias('predict_arrival_date')
    )

    # map_elements replaces the deprecated Expr.apply.
    sample_data_aux = sample_data_aux.with_columns(
        pl.col('datetime').map_elements(get_interval_time).alias('interval_time')
    )

    # Estimates of at most 60 seconds are taken as the reliable reference.
    small_sample_data = (
        sample_data_aux.filter(pl.col('estimateArrive') <= 60)
        .group_by(join_keys)
        .min()
        .with_columns(
            pl.col('predict_arrival_date').alias('reliable_arrival_date')
        )
    )

    final_sample_data = sample_data_aux.join(
        small_sample_data, on=join_keys, how='left'
    )

    # Keep only the rows that found a reliable arrival in their group.
    final_sample_data = final_sample_data.filter(
        pl.col('reliable_arrival_date').is_not_null()
    )

    return final_sample_data.select(
        'PK',
        'reliable_arrival_date',
        'predict_arrival_date',
        'interval_time',
        'estimateArrive',
    )

In [None]:
# Lazy scan of the monthly EMT CSV for 2024-03.
data = pl.scan_csv(
    os.path.join(EMT_DATA_PATH, "2024", "03", "emt_202403.csv")
)
# Zero-padded day numbers to process: 02-08 and 11-31
# (01, 09 and 10 are not part of the list).
list_day = [f"{d:02d}" for d in range(2, 32) if d not in (9, 10)]

In [None]:
# Write, for every day in list_day, the cleaned EMT dataset and its
# auxiliary dataset as CSV files under EMT_TRAIN_PATH/<day>/.
for day in list_day:
    print(day)
    path_folder = os.path.join(EMT_TRAIN_PATH, day)
    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path_folder, exist_ok=True)

    sample_data = data.filter(pl.col('date') == f"2024-03-{day}")
    sample_data = create_final_dataset(sample_data)
    # Written through pandas, so the CSV includes the pandas index column.
    sample_data.to_pandas().to_csv(
        os.path.join(path_folder, f"emt_202403{day}.csv")
    )

    sample_data_aux = create_auxiliar_dataset(sample_data)
    sample_data_aux.to_pandas().to_csv(
        os.path.join(path_folder, f"emt_202403{day}_aux.csv")
    )

# Informo

## Dataset final

In [None]:
# Base directory, under ROOT_PATH, for the Informo training data of 2024-03.
INFORMO_TRAIN_PATH = os.path.join(
    ROOT_PATH, "data", "train", "informo", "2024", "03"
)

In [None]:
def create_final_dataset(sample_data):
    """Clean a slice of raw Informo measurements and materialise it.

    Steps, in order: keep rows with a non-null ``idelem`` and
    ``error == 'N'``, drop ``velocidad`` and ``error``, parse ``datetime``
    and cast ``date``, build the ``PK`` key, keep rows with
    ``nivelServicio >= 0`` and a non-null ``intensidadSat``, and drop
    ``accesoAsociado``.

    NOTE: this definition reuses the name of the EMT ``create_final_dataset``
    defined earlier in the file, so it replaces that function once executed.

    Args:
        sample_data: Lazy polars frame (``.collect()`` is called on it).
            ``datetime`` must hold strings formatted as
            ``%Y-%m-%d %H:%M:%S``.

    Returns:
        The collected polars ``DataFrame``.
    """
    # Preliminary filtering: valid element id and no error flag.
    sample_data = sample_data.filter(
        pl.col('idelem').is_not_null(),
        pl.col('error') == 'N',
    ).drop('velocidad', 'error')

    # Column types. return_dtype is given explicitly so the lazy schema is
    # known without running the Python function; the PK below casts this
    # column.
    sample_data = sample_data.with_columns(
        pl.col('datetime').map_elements(
            lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"),
            return_dtype=pl.Datetime,
        ),
        pl.col('date').cast(pl.Date),
    )

    # PK = "<datetime>_I<idelem>_S<subarea>", built from the parsed datetime.
    pk_expr = (
        pl.col('datetime').cast(pl.String)
        + "_I" + pl.col('idelem').cast(pl.String)
        + "_S" + pl.col('subarea').cast(pl.String)
    ).alias('PK')
    sample_data = sample_data.with_columns(pk_expr)

    # Service level: keep non-negative values only.
    sample_data = sample_data.filter(pl.col('nivelServicio') >= 0)

    # Saturation intensity must be present.
    sample_data = sample_data.filter(pl.col('intensidadSat').is_not_null())

    # The associated-access column is not kept.
    sample_data = sample_data.drop('accesoAsociado')

    return sample_data.collect()

In [None]:
# Lazy scan of the monthly Informo CSV for 2024-03 (parse errors ignored).
data = pl.scan_csv(
    os.path.join(INFORMO_DATA_PATH, "2024", "03", "informo_202403.csv"),
    ignore_errors=True,
)

In [None]:
# Preview the first five rows of the scanned frame (notebook cell output).
data.head(5).collect()

In [None]:
# Zero-padded day numbers to process: 02-08 and 11-31
# (01, 09 and 10 are not part of the list).
list_day = [f"{d:02d}" for d in range(2, 32) if d not in (9, 10)]

In [None]:
# Write, for every day in list_day, the cleaned Informo dataset as a CSV
# file under INFORMO_TRAIN_PATH/<day>/.
for day in list_day:
    print(day)
    path_folder = os.path.join(INFORMO_TRAIN_PATH, day)
    # makedirs(exist_ok=True) also creates missing parent directories and
    # avoids the check-then-create race of exists() + mkdir().
    os.makedirs(path_folder, exist_ok=True)

    sample_data = data.filter(pl.col('date') == f"2024-03-{day}")
    sample_data = create_final_dataset(sample_data)
    # NOTE(review): the file name keeps the 'emt_' prefix although this is
    # Informo data -- it looks like a copy-paste slip. Left unchanged here
    # because downstream readers may rely on it; confirm before renaming.
    # Written through pandas, so the CSV includes the pandas index column.
    sample_data.to_pandas().to_csv(
        os.path.join(path_folder, f"emt_202403{day}.csv")
    )