{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Script to scrape Argentina Primera Division stats for the year of 2023\n", "Author: Luis Mejia\n", "
    \n", "
  1. Imports and declarations
  2. \n", "
  3. Get the HTML code from Wikipedia
  4. \n", "
  5. Scrape JSON file and load data into their own DataFrames
  6. \n", "
  7. Insert all DataFrames into my personal Google BigQuery project
  8. \n", "
" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import requests\n", "from datetime import date\n", "from google.cloud import bigquery\n", "import locale\n", "\n", "client = bigquery.Client.from_service_account_json(r'service_account.json')\n", "today = date.today()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "# create a mapping from Spanish to English month names\n", "month_mapping = {'enero': 'january',\n", " 'febrero': 'february',\n", " 'marzo': 'march',\n", " 'abril': 'april',\n", " 'mayo': 'may',\n", " 'junio': 'june',\n", " 'julio': 'july',\n", " 'agosto': 'august',\n", " 'septiembre': 'september',\n", " 'octubre': 'october',\n", " 'noviembre': 'november',\n", " 'diciembre': 'december'}\n", "\n", "# define a function to convert the Spanish month names to English\n", "def convert_month(fecha):\n", " fecha_parts = fecha.split()\n", " if len(fecha_parts) == 2:\n", " day = fecha_parts[0][:2]\n", " spanish_month = fecha_parts[0][2:] + \" \" + fecha_parts[1]\n", " else:\n", " day = fecha_parts[0]\n", " spanish_month = fecha_parts[2].lower()\n", " english_month = month_mapping.get(spanish_month.lower())\n", " if english_month is None:\n", " return fecha\n", " return f\"{day} {english_month} 2023\"" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "def import_to_gbq(df, destination_table):\n", " try:\n", " ## If you want the column and row count then\n", " table_id = destination_table\n", " job_config = bigquery.LoadJobConfig(\n", " schema=[\n", " ],\n", " write_disposition=\"WRITE_TRUNCATE\",\n", " )\n", "\n", " job = client.load_table_from_dataframe(\n", " df, table_id, job_config=job_config\n", " ) # Make an API request.\n", "\n", " job.result() # Wait for the job to complete.\n", " table = client.get_table(table_id) # Make an API request.\n", " print(\"Loaded {} rows and {} columns to {}\".format(table.num_rows, len(table.schema), table_id))\n", " except Exception as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "url = 'https://es.wikipedia.org/wiki/Campeonato_de_Primera_Divisi%C3%B3n_2023_(Argentina)'\n", "response = requests.request(\"GET\", url)\n", "df = pd.read_html(response.text.encode('utf-8'))" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "#Loop through all the game dates tables and get the scores/schedule for each, and lastly append all of them into df_resultados_new dataframe\n", "try:\n", " resultados_list = []\n", " fechas = range(11,38)\n", " for fecha in fechas:\n", " resultados = pd.DataFrame(df[fecha].values, columns = df[fecha].columns.get_level_values(1)) \n", " resultados_list.append(resultados)\n", " \n", " df_resultados_new = pd.concat(resultados_list)\n", " df_resultados_new = df_resultados_new.rename(\n", " columns={\n", " 'Local': 'equipo_local', \n", " 'Resultado': 'resultado', \n", " 'Visitante': 'equipo_visitante',\n", " 'Capacidad': 'capacidad',\n", " 'Estadio': 'estadio',\n", " 'Fecha': 'fecha',\n", " 'Hora': 'hora'\n", " })\n", " \n", " # df_resultados_new['fecha'] = df_resultados_new['fecha'].apply(convert_month)\n", " df_resultados_new = df_resultados_new.assign(pais='Argentina', liga='Primera Division', temporada=2023, archive_date=today)\n", "\n", " df_resultados_new['fecha_juego'] = df_resultados_new['fecha'] + ' ' + str('2023')\n", " df_resultados_new['fecha_juego'] = df_resultados_new['fecha_juego'].apply(convert_month)\n", " df_resultados_new['fecha_juego'] = pd.to_datetime(df_resultados_new['fecha_juego'], format='%d %B %Y')\n", "\n", " # df_resultados_new.to_csv(r'C:\\Users\\lmeji\\OneDrive\\Github\\Python\\Python-Scripts\\apis\\api_argentina\\csv\\fechas.csv',index=False)\n", "except Exception as e:\n", " print(e) " ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "#Get all other dataframes ready before going into BigQuery\n", "\n", "#Tabla de posiciones\n", "df_tabla_posiciones = df[8]\n", "df_tabla_posiciones['Dif.'] = pd.to_numeric(df_tabla_posiciones['Dif.'].replace('−', '-', regex=True))\n", "df_tabla_posiciones['Pos.'] = pd.to_numeric(df_tabla_posiciones['Pos.'].replace('.º', '', regex=True))\n", "df_tabla_posiciones = df_tabla_posiciones.assign(pais='Argentina', liga='Primera Division', temporada=2023, archive_date=today)\n", "\n", "df_tabla_posiciones = df_tabla_posiciones.rename(\n", " columns={\n", " 'Pos.': 'posicion', \n", " 'Equipo': 'equipo', \n", " 'Pts.': 'puntos',\n", " 'PJ': 'partidos_jugados',\n", " 'PG': 'partidos_ganados',\n", " 'PE': 'partidos_empatados',\n", " 'PP': 'partidos_perdidos',\n", " 'GF': 'goles_favor',\n", " 'GC': 'goles_contra',\n", " 'Dif.': 'diferencia',\n", " 'Pais': 'pais',\n", " 'Liga': 'liga',\n", " 'Temporada': 'temporada',\n", " 'Archive_date': 'archive_date',\n", " })\n", "\n", "#Equipos Participantes\n", "df_equipos_participantes = df[6]\n", "df_equipos_participantes = df_equipos_participantes.assign(pais='Argentina', liga='Primera Division', temporada=2023, archive_date=today)\n", "df_equipos_participantes = df_equipos_participantes.rename(\n", " columns={\n", " 'Equipo.': 'equipo', \n", " 'Ciudad': 'ciudad', \n", " 'Estadio.': 'estadio',\n", " 'Capacidad': 'capacidad',\n", " 'Pais': 'pais',\n", " 'Liga': 'liga',\n", " 'Temporada': 'temporada',\n", " 'Archive_date': 'archive_date'\n", " })" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded 28 rows and 14 columns to fleet-parser-330316.luistest.futbol_tabla_posiciones\n", "Loaded 28 rows and 8 columns to fleet-parser-330316.luistest.futbol_equipos_participantes\n", "Loaded 378 rows and 11 columns to fleet-parser-330316.luistest.futbol_resultados\n" ] } ], "source": [ "import_to_gbq(df_tabla_posiciones, 'fleet-parser-330316.luistest.futbol_tabla_posiciones')\n", "import_to_gbq(df_equipos_participantes, 'fleet-parser-330316.luistest.futbol_equipos_participantes')\n", "import_to_gbq(df_resultados_new, 'fleet-parser-330316.luistest.futbol_resultados')" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "9843bc1ea15960b23d1129215741697ef4ecad5270763ed37801bbf4f937957f" } } }, "nbformat": 4, "nbformat_minor": 2 }