{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Ejemplo de WebScraping con Python\n", "## Obtener Ibex35 bolsa de Madrid" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:53:23.662759Z", "start_time": "2019-01-27T17:53:23.651938Z" } }, "outputs": [], "source": [ "# import libraries\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import csv\n", "from datetime import datetime" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:38:57.659158Z", "start_time": "2019-01-27T17:38:57.655785Z" } }, "outputs": [], "source": [ "# indicar la ruta\n", "url_page = 'http://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:40:11.127856Z", "start_time": "2019-01-27T17:40:10.642601Z" } }, "outputs": [], "source": [ "# tarda 480 milisegundos\n", "page = requests.get(url_page).text \n", "soup = BeautifulSoup(page, \"lxml\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:42:00.515429Z", "start_time": "2019-01-27T17:42:00.504178Z" } }, "outputs": [], "source": [ "# Obtenemos la tabla por un ID específico\n", "tabla = soup.find('table', attrs={'id': 'ctl00_Contenido_tblÍndices'})\n", "tabla" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:55:08.443093Z", "start_time": "2019-01-27T17:55:08.436365Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Indice: IBEX 35®\n", "Valor: 9.185,20\n" ] } ], "source": [ "name=\"\"\n", "price=\"\"\n", "nroFila=0\n", "for fila in tabla.find_all(\"tr\"):\n", " if nroFila==1:\n", " nroCelda=0\n", " for celda in fila.find_all('td'):\n", " if nroCelda==0:\n", " name=celda.text\n", " print(\"Indice:\", name)\n", " if nroCelda==2:\n", " price=celda.text\n", " 
print(\"Valor:\", price)\n", " nroCelda=nroCelda+1\n", " nroFila=nroFila+1" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:56:28.279786Z", "start_time": "2019-01-27T17:56:28.271938Z" } }, "outputs": [], "source": [ "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n", "with open('bolsa_ibex35.csv', 'a') as csv_file:\n", " writer = csv.writer(csv_file)\n", " writer.writerow([name, price, datetime.now()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Obtener resultados de Fútbol\n", "## Ejemplo Liga BBVA - España - Primera - desde marcadores.com" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:26:04.881325Z", "start_time": "2019-01-27T18:26:04.877803Z" } }, "outputs": [], "source": [ "url_page = 'https://www.marcadores.com/futbol/espana/liga-bbva/?competitionRoundId=486942' # jornada 20" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:26:07.064454Z", "start_time": "2019-01-27T18:26:05.493551Z" } }, "outputs": [], "source": [ "# tarda 1500 milisegundos\n", "page = requests.get(url_page).text \n", "soup = BeautifulSoup(page, \"lxml\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:31:00.402982Z", "start_time": "2019-01-27T18:31:00.299063Z" } }, "outputs": [], "source": [ "# Obtenemos la tabla por su clase CSS\n", "tabla = soup.find('table', attrs={'class': 'matches'})\n", "tabla" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:33:29.634584Z", "start_time": "2019-01-27T18:33:29.621584Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Partido: Getafe vs Alavés (1 - 0)\n", "Partido: Real Madrid vs Sevilla (0 - 0)\n", "Partido: Huesca vs Atlético de Madrid (0 - 1)\n", "Partido: Celta vs 
Valencia (1 - 0)\n", "Partido: Betis vs Girona (1 - 2)\n", "Partido: Villarreal vs Athletic Bilbao (0 - 1)\n", "Partido: Rayo Vallecano vs Real Sociedad (2 - 1)\n", "Partido: Levante vs Valladolid (1 - 0)\n", "Partido: Barcelona vs Leganés (1 - 0)\n", "Partido: Eibar vs Espanyol (1 - 0)\n" ] } ], "source": [ "data = []\n", "equipo1=\"\"\n", "equipo2=\"\"\n", "resultado=\"\"\n", "nroFila=0\n", "for fila in tabla.find_all(\"tr\"):\n", " if nroFila>0:\n", " nroCelda=0\n", " capturar=False\n", " for celda in fila.find_all('td'):\n", " if nroCelda==1 and celda.text=='Fin.':\n", " capturar=True\n", " if capturar and nroCelda==2:\n", " equipo1=celda.text\n", " if capturar and nroCelda==4:\n", " equipo2=celda.text\n", " if capturar and nroCelda==5:\n", " resultado=celda.text\n", " print(\"Partido:\", equipo1,'vs',equipo2,resultado)\n", " data.append((equipo1,equipo2,resultado))\n", " nroCelda=nroCelda+1\n", " nroFila=nroFila+1" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:34:52.245736Z", "start_time": "2019-01-27T18:34:52.233185Z" } }, "outputs": [], "source": [ "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n", "with open('partidos_liga_primera.csv', 'a') as csv_file:\n", " writer = csv.writer(csv_file)\n", " for equipo1, equipo2,resultado in data:\n", " writer.writerow([equipo1, equipo2, resultado,datetime.now()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Otros ejemplos de WebScraping" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:37.428233Z", "start_time": "2019-01-28T18:12:37.420625Z" } }, "outputs": [], "source": [ "# Supongamos que tenemos el siguiente HTML\n", "pagina_web = \"\" \\\n", " + \"
\" \\\n", " + \"\" \\\n", " + \"