{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Ejemplo de WebScraping con Python\n", "## Obtener Ibex35 bolsa de Madrid" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:53:23.662759Z", "start_time": "2019-01-27T17:53:23.651938Z" } }, "outputs": [], "source": [ "# import libraries\n", "import requests\n", "from bs4 import BeautifulSoup\n", "import csv\n", "from datetime import datetime" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:38:57.659158Z", "start_time": "2019-01-27T17:38:57.655785Z" } }, "outputs": [], "source": [ "# indicar la ruta\n", "url_page = 'http://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx'" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:40:11.127856Z", "start_time": "2019-01-27T17:40:10.642601Z" } }, "outputs": [], "source": [ "# tarda 480 milisegundos\n", "page = requests.get(url_page).text \n", "soup = BeautifulSoup(page, \"lxml\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:42:00.515429Z", "start_time": "2019-01-27T17:42:00.504178Z" } }, "outputs": [], "source": [ "# Obtenemos la tabla por un ID específico\n", "tabla = soup.find('table', attrs={'id': 'ctl00_Contenido_tblÍndices'})\n", "tabla" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:55:08.443093Z", "start_time": "2019-01-27T17:55:08.436365Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Indice: IBEX 35®\n", "Valor: 9.185,20\n" ] } ], "source": [ "name=\"\"\n", "price=\"\"\n", "nroFila=0\n", "for fila in tabla.find_all(\"tr\"):\n", " if nroFila==1:\n", " nroCelda=0\n", " for celda in fila.find_all('td'):\n", " if nroCelda==0:\n", " name=celda.text\n", " print(\"Indice:\", name)\n", " if nroCelda==2:\n", " price=celda.text\n", " 
print(\"Valor:\", price)\n", " nroCelda=nroCelda+1\n", " nroFila=nroFila+1" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T17:56:28.279786Z", "start_time": "2019-01-27T17:56:28.271938Z" } }, "outputs": [], "source": [ "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n", "with open('bolsa_ibex35.csv', 'a') as csv_file:\n", " writer = csv.writer(csv_file)\n", " writer.writerow([name, price, datetime.now()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Obtener resultados de Futbol\n", "## Ejemplo Liga BBVA - España - Primera - desde marcadores.com" ] }, { "cell_type": "code", "execution_count": 38, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:26:04.881325Z", "start_time": "2019-01-27T18:26:04.877803Z" } }, "outputs": [], "source": [ "url_page = 'https://www.marcadores.com/futbol/espana/liga-bbva/?competitionRoundId=486942' # jornada 20" ] }, { "cell_type": "code", "execution_count": 39, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:26:07.064454Z", "start_time": "2019-01-27T18:26:05.493551Z" } }, "outputs": [], "source": [ "# tarda 1500 milisegundos\n", "page = requests.get(url_page).text \n", "soup = BeautifulSoup(page, \"lxml\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:31:00.402982Z", "start_time": "2019-01-27T18:31:00.299063Z" } }, "outputs": [], "source": [ "# Obtenemos la tabla por un ID específico\n", "tabla = soup.find('table', attrs={'class': 'matches'})\n", "tabla" ] }, { "cell_type": "code", "execution_count": 47, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:33:29.634584Z", "start_time": "2019-01-27T18:33:29.621584Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Partido: Getafe vs Alavés (1 - 0)\n", "Partido: Real Madrid vs Sevilla (0 - 0)\n", "Partido: Huesca vs Atlético de Madrid (0 - 1)\n", "Partido: Celta vs 
Valencia (1 - 0)\n", "Partido: Betis vs Girona (1 - 2)\n", "Partido: Villarreal vs Athletic Bilbao (0 - 1)\n", "Partido: Rayo Vallecano vs Real Sociedad (2 - 1)\n", "Partido: Levante vs Valladolid (1 - 0)\n", "Partido: Barcelona vs Leganés (1 - 0)\n", "Partido: Eibar vs Espanyol (1 - 0)\n" ] } ], "source": [ "data = []\n", "equipo1=\"\"\n", "equipo2=\"\"\n", "resultado=\"\"\n", "nroFila=0\n", "for fila in tabla.find_all(\"tr\"):\n", " if nroFila>0:\n", " nroCelda=0\n", " capturar=False\n", " for celda in fila.find_all('td'):\n", " if nroCelda==1 and celda.text=='Fin.':\n", " capturar=True\n", " if capturar and nroCelda==2:\n", " equipo1=celda.text\n", " if capturar and nroCelda==4:\n", " equipo2=celda.text\n", " if capturar and nroCelda==5:\n", " resultado=celda.text\n", " print(\"Partido:\", equipo1,'vs',equipo2,resultado)\n", " data.append((equipo1,equipo2,resultado))\n", " nroCelda=nroCelda+1\n", " nroFila=nroFila+1" ] }, { "cell_type": "code", "execution_count": 49, "metadata": { "ExecuteTime": { "end_time": "2019-01-27T18:34:52.245736Z", "start_time": "2019-01-27T18:34:52.233185Z" } }, "outputs": [], "source": [ "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n", "with open('partidos_liga_primera.csv', 'a') as csv_file:\n", " writer = csv.writer(csv_file)\n", " for equipo1, equipo2,resultado in data:\n", " writer.writerow([equipo1, equipo2, resultado,datetime.now()])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Otros ejemplos de WebScraping" ] }, { "cell_type": "code", "execution_count": 93, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:37.428233Z", "start_time": "2019-01-28T18:12:37.420625Z" } }, "outputs": [], "source": [ "#supongamos tenemos el siguiente HTML\n", "pagina_web = \"\" \\\n", " + \"\" \\\n", " + \"\" \\\n", " + \"
\" \\\n", " + \"
\" \\\n", " + \"Bienvenido a mi web\" \\\n", " + \"
\" \\\n", " + \"
\" \\\n", " + \"\" \\\n", " + \"\"" ] }, { "cell_type": "code", "execution_count": 94, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:38.100550Z", "start_time": "2019-01-28T18:12:38.096154Z" } }, "outputs": [], "source": [ "soup = BeautifulSoup(pagina_web, \"lxml\")" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:38.677469Z", "start_time": "2019-01-28T18:12:38.670578Z" } }, "outputs": [ { "data": { "text/plain": [ "'Bienvenido a mi web'" ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Obtener por ID:\n", "elTexto = soup.find('div', attrs={'id': '123'}).getText()\n", "print(elTexto)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:39.718413Z", "start_time": "2019-01-28T18:12:39.712544Z" } }, "outputs": [ { "data": { "text/plain": [ "'Bienvenido a mi web'" ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Obtener por Clase CSS:\n", "elTexto = soup.find('div', attrs={'class': 'verde'}).getText()\n", "print(elTexto)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:40.673362Z", "start_time": "2019-01-28T18:12:40.667391Z" } }, "outputs": [ { "data": { "text/plain": [ "'Bienvenido a mi web'" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Obtener dentro de otra etiqueta anidado:\n", "elTexto = next(soup.div.children).getText() #con next obtiene primer \"hijo\"\n", "print(elTexto)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Obtener items de un listado" ] }, { "cell_type": "code", "execution_count": 98, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:42.573969Z", "start_time": "2019-01-28T18:12:42.567849Z" } }, "outputs": [], "source": [ "#supongamos tenemos el siguiente HTML\n", "pagina_web = \"\" 
\\\n", " + \"\" \\\n", " + \"\" \\\n", " + \"
\" \\\n", " + \"\" \\\n", " + \"
\" \\\n", " + \"\" \\\n", " + \"\"" ] }, { "cell_type": "code", "execution_count": 99, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:43.383058Z", "start_time": "2019-01-28T18:12:43.378520Z" } }, "outputs": [], "source": [ "soup = BeautifulSoup(pagina_web, \"lxml\")" ] }, { "cell_type": "code", "execution_count": 100, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:44.100260Z", "start_time": "2019-01-28T18:12:44.094916Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Perro\n", "Gato\n", "Tortuga\n" ] } ], "source": [ "for child in soup.ul.children:\n", " print(child.getText())" ] }, { "cell_type": "code", "execution_count": 101, "metadata": { "ExecuteTime": { "end_time": "2019-01-28T18:12:44.972925Z", "start_time": "2019-01-28T18:12:44.967460Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Perro\n", "Gato\n", "Tortuga\n" ] } ], "source": [ "items = soup.find_all('li')\n", "for item in items:\n", " print(item.getText())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Obtener Enlaces" ] }, { "cell_type": "code", "execution_count": 102, "metadata": { "ExecuteTime": { "end_time": "2019-01-29T09:25:01.595660Z", "start_time": "2019-01-29T09:25:01.576485Z" } }, "outputs": [], "source": [ "#supongamos tenemos el siguiente HTML\n", "pagina_web = \"\" \\\n", " + \"\" \\\n", " + \"\" \\\n", " + \"
\" \\\n", " + \"\" \\\n", " + \"
\" \\\n", " + \"\" \\\n", " + \"\"" ] }, { "cell_type": "code", "execution_count": 103, "metadata": { "ExecuteTime": { "end_time": "2019-01-29T09:25:04.281499Z", "start_time": "2019-01-29T09:25:04.250643Z" } }, "outputs": [], "source": [ "soup = BeautifulSoup(pagina_web, \"lxml\")" ] }, { "cell_type": "code", "execution_count": 105, "metadata": { "ExecuteTime": { "end_time": "2019-01-29T09:25:43.006626Z", "start_time": "2019-01-29T09:25:42.996092Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "http://www.google.com\n", "http://www.yahoo.com\n", "http://www.bing.com\n" ] } ], "source": [ "items = soup.find_all('a')\n", "for item in items:\n", " print(item['href'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Ejemplo completo Extraer enlaces" ] }, { "cell_type": "code", "execution_count": 107, "metadata": { "ExecuteTime": { "end_time": "2019-01-29T09:37:57.684739Z", "start_time": "2019-01-29T09:37:57.069538Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "https://www.lifeder.com/personajes-historicos/\n", "https://www.lifeder.com/frases-de-albert-einstein/\n", "https://www.lifeder.com/aportaciones-isaac-newton/\n", "https://www.lifeder.com/frases-de-isaac-newton/\n", "https://www.lifeder.com/frases-de-stephen-hawking/\n", "https://www.lifeder.com/mujeres-famosas-historia/\n", "https://www.lifeder.com/aportaciones-galileo-galilei/\n", "https://www.lifeder.com/frases-de-galileo-galilei/\n", "https://www.lifeder.com/frases-de-charles-darwin/\n", "https://www.lifeder.com/aportaciones-kepler/\n", "https://www.lifeder.com/frases-de-thomas-edison/\n", "https://www.lifeder.com/frases-de-arquimedes/\n", "https://www.lifeder.com/frases-de-leonardo-da-vinci/\n", "https://www.lifeder.com/aportaciones-john-dalton/\n", "https://www.lifeder.com/daltonismo/\n", "https://www.lifeder.com/frases-de-rene-descartes/\n" ] } ], "source": [ "url_page = 'https://www.lifeder.com/cientificos-famosos/'\n", "page = 
requests.get(url_page).text \n", "soup = BeautifulSoup(page, \"lxml\")\n", "contenido = soup.find('div', attrs={'class': 'td-post-content'})\n", "items = contenido.find_all('a')\n", "for item in items:\n", " print(item['href'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "El artículo completo en www.aprendemachinelearning.com" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.5" } }, "nbformat": 4, "nbformat_minor": 2 }