{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Ejemplo de WebScraping con Python\n",
    "## Obtener Ibex35 bolsa de Madrid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T17:53:23.662759Z",
     "start_time": "2019-01-27T17:53:23.651938Z"
    }
   },
   "outputs": [],
   "source": [
    "# import libraries\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import csv\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T17:38:57.659158Z",
     "start_time": "2019-01-27T17:38:57.655785Z"
    }
   },
   "outputs": [],
   "source": [
    "# indicar la ruta\n",
    "url_page = 'http://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T17:40:11.127856Z",
     "start_time": "2019-01-27T17:40:10.642601Z"
    }
   },
   "outputs": [],
   "source": [
    "# tarda 480 milisegundos\n",
    "page = requests.get(url_page).text \n",
    "soup = BeautifulSoup(page, \"lxml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T17:42:00.515429Z",
     "start_time": "2019-01-27T17:42:00.504178Z"
    }
   },
   "outputs": [],
   "source": [
    "# Obtenemos la tabla por un ID específico\n",
    "tabla = soup.find('table', attrs={'id': 'ctl00_Contenido_tblÍndices'})\n",
    "tabla"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T17:55:08.443093Z",
     "start_time": "2019-01-27T17:55:08.436365Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Indice: IBEX 35®\n",
      "Valor: 9.185,20\n"
     ]
    }
   ],
   "source": [
    "name=\"\"\n",
    "price=\"\"\n",
    "nroFila=0\n",
    "for fila in tabla.find_all(\"tr\"):\n",
    "    if nroFila==1:\n",
    "        nroCelda=0\n",
    "        for celda in fila.find_all('td'):\n",
    "            if nroCelda==0:\n",
    "                name=celda.text\n",
    "                print(\"Indice:\", name)\n",
    "            if nroCelda==2:\n",
    "                price=celda.text\n",
    "                print(\"Valor:\", price)\n",
    "            nroCelda=nroCelda+1\n",
    "    nroFila=nroFila+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T17:56:28.279786Z",
     "start_time": "2019-01-27T17:56:28.271938Z"
    }
   },
   "outputs": [],
   "source": [
    "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n",
    "with open('bolsa_ibex35.csv', 'a') as csv_file:\n",
    "    writer = csv.writer(csv_file)\n",
    "    writer.writerow([name, price, datetime.now()])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Obtener resultados de Futbol\n",
    "## Ejemplo Liga BBVA - España - Primera -  desde marcadores.com"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T18:26:04.881325Z",
     "start_time": "2019-01-27T18:26:04.877803Z"
    }
   },
   "outputs": [],
   "source": [
    "url_page = 'https://www.marcadores.com/futbol/espana/liga-bbva/?competitionRoundId=486942' # jornada 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T18:26:07.064454Z",
     "start_time": "2019-01-27T18:26:05.493551Z"
    }
   },
   "outputs": [],
   "source": [
    "# tarda 1500 milisegundos\n",
    "page = requests.get(url_page).text \n",
    "soup = BeautifulSoup(page, \"lxml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T18:31:00.402982Z",
     "start_time": "2019-01-27T18:31:00.299063Z"
    }
   },
   "outputs": [],
   "source": [
    "# Obtenemos la tabla por un ID específico\n",
    "tabla = soup.find('table', attrs={'class': 'matches'})\n",
    "tabla"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T18:33:29.634584Z",
     "start_time": "2019-01-27T18:33:29.621584Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Partido: Getafe vs Alavés (1 - 0)\n",
      "Partido: Real Madrid vs Sevilla (0 - 0)\n",
      "Partido: Huesca vs Atlético de Madrid (0 - 1)\n",
      "Partido: Celta vs Valencia (1 - 0)\n",
      "Partido: Betis vs Girona (1 - 2)\n",
      "Partido: Villarreal vs Athletic Bilbao (0 - 1)\n",
      "Partido: Rayo Vallecano vs Real Sociedad (2 - 1)\n",
      "Partido: Levante vs Valladolid (1 - 0)\n",
      "Partido: Barcelona vs Leganés (1 - 0)\n",
      "Partido: Eibar vs Espanyol (1 - 0)\n"
     ]
    }
   ],
   "source": [
    "data = []\n",
    "equipo1=\"\"\n",
    "equipo2=\"\"\n",
    "resultado=\"\"\n",
    "nroFila=0\n",
    "for fila in tabla.find_all(\"tr\"):\n",
    "    if nroFila>0:\n",
    "        nroCelda=0\n",
    "        capturar=False\n",
    "        for celda in fila.find_all('td'):\n",
    "            if nroCelda==1 and celda.text=='Fin.':\n",
    "                capturar=True\n",
    "            if capturar and nroCelda==2:\n",
    "                equipo1=celda.text\n",
    "            if capturar and nroCelda==4:\n",
    "                equipo2=celda.text\n",
    "            if capturar and nroCelda==5:\n",
    "                resultado=celda.text\n",
    "                print(\"Partido:\", equipo1,'vs',equipo2,resultado)\n",
    "                data.append((equipo1,equipo2,resultado))\n",
    "            nroCelda=nroCelda+1\n",
    "    nroFila=nroFila+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-27T18:34:52.245736Z",
     "start_time": "2019-01-27T18:34:52.233185Z"
    }
   },
   "outputs": [],
   "source": [
    "# Abrimos el csv con append para que pueda agregar contenidos al final del archivo\n",
    "with open('partidos_liga_primera.csv', 'a') as csv_file:\n",
    "    writer = csv.writer(csv_file)\n",
    "    for equipo1, equipo2,resultado in data:\n",
    "        writer.writerow([equipo1, equipo2, resultado,datetime.now()])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Otros ejemplos de WebScaping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:37.428233Z",
     "start_time": "2019-01-28T18:12:37.420625Z"
    }
   },
   "outputs": [],
   "source": [
    "#supongamos tenemos el siguiente HTML\n",
    "pagina_web = \"<html>\" \\\n",
    "            + \"<head></head>\" \\\n",
    "            + \"<body>\" \\\n",
    "                + \"<div class='contenedor'>\" \\\n",
    "                    + \"<div id='123' name='bloque_bienvenida' class='verde'>\" \\\n",
    "                        + \"Bienvenido a mi web\" \\\n",
    "                    + \"</div>\" \\\n",
    "                + \"</div>\" \\\n",
    "            + \"</body>\" \\\n",
    "            + \"</html>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:38.100550Z",
     "start_time": "2019-01-28T18:12:38.096154Z"
    }
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(pagina_web, \"lxml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:38.677469Z",
     "start_time": "2019-01-28T18:12:38.670578Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Bienvenido a mi web'"
      ]
     },
     "execution_count": 95,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Obtener por ID:\n",
    "elTexto = soup.find('div', attrs={'id': '123'}).getText()\n",
    "print(elTexto)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:39.718413Z",
     "start_time": "2019-01-28T18:12:39.712544Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Bienvenido a mi web'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Obtener por Clase CSS:\n",
    "elTexto = soup.find('div', attrs={'class': 'verde'}).getText()\n",
    "print(elTexto)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:40.673362Z",
     "start_time": "2019-01-28T18:12:40.667391Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Bienvenido a mi web'"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#Obtener dentro de otra etiqueta anidado:\n",
    "elTexto = next(soup.div.children).getText() #con next obtiene primer \"hijo\"\n",
    "print(elTexto)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtener items de un listado"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:42.573969Z",
     "start_time": "2019-01-28T18:12:42.567849Z"
    }
   },
   "outputs": [],
   "source": [
    "#supongamos tenemos el siguiente HTML\n",
    "pagina_web = \"<html>\" \\\n",
    "    + \"<head></head>\" \\\n",
    "    + \"<body>\" \\\n",
    "        + \"<div class='contenedor'>\" \\\n",
    "            + \"<ul>\" \\\n",
    "                + \"<li>Perro</li>\" \\\n",
    "                + \"<li>Gato</li>\" \\\n",
    "                + \"<li>Tortuga</li>\" \\\n",
    "            + \"</ul>\" \\\n",
    "        + \"</div>\" \\\n",
    "    + \"</body>\" \\\n",
    "    + \"</html>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:43.383058Z",
     "start_time": "2019-01-28T18:12:43.378520Z"
    }
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(pagina_web, \"lxml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:44.100260Z",
     "start_time": "2019-01-28T18:12:44.094916Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Perro\n",
      "Gato\n",
      "Tortuga\n"
     ]
    }
   ],
   "source": [
    "for child in soup.ul.children:\n",
    "    print(child.getText())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-28T18:12:44.972925Z",
     "start_time": "2019-01-28T18:12:44.967460Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Perro\n",
      "Gato\n",
      "Tortuga\n"
     ]
    }
   ],
   "source": [
    "items = soup.find_all('li')\n",
    "for item in items:\n",
    "    print(item.getText())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Obtener Enlaces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-29T09:25:01.595660Z",
     "start_time": "2019-01-29T09:25:01.576485Z"
    }
   },
   "outputs": [],
   "source": [
    "#supongamos tenemos el siguiente HTML\n",
    "pagina_web = \"<html>\" \\\n",
    "    + \"<head></head>\" \\\n",
    "    + \"<body>\" \\\n",
    "        + \"<div class='contenedor'>\" \\\n",
    "            + \"<ul>\" \\\n",
    "                + \"<li><a href='http://www.google.com'>Google</a></li>\" \\\n",
    "                + \"<li><a href='http://www.yahoo.com'>Yahoo</a></li>\" \\\n",
    "                + \"<li><a href='http://www.bing.com'>Bing</a></li>\" \\\n",
    "            + \"</ul>\" \\\n",
    "        + \"</div>\" \\\n",
    "    + \"</body>\" \\\n",
    "    + \"</html>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-29T09:25:04.281499Z",
     "start_time": "2019-01-29T09:25:04.250643Z"
    }
   },
   "outputs": [],
   "source": [
    "soup = BeautifulSoup(pagina_web, \"lxml\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-29T09:25:43.006626Z",
     "start_time": "2019-01-29T09:25:42.996092Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "http://www.google.com\n",
      "http://www.yahoo.com\n",
      "http://www.bing.com\n"
     ]
    }
   ],
   "source": [
    "items = soup.find_all('a')\n",
    "for item in items:\n",
    "    print(item['href'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ejemplo completo Extraer enlaces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-01-29T09:37:57.684739Z",
     "start_time": "2019-01-29T09:37:57.069538Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://www.lifeder.com/personajes-historicos/\n",
      "https://www.lifeder.com/frases-de-albert-einstein/\n",
      "https://www.lifeder.com/aportaciones-isaac-newton/\n",
      "https://www.lifeder.com/frases-de-isaac-newton/\n",
      "https://www.lifeder.com/frases-de-stephen-hawking/\n",
      "https://www.lifeder.com/mujeres-famosas-historia/\n",
      "https://www.lifeder.com/aportaciones-galileo-galilei/\n",
      "https://www.lifeder.com/frases-de-galileo-galilei/\n",
      "https://www.lifeder.com/frases-de-charles-darwin/\n",
      "https://www.lifeder.com/aportaciones-kepler/\n",
      "https://www.lifeder.com/frases-de-thomas-edison/\n",
      "https://www.lifeder.com/frases-de-arquimedes/\n",
      "https://www.lifeder.com/frases-de-leonardo-da-vinci/\n",
      "https://www.lifeder.com/aportaciones-john-dalton/\n",
      "https://www.lifeder.com/daltonismo/\n",
      "https://www.lifeder.com/frases-de-rene-descartes/\n"
     ]
    }
   ],
   "source": [
    "url_page = 'https://www.lifeder.com/cientificos-famosos/'\n",
    "page = requests.get(url_page).text \n",
    "soup = BeautifulSoup(page, \"lxml\")\n",
    "contenido = soup.find('div', attrs={'class': 'td-post-content'})\n",
    "items = contenido.find_all('a')\n",
    "for item in items:\n",
    "    print(item['href'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "El artículo completo en www.aprendemachinelearning.com"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}