{ "cells": [ { "cell_type": "markdown", "id": "63d1c385", "metadata": {}, "source": [ "# Importing Necessary Libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "a9d156f7", "metadata": {}, "outputs": [], "source": [ "# Every dependency used by this notebook, gathered in one place:\n", "# standard library first, then third-party packages.\n", "import re\n", "\n", "import pandas as pd\n", "import requests\n", "from bs4 import BeautifulSoup as bs" ] }, { "cell_type": "markdown", "id": "45cbc274", "metadata": {}, "source": [ "## Loading our First Page" ] }, { "cell_type": "code", "execution_count": 5, "id": "07c70558", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<html>\n", " <head>\n", " <title>\n", " HTML Example\n", " </title>\n", " </head>\n", " <body>\n", " <div align=\"middle\">\n", " <h1>\n", " HTML Webpage\n", " </h1>\n", " <p>\n", " Link to more interesting example:\n", " <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">\n", " keithgalli.github.io/web-scraping/webpage.html\n", " </a>\n", " </p>\n", " </div>\n", " <h2>\n", " A Header\n", " </h2>\n", " <p>\n", " <i>\n", " Some italicized text\n", " </i>\n", " </p>\n", " <h2>\n", " Another header\n", " </h2>\n", " <p id=\"paragraph-id\">\n", " <b>\n", " Some bold text\n", " </b>\n", " </p>\n", " </body>\n", "</html>\n", "\n" ] } ], "source": [ "# Load the webpage content\n", "r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)\n", "r.raise_for_status()  # fail loudly instead of parsing an HTTP error page\n", "\n", "# Convert to a BeautifulSoup object. The parser is named explicitly so the\n", "# parse tree does not depend on which optional parsers are installed.\n", "soup = bs(r.content, 'html.parser')\n", "\n", "# Print out our html\n", "print(soup.prettify())" ] }, { "cell_type": "markdown", "id": "8b5ee527", "metadata": {}, "source": [ "## Start using BeautifulSoup to Scrape" ] }, { "cell_type": "code", "execution_count": 8, "id": "d00bfd42", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<h2>A Header</h2>" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find() returns only the first matching element\n", "first_header = soup.find('h2')\n", "first_header" ] }, { "cell_type": "code", "execution_count": 9, "id": "719ef46f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<h2>A 
Header</h2>, <h2>Another header</h2>]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find_all() returns every matching element as a list\n", "h2_tags = soup.find_all('h2')\n", "h2_tags" ] }, { "cell_type": "code", "execution_count": 10, "id": "e3e42496", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<h1>HTML Webpage</h1>" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# A list of tag names is also accepted; the first match in the document wins\n", "first_heading = soup.find([\"h1\", \"h2\"])\n", "first_heading" ] }, { "cell_type": "code", "execution_count": 11, "id": "75836624", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<h1>HTML Webpage</h1>" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same lookup with the tag names listed in the opposite order\n", "first_heading = soup.find([\"h2\", \"h1\"])\n", "first_heading" ] }, { "cell_type": "code", "execution_count": 12, "id": "217c5c4a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All h1 and h2 elements together\n", "all_headings = soup.find_all([\"h1\", \"h2\"])\n", "all_headings" ] }, { "cell_type": "code", "execution_count": 16, "id": "8b92bac4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<p id=\"paragraph-id\"><b>Some bold text</b></p>]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find/find_all can filter on attributes as well as on the tag name\n", "wanted_attrs = {'id': 'paragraph-id'}\n", "paragraphs_by_id = soup.find_all('p', attrs=wanted_attrs)\n", "paragraphs_by_id" ] }, { "cell_type": "code", "execution_count": 19, "id": "afc21ba4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<h1>HTML Webpage</h1>" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find/find_all calls can be nested: body -> div -> h1\n", "nested_h1 = soup.find('body').find('div').find('h1')\n", "nested_h1" ] }, { "cell_type": 
"code", "execution_count": 21, "id": "a14114bd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<p><i>Some italicized text</i></p>,\n", " <p id=\"paragraph-id\"><b>Some bold text</b></p>]" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The string argument filters elements by their text; a compiled regex works\n", "\n", "import re\n", "\n", "some_pattern = re.compile('Some')\n", "paragraphs_with_some = soup.find_all('p', string=some_pattern)\n", "paragraphs_with_some" ] }, { "cell_type": "code", "execution_count": 22, "id": "27fd9a5f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<h2>A Header</h2>, <h2>Another header</h2>]" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Accept both capitalisations of the word\n", "header_pattern = re.compile('(H|h)eader')\n", "h2_with_header = soup.find_all('h2', string=header_pattern)\n", "h2_with_header" ] }, { "cell_type": "markdown", "id": "3b0969d2", "metadata": {}, "source": [ "## Select (CSS Selector)" ] }, { "cell_type": "code", "execution_count": 24, "id": "6e739102", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Descendant selector: <p> elements located inside a <div>\n", "div_paragraphs = soup.select('div p')\n", "div_paragraphs" ] }, { "cell_type": "code", "execution_count": 25, "id": "ed6a8300", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<p><i>Some italicized text</i></p>,\n", " <p id=\"paragraph-id\"><b>Some bold text</b></p>]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sibling selector: <p> elements that come after an <h2> at the same level\n", "paragraphs_after_h2 = soup.select('h2 ~ p')\n", "paragraphs_after_h2" ] }, { "cell_type": "code", "execution_count": 28, "id": "074f3d9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<b>Some bold text</b>]" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# <b> elements inside the <p> whose id is paragraph-id\n", "bold_in_paragraph = soup.select('p#paragraph-id b')\n", "bold_in_paragraph" ] }, { "cell_type": "code", 
"execution_count": 40, "id": "f26127d4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[<p><i>Some italicized text</i></p>, <p id=\"paragraph-id\"><b>Some bold text</b></p>]\n" ] } ], "source": [ "# Child selector: only <p> elements sitting directly under <body>\n", "body_paragraphs = soup.select('body > p')\n", "print(body_paragraphs)" ] }, { "cell_type": "code", "execution_count": 41, "id": "a7083144", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[<i>Some italicized text</i>]\n", "[]\n" ] } ], "source": [ "# select() can be called on each result to search inside that element only\n", "for paragraph in body_paragraphs:\n", "    italic_tags = paragraph.select(\"i\")\n", "    print(italic_tags)" ] }, { "cell_type": "code", "execution_count": 35, "id": "8176ba6b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<div align=\"middle\">\n", " <h1>HTML Webpage</h1>\n", " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n", " </div>]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Attribute selector: elements whose align attribute is \"middle\"\n", "soup.select(\"[align=middle]\")" ] }, { "cell_type": "markdown", "id": "4c641def", "metadata": {}, "source": [ "## Get different properties of the HTML" ] }, { "cell_type": "markdown", "id": "40e797ea", "metadata": {}, "source": [ "### Getting Strings from HTML" ] }, { "cell_type": "code", "execution_count": 45, "id": "6aa807ce", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'A Header'" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# .string gives the text of an element that holds a single piece of text\n", "first_h2 = soup.find('h2')\n", "first_h2.string" ] }, { "cell_type": "code", "execution_count": 49, "id": "48a94d75", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "HTML Webpage\n", "Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html\n", "\n" ] } ], "source": [ "# With several child elements, get_text() collects the text of all of them\n", "first_div = soup.find('div')\n", "print(first_div.get_text())" ] }, { "cell_type": 
"markdown", "id": "5ce848c7", "metadata": {}, "source": [ "### Getting Links from HTML" ] }, { "cell_type": "code", "execution_count": 50, "id": "205c4bdf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://keithgalli.github.io/web-scraping/webpage.html'" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Attributes of an element are read with dictionary-style indexing\n", "first_anchor = soup.find('a')\n", "first_anchor['href']" ] }, { "cell_type": "markdown", "id": "d1bac422", "metadata": {}, "source": [ "### Subsetting to get what you want from HTML" ] }, { "cell_type": "code", "execution_count": 51, "id": "39f5689d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'paragraph-id'" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# select() returns a list, so index into it before reading an attribute\n", "id_matches = soup.select(\"p#paragraph-id\")\n", "first_match = id_matches[0]\n", "first_match['id']" ] }, { "cell_type": "markdown", "id": "b46c858c", "metadata": {}, "source": [ "## Code Navigation" ] }, { "cell_type": "code", "execution_count": 61, "id": "64b2fb25", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<body>\n", " <div align=\"middle\">\n", " <h1>HTML Webpage</h1>\n", " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n", " </div>\n", " <h2>A Header</h2>\n", " <p><i>Some italicized text</i></p>\n", " <h2>Another header</h2>\n", " <p id=\"paragraph-id\"><b>Some bold text</b></p>\n", " </body>,\n", " <html>\n", " <head>\n", " <title>HTML Example</title>\n", " </head>\n", " <body>\n", " <div align=\"middle\">\n", " <h1>HTML Webpage</h1>\n", " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n", " </div>\n", " <h2>A Header</h2>\n", " <p><i>Some italicized text</i></p>\n", " <h2>Another header</h2>\n", " <p id=\"paragraph-id\"><b>Some bold text</b></p>\n", 
" </body>\n", " </html>,\n", " <html>\n", " <head>\n", " <title>HTML Example</title>\n", " </head>\n", " <body>\n", " <div align=\"middle\">\n", " <h1>HTML Webpage</h1>\n", " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n", " </div>\n", " <h2>A Header</h2>\n", " <p><i>Some italicized text</i></p>\n", " <h2>Another header</h2>\n", " <p id=\"paragraph-id\"><b>Some bold text</b></p>\n", " </body>\n", " </html>]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Know the terms: Parent, Sibling, Child\n", "# find_parents() lists every enclosing element, closest one first\n", "first_div = soup.body.find(\"div\")\n", "first_div.find_parents()" ] }, { "cell_type": "code", "execution_count": 62, "id": "fbaf64b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<body>\n", "<div align=\"middle\">\n", "<h1>HTML Webpage</h1>\n", "<p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n", "</div>\n", "<h2>A Header</h2>\n", "<p><i>Some italicized text</i></p>\n", "<h2>Another header</h2>\n", "<p id=\"paragraph-id\"><b>Some bold text</b></p>\n", "</body>" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find_parent() gives only the closest enclosing element\n", "first_div = soup.body.find(\"div\")\n", "first_div.find_parent()" ] }, { "cell_type": "code", "execution_count": 63, "id": "c4627f9d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# No element precedes the div inside <body>, so the list is empty\n", "first_div = soup.body.find(\"div\")\n", "first_div.find_previous_siblings()" ] }, { "cell_type": "code", "execution_count": 64, "id": "f6c923d1", "metadata": {}, "outputs": [], "source": [ "# Singular form: evaluates to None here, so the cell shows no output\n", "first_div = soup.body.find(\"div\")\n", "first_div.find_previous_sibling()" ] }, { "cell_type": "code", "execution_count": 59, "id": "4ce37173", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[<h2>A Header</h2>,\n", " <p><i>Some italicized text</i></p>,\n", " 
<h2>Another header</h2>,\n", " <p id=\"paragraph-id\"><b>Some bold text</b></p>]" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find_next_siblings() lists the elements that follow the div at the same level\n", "first_div = soup.body.find(\"div\")\n", "first_div.find_next_siblings()" ] }, { "cell_type": "code", "execution_count": 60, "id": "6135bb51", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "<h2>A Header</h2>" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Singular form: only the element immediately following the div\n", "first_div = soup.body.find(\"div\")\n", "first_div.find_next_sibling()" ] }, { "cell_type": "markdown", "id": "0b1aea48", "metadata": {}, "source": [ "# Exercises" ] }, { "cell_type": "markdown", "id": "811b3fba", "metadata": {}, "source": [ "## Loading the webpage" ] }, { "cell_type": "code", "execution_count": 2, "id": "ae0a6a30", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "<html>\n", " <head>\n", " <title>\n", " Keith Galli's Page\n", " </title>\n", " <style>\n", " table {\n", " border-collapse: collapse;\n", " }\n", " th {\n", " padding:5px;\n", " }\n", " td {\n", " border: 1px solid #ddd;\n", " padding: 5px;\n", " }\n", " tr:nth-child(even) {\n", " background-color: #f2f2f2;\n", " }\n", " th {\n", " padding-top: 12px;\n", " padding-bottom: 12px;\n", " text-align: left;\n", " background-color: #add8e6;\n", " color: black;\n", " }\n", " .block {\n", " width: 100px;\n", " /*float: left;*/\n", " display: inline-block;\n", " zoom: 1;\n", " }\n", " .column {\n", " float: left;\n", " height: 200px;\n", " /*width: 33.33%;*/\n", " padding: 5px;\n", " }\n", "\n", " .row::after {\n", " content: \"\";\n", " clear: both;\n", " display: table;\n", " }\n", " </style>\n", " </head>\n", " <body>\n", " <h1>\n", " Welcome to my page!\n", " </h1>\n", " <img src=\"./images/selfie1.jpg\" width=\"300px\"/>\n", " <h2>\n", " About me\n", " </h2>\n", " <p>\n", " Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!\n", " </p>\n", " <p>\n", " Here is 
a link to my channel:\n", " <a href=\"https://www.youtube.com/kgmit\">\n", " youtube.com/kgmit\n", " </a>\n", " </p>\n", " <p>\n", " I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.\n", " </p>\n", " <h3>\n", " Hobbies\n", " </h3>\n", " <p>\n", " Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey & table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are\n", " <i>\n", " Settlers of Catan\n", " </i>\n", " and\n", " <i>\n", " Othello\n", " </i>\n", " .\n", " </p>\n", " <h3>\n", " Fun Facts\n", " </h3>\n", " <ul class=\"fun-facts\">\n", " <li>\n", " Owned my dream car in high school\n", " <a href=\"#footer\">\n", " <sup>\n", " 1\n", " </sup>\n", " </a>\n", " </li>\n", " <li>\n", " Middle name is Ronald\n", " </li>\n", " <li>\n", " Never had been on a plane until college\n", " </li>\n", " <li>\n", " Dunkin Donuts coffee is better than Starbucks\n", " </li>\n", " <li>\n", " A favorite book series of mine is\n", " <i>\n", " Ender's Game\n", " </i>\n", " </li>\n", " <li>\n", " Current video game of choice is\n", " <i>\n", " Rocket League\n", " </i>\n", " </li>\n", " <li>\n", " The band that I've seen the most times live is the\n", " <i>\n", " Zac Brown Band\n", " </i>\n", " </li>\n", " </ul>\n", " <h2>\n", " Social Media\n", " </h2>\n", " I encourage you to check out my content on all social media platforms\n", " <br/>\n", " <ul class=\"socials\">\n", " <li class=\"social instagram\">\n", " <b>\n", " Instagram:\n", " </b>\n", 
" <a href=\"https://www.instagram.com/keithgalli/\">\n", " https://www.instagram.com/keithgalli/\n", " </a>\n", " </li>\n", " <li class=\"social twitter\">\n", " <b>\n", " Twitter:\n", " </b>\n", " <a href=\"https://twitter.com/keithgalli\">\n", " https://twitter.com/keithgalli\n", " </a>\n", " </li>\n", " <li class=\"social linkedin\">\n", " <b>\n", " LinkedIn:\n", " </b>\n", " <a href=\"https://www.linkedin.com/in/keithgalli/\">\n", " https://www.linkedin.com/in/keithgalli/\n", " </a>\n", " </li>\n", " <li class=\"social tiktok\">\n", " <b>\n", " TikTok:\n", " </b>\n", " <a href=\"https://www.tiktok.com/@keithgalli\">\n", " https://www.tiktok.com/@keithgalli\n", " </a>\n", " </li>\n", " </ul>\n", " <h2>\n", " Photos\n", " </h2>\n", " Here are a few photos from a trip to italy I took last year\n", " <div class=\"row\">\n", " <div class=\"column\">\n", " <img alt=\"Lake Como\" src=\"images/italy/lake_como.jpg\" style=\"height:100%\"/>\n", " </div>\n", " <div class=\"column\">\n", " <img alt=\"Pontevecchio, Florence\" src=\"images/italy/pontevecchio.jpg\" style=\"height:100%\"/>\n", " </div>\n", " <div class=\"column\">\n", " <img alt=\"Riomaggiore, Cinque de Terre\" src=\"images/italy/riomaggiore.jpg\" style=\"height:100%\"/>\n", " </div>\n", " </div>\n", " <div>\n", " </div>\n", " <h2>\n", " Table\n", " </h2>\n", " My MIT hockey stats :)\n", " <br/>\n", " <table class=\"hockey-stats\">\n", " <thead>\n", " <tr>\n", " <th class=\"season\" data-sort=\"\">\n", " S\n", " </th>\n", " <th class=\"team\" data-sort=\"team\">\n", " Team\n", " </th>\n", " <th class=\"league\" data-sort=\"league\">\n", " League\n", " </th>\n", " <th class=\"regular gp\" data-sort=\"gp\">\n", " GP\n", " </th>\n", " <th class=\"regular g\" data-sort=\"g\">\n", " G\n", " </th>\n", " <th class=\"regular a\" data-sort=\"a\">\n", " A\n", " </th>\n", " <th class=\"regular tp\" data-sort=\"tp\">\n", " TP\n", " </th>\n", " <th class=\"regular pim\" data-sort=\"pim\">\n", " PIM\n", " </th>\n", " <th 
class=\"regular pm\" data-sort=\"pm\">\n", " +/-\n", " </th>\n", " <th class=\"separator\">\n", " </th>\n", " <th class=\"postseason\">\n", " POST\n", " </th>\n", " <th class=\"postseason gp\" data-sort=\"playoffs-gp\">\n", " GP\n", " </th>\n", " <th class=\"postseason g\" data-sort=\"playoffs-g\">\n", " G\n", " </th>\n", " <th class=\"postseason a\" data-sort=\"playoffs-a\">\n", " A\n", " </th>\n", " <th class=\"postseason tp\" data-sort=\"playoffs-tp\">\n", " TP\n", " </th>\n", " <th class=\"postseason pim\" data-sort=\"playoffs-pim\">\n", " PIM\n", " </th>\n", " <th class=\"postseason pm\" data-sort=\"playoffs-pm\">\n", " +/-\n", " </th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr class=\"team-continent-NA\">\n", " <td class=\"season sorted\">\n", " 2014-15\n", " </td>\n", " <td class=\"team\">\n", " <i>\n", " <img src=\"images/flag.png\"/>\n", " </i>\n", " <span class=\"txt-blue\">\n", " <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats\">\n", " MIT (Mass. Inst. 
of Tech.)\n", " </a>\n", " </span>\n", " </td>\n", " <td class=\"league\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2014-2015\">\n", " ACHA II\n", " </a>\n", " </td>\n", " <td class=\"regular gp\">\n", " 17\n", " </td>\n", " <td class=\"regular g\">\n", " 3\n", " </td>\n", " <td class=\"regular a\">\n", " 9\n", " </td>\n", " <td class=\"regular tp\">\n", " 12\n", " </td>\n", " <td class=\"regular pim\">\n", " 20\n", " </td>\n", " <td class=\"regular pm\">\n", " </td>\n", " <td class=\"separator\">\n", " |\n", " </td>\n", " <td class=\"postseason\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2014-2015\">\n", " </a>\n", " </td>\n", " <td class=\"postseason gp\">\n", " </td>\n", " <td class=\"postseason g\">\n", " </td>\n", " <td class=\"postseason a\">\n", " </td>\n", " <td class=\"postseason tp\">\n", " </td>\n", " <td class=\"postseason pim\">\n", " </td>\n", " <td class=\"postseason pm\">\n", " </td>\n", " </tr>\n", " <tr class=\"team-continent-NA\">\n", " <td class=\"season sorted\">\n", " 2015-16\n", " </td>\n", " <td class=\"team\">\n", " <i>\n", " <img src=\"images/flag.png\"/>\n", " </i>\n", " <span class=\"txt-blue\">\n", " <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats\">\n", " MIT (Mass. Inst. 
of Tech.)\n", " </a>\n", " </span>\n", " </td>\n", " <td class=\"league\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2015-2016\">\n", " ACHA II\n", " </a>\n", " </td>\n", " <td class=\"regular gp\">\n", " 9\n", " </td>\n", " <td class=\"regular g\">\n", " 1\n", " </td>\n", " <td class=\"regular a\">\n", " 1\n", " </td>\n", " <td class=\"regular tp\">\n", " 2\n", " </td>\n", " <td class=\"regular pim\">\n", " 2\n", " </td>\n", " <td class=\"regular pm\">\n", " </td>\n", " <td class=\"separator\">\n", " |\n", " </td>\n", " <td class=\"postseason\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2015-2016\">\n", " </a>\n", " </td>\n", " <td class=\"postseason gp\">\n", " </td>\n", " <td class=\"postseason g\">\n", " </td>\n", " <td class=\"postseason a\">\n", " </td>\n", " <td class=\"postseason tp\">\n", " </td>\n", " <td class=\"postseason pim\">\n", " </td>\n", " <td class=\"postseason pm\">\n", " </td>\n", " </tr>\n", " <tr class=\"team-continent-NA\">\n", " <td class=\"season sorted\">\n", " 2016-17\n", " </td>\n", " <td class=\"team\">\n", " <i>\n", " <img src=\"images/flag.png\"/>\n", " </i>\n", " <span class=\"txt-blue\">\n", " <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats\">\n", " MIT (Mass. Inst. 
of Tech.)\n", " </a>\n", " </span>\n", " </td>\n", " <td class=\"league\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2016-2017\">\n", " ACHA II\n", " </a>\n", " </td>\n", " <td class=\"regular gp\">\n", " 12\n", " </td>\n", " <td class=\"regular g\">\n", " 5\n", " </td>\n", " <td class=\"regular a\">\n", " 5\n", " </td>\n", " <td class=\"regular tp\">\n", " 10\n", " </td>\n", " <td class=\"regular pim\">\n", " 8\n", " </td>\n", " <td class=\"regular pm\">\n", " 0\n", " </td>\n", " <td class=\"separator\">\n", " |\n", " </td>\n", " <td class=\"postseason\">\n", " </td>\n", " <td class=\"postseason gp\">\n", " </td>\n", " <td class=\"postseason g\">\n", " </td>\n", " <td class=\"postseason a\">\n", " </td>\n", " <td class=\"postseason tp\">\n", " </td>\n", " <td class=\"postseason pim\">\n", " </td>\n", " <td class=\"postseason pm\">\n", " </td>\n", " </tr>\n", " <tr class=\"team-continent-EU\">\n", " <td class=\"season sorted\">\n", " 2017-18\n", " </td>\n", " <td class=\"team\">\n", " Did not play\n", " </td>\n", " <td class=\"league\">\n", " <a href=\"https://www.eliteprospects.com/stats\">\n", " </a>\n", " </td>\n", " <td class=\"regular gp\">\n", " </td>\n", " <td class=\"regular g\">\n", " </td>\n", " <td class=\"regular a\">\n", " </td>\n", " <td class=\"regular tp\">\n", " </td>\n", " <td class=\"regular pim\">\n", " </td>\n", " <td class=\"regular pm\">\n", " </td>\n", " <td class=\"separator\">\n", " |\n", " </td>\n", " <td class=\"postseason\">\n", " <a href=\"https://www.eliteprospects.com/stats\">\n", " </a>\n", " </td>\n", " <td class=\"postseason gp\">\n", " </td>\n", " <td class=\"postseason g\">\n", " </td>\n", " <td class=\"postseason a\">\n", " </td>\n", " <td class=\"postseason tp\">\n", " </td>\n", " <td class=\"postseason pim\">\n", " </td>\n", " <td class=\"postseason pm\">\n", " </td>\n", " </tr>\n", " <tr class=\"team-continent-NA\">\n", " <td class=\"season sorted\">\n", " 2018-19\n", " </td>\n", " <td 
class=\"team\">\n", " <i>\n", " <img src=\"images/flag.png\"/>\n", " </i>\n", " <span class=\"txt-blue\">\n", " <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats\">\n", " MIT (Mass. Inst. of Tech.)\n", " </a>\n", " </span>\n", " </td>\n", " <td class=\"league\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-iii/stats/2018-2019\">\n", " ACHA III\n", " </a>\n", " </td>\n", " <td class=\"regular gp\">\n", " 8\n", " </td>\n", " <td class=\"regular g\">\n", " 5\n", " </td>\n", " <td class=\"regular a\">\n", " 10\n", " </td>\n", " <td class=\"regular tp\">\n", " 15\n", " </td>\n", " <td class=\"regular pim\">\n", " 8\n", " </td>\n", " <td class=\"regular pm\">\n", " </td>\n", " <td class=\"separator\">\n", " |\n", " </td>\n", " <td class=\"postseason\">\n", " <a href=\"https://www.eliteprospects.com/league/acha-iii/stats/2018-2019\">\n", " </a>\n", " </td>\n", " <td class=\"postseason gp\">\n", " </td>\n", " <td class=\"postseason g\">\n", " </td>\n", " <td class=\"postseason a\">\n", " </td>\n", " <td class=\"postseason tp\">\n", " </td>\n", " <td class=\"postseason pim\">\n", " </td>\n", " <td class=\"postseason pm\">\n", " </td>\n", " </tr>\n", " </tbody>\n", " </table>\n", " <h2>\n", " Mystery Message Challenge!\n", " </h2>\n", " <p>\n", " If you scrape the links below grabbing the <p> tag with id=\"secret-word\", you'll discover a secret message :)\n", " </p>\n", " <div width=\"50%\">\n", " <div align=\"left\" class=\"block\">\n", " <ul>\n", " <li>\n", " <a href=\"challenge/file_1.html\">\n", " File 1\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_2.html\">\n", " File 2\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_3.html\">\n", " File 3\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_4.html\">\n", " File 4\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_5.html\">\n", " File 5\n", " </a>\n", " </li>\n", " </ul>\n", " </div>\n", " <div 
align=\"center\" class=\"block\">\n", " <ul>\n", " <li>\n", " <a href=\"challenge/file_6.html\">\n", " File 6\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_7.html\">\n", " File 7\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_8.html\">\n", " File 8\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_9.html\">\n", " File 9\n", " </a>\n", " </li>\n", " <li>\n", " <a href=\"challenge/file_10.html\">\n", " File 10\n", " </a>\n", " </li>\n", " </ul>\n", " </div>\n", " </div>\n", " <h2>\n", " Footnotes\n", " </h2>\n", " <p id=\"footer\">\n", " 1. This was actually a minivan that I named Debora. Maybe not my dream car, but I loved her nonetheless.\n", " </p>\n", " </body>\n", "</html>\n" ] } ], "source": [ "# Load the webpage content\n", "r = requests.get(\"https://keithgalli.github.io/web-scraping/webpage.html\", timeout=10)\n", "r.raise_for_status()  # fail loudly instead of parsing an HTTP error page\n", "\n", "# Convert to a BeautifulSoup object. The parser is named explicitly so the\n", "# parse tree does not depend on which optional parsers are installed.\n", "wp = bs(r.content, \"html.parser\")\n", "\n", "# Print out our html\n", "print(wp.prettify())" ] }, { "cell_type": "markdown", "id": "40b80738", "metadata": {}, "source": [ "## Question 1: Grab all of the social links from the web page in 4 ways" ] }, { "cell_type": "markdown", "id": "192ba3e2", "metadata": {}, "source": [ "Link to the web page: https://keithgalli.github.io/web-scraping/webpage.html" ] }, { "cell_type": "markdown", "id": "9d71711d", "metadata": {}, "source": [ "### Method 1" ] }, { "cell_type": "code", "execution_count": 5, "id": "751d8c00", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['https://www.instagram.com/keithgalli/',\n", " 'https://twitter.com/keithgalli',\n", " 'https://www.linkedin.com/in/keithgalli/',\n", " 'https://www.tiktok.com/@keithgalli']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Anchors nested inside the <ul class=\"socials\"> list\n", "links = wp.select('ul.socials a')\n", "actual_links = [link['href'] for link in links]\n", "actual_links" ] }, { "cell_type": "markdown", "id": "31499b56", "metadata": {}, "source": [ "### Method 2" ] 
}, { "cell_type": "code", "execution_count": 19, "id": "0360ec41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['https://www.instagram.com/keithgalli/',\n", " 'https://twitter.com/keithgalli',\n", " 'https://www.linkedin.com/in/keithgalli/',\n", " 'https://www.tiktok.com/@keithgalli']" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# find() hands back a single element rather than a list, so the anchors are\n", "# collected from it in a second step with find_all()\n", "socials_list = wp.find('ul', attrs={'class': 'socials'})\n", "social_anchors = socials_list.find_all(\"a\")\n", "social_urls = [anchor['href'] for anchor in social_anchors]\n", "social_urls" ] }, { "cell_type": "markdown", "id": "f1add700", "metadata": {}, "source": [ "### Method 3" ] }, { "cell_type": "code", "execution_count": 22, "id": "edcb5393", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['https://www.instagram.com/keithgalli/',\n", " 'https://twitter.com/keithgalli',\n", " 'https://www.linkedin.com/in/keithgalli/',\n", " 'https://www.tiktok.com/@keithgalli']" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Anchors inside list items carrying the \"social\" class\n", "social_anchors = wp.select(\"li.social a\")\n", "social_urls = [anchor['href'] for anchor in social_anchors]\n", "social_urls" ] }, { "cell_type": "markdown", "id": "03cbef4a", "metadata": {}, "source": [ "### Method 4" ] }, { "cell_type": "code", "execution_count": 31, "id": "31f34d16", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['https://www.instagram.com/keithgalli/',\n", " 'https://twitter.com/keithgalli',\n", " 'https://www.linkedin.com/in/keithgalli/',\n", " 'https://www.tiktok.com/@keithgalli']" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Same anchors, reached through the full path from <body>\n", "social_anchors = wp.select(\"body ul li.social a\")\n", "social_urls = [anchor['href'] for anchor in social_anchors]\n", "social_urls" ] }, { "cell_type": "markdown", "id": "eafed54f", "metadata": {}, "source": [ "## Scraping the MIT Hockey Stats table" ] }, { "cell_type": "code", "execution_count": 28, 
"id": "4771e50b", "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 48, "id": "3f3482c0", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>S</th>\n", " <th>Team</th>\n", " <th>League</th>\n", " <th>GP</th>\n", " <th>G</th>\n", " <th>A</th>\n", " <th>TP</th>\n", " <th>PIM</th>\n", " <th>+/-</th>\n", " <th></th>\n", " <th>POST</th>\n", " <th>GP</th>\n", " <th>G</th>\n", " <th>A</th>\n", " <th>TP</th>\n", " <th>PIM</th>\n", " <th>+/-</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>2014-15</td>\n", " <td>MIT (Mass. Inst. of Tech.)</td>\n", " <td>ACHA II</td>\n", " <td>17</td>\n", " <td>3</td>\n", " <td>9</td>\n", " <td>12</td>\n", " <td>20</td>\n", " <td></td>\n", " <td>|</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>2015-16</td>\n", " <td>MIT (Mass. Inst. of Tech.)</td>\n", " <td>ACHA II</td>\n", " <td>9</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>2</td>\n", " <td>2</td>\n", " <td></td>\n", " <td>|</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>2016-17</td>\n", " <td>MIT (Mass. Inst. 
of Tech.)</td>\n", " <td>ACHA II</td>\n", " <td>12</td>\n", " <td>5</td>\n", " <td>5</td>\n", " <td>10</td>\n", " <td>8</td>\n", " <td>0</td>\n", " <td>|</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>2017-18</td>\n", " <td>Did not play</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td>|</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>2018-19</td>\n", " <td>MIT (Mass. Inst. of Tech.)</td>\n", " <td>ACHA III</td>\n", " <td>8</td>\n", " <td>5</td>\n", " <td>10</td>\n", " <td>15</td>\n", " <td>8</td>\n", " <td></td>\n", " <td>|</td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " <td></td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " S Team League GP G A TP PIM +/- \\\n", "0 2014-15 MIT (Mass. Inst. of Tech.) ACHA II 17 3 9 12 20 | \n", "1 2015-16 MIT (Mass. Inst. of Tech.) ACHA II 9 1 1 2 2 | \n", "2 2016-17 MIT (Mass. Inst. of Tech.) ACHA II 12 5 5 10 8 0 | \n", "3 2017-18 Did not play | \n", "4 2018-19 MIT (Mass. Inst. of Tech.) 
ACHA III 8 5 10 15 8 | \n", "\n", " POST GP G A TP PIM +/- \n", "0 \n", "1 \n", "2 \n", "3 \n", "4 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Build a DataFrame from the hockey-stats table: <th> cells give the column\n", "# names, each <tr> in <tbody> gives one row of stripped cell texts\n", "table = wp.select('table.hockey-stats')[0]\n", "column_names = [header.string for header in table.find_all('th')]\n", "\n", "table_rows = table.find('tbody').find_all('tr')\n", "rows = [\n", "    [cell.get_text().strip() for cell in table_row.find_all('td')]\n", "    for table_row in table_rows\n", "]\n", "\n", "df = pd.DataFrame(rows, columns=column_names)\n", "df" ] }, { "cell_type": "markdown", "id": "7a4c3c04", "metadata": {}, "source": [ "## Grab all fun facts that contain the word 'is'" ] }, { "cell_type": "code", "execution_count": 61, "id": "3eb91400", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['Middle name is Ronald',\n", " 'Dunkin Donuts coffee is better than Starbucks',\n", " \"A favorite book series of mine is Ender's Game\",\n", " 'Current video game of choice is Rocket League',\n", " \"The band that I've seen the most times live is the Zac Brown Band\"]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "\n", "# Word-boundary anchors so only the standalone word 'is' matches,\n", "# not the letters inside words such as 'this' or 'list'\n", "is_word = re.compile(r'\\bis\\b')\n", "\n", "facts = wp.select('ul.fun-facts li')\n", "# First matching text node per fact (None when the fact has no match)\n", "matching_strings = [fact.find(string=is_word) for fact in facts]\n", "facts_with_is = [\n", "    text.find_parent().get_text() for text in matching_strings if text\n", "]\n", "facts_with_is" ] }, { "cell_type": "markdown", "id": "1041b000", "metadata": {}, "source": [ "## Download an Image from a web page" ] }, { "cell_type": "code", "execution_count": 63, "id": "f4dbd182", "metadata": {}, "outputs": [], "source": [ "# Load the webpage content\n", "url = \"https://keithgalli.github.io/web-scraping/\"\n", "r = requests.get(url + \"webpage.html\")\n", "r.raise_for_status()  # stop here rather than parse an HTTP error page\n", "\n", "# Convert to a beautiful soup object\n", "webpage = bs(r.content)\n", "\n", "# Query the page fetched above, not the `wp` object left over from an earlier cell\n", "images = webpage.select(\"div.row 
div.column img\")\n", "image_url = images[0]['src']\n", "full_url = url + image_url\n", "\n", "image_response = requests.get(full_url)\n", "# Fail loudly on a bad download instead of saving an HTTP error page as a .jpg\n", "image_response.raise_for_status()\n", "img_data = image_response.content\n", "with open('lake_como.jpg', 'wb') as handler:\n", "    handler.write(img_data)" ] }, { "cell_type": "markdown", "id": "874e04d5", "metadata": {}, "source": [ "**Image is Downloaded**" ] }, { "cell_type": "markdown", "id": "fb197803", "metadata": {}, "source": [ "## Solve the mystery challenge!" ] }, { "cell_type": "code", "execution_count": 78, "id": "803a3b24", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Make\n", "sure\n", "to\n", "smash\n", "that\n", "like\n", "button\n", "and\n", "subscribe\n", "!!!\n" ] } ], "source": [ "# Each link inside div.block points to a page holding one secret word\n", "files = webpage.select(\"div.block a\")\n", "relative_files = [link['href'] for link in files]\n", "\n", "url = \"https://keithgalli.github.io/web-scraping/\"\n", "for relative_file in relative_files:\n", "    full_url = url + relative_file\n", "    page = requests.get(full_url)\n", "    page.raise_for_status()\n", "    bs_page = bs(page.content)\n", "    secret_word_element = bs_page.find(\"p\", attrs={\"id\": \"secret-word\"})\n", "    if secret_word_element is None:\n", "        # A linked page without <p id=\"secret-word\"> would otherwise\n", "        # raise AttributeError on `.string` below\n", "        print(f\"No secret word found in {relative_file}\")\n", "        continue\n", "    secret_word = secret_word_element.string\n", "    print(secret_word)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }