{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "63d1c385",
   "metadata": {},
   "source": [
    "# Importing Necessary Libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a9d156f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup as bs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "45cbc274",
   "metadata": {},
   "source": [
    "## Loading our First Page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "07c70558",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html>\n",
      " <head>\n",
      "  <title>\n",
      "   HTML Example\n",
      "  </title>\n",
      " </head>\n",
      " <body>\n",
      "  <div align=\"middle\">\n",
      "   <h1>\n",
      "    HTML Webpage\n",
      "   </h1>\n",
      "   <p>\n",
      "    Link to more interesting example:\n",
      "    <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">\n",
      "     keithgalli.github.io/web-scraping/webpage.html\n",
      "    </a>\n",
      "   </p>\n",
      "  </div>\n",
      "  <h2>\n",
      "   A Header\n",
      "  </h2>\n",
      "  <p>\n",
      "   <i>\n",
      "    Some italicized text\n",
      "   </i>\n",
      "  </p>\n",
      "  <h2>\n",
      "   Another header\n",
      "  </h2>\n",
      "  <p id=\"paragraph-id\">\n",
      "   <b>\n",
      "    Some bold text\n",
      "   </b>\n",
      "  </p>\n",
      " </body>\n",
      "</html>\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Load the webpage content\n",
    "r = requests.get('https://keithgalli.github.io/web-scraping/example.html')\n",
    "\n",
    "# Convert to a beautiful soup object\n",
    "soup = bs(r.content)\n",
    "\n",
    "# Print out our html\n",
    "print(soup.prettify())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8b5ee527",
   "metadata": {},
   "source": [
    "## Start using BeautifulSoup to Scrape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "d00bfd42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<h2>A Header</h2>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first_header = soup.find('h2')\n",
    "first_header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "719ef46f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<h2>A Header</h2>, <h2>Another header</h2>]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "headers = soup.find_all('h2')\n",
    "headers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e3e42496",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<h1>HTML Webpage</h1>"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Pass in a list of elements to look for\n",
    "first_header = soup.find([\"h1\", \"h2\"])\n",
    "first_header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "75836624",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<h1>HTML Webpage</h1>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first_header = soup.find([\"h2\", \"h1\"])\n",
    "first_header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "217c5c4a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "headers = soup.find_all([\"h1\", \"h2\"])\n",
    "headers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8b92bac4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<p id=\"paragraph-id\"><b>Some bold text</b></p>]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# You can pass in attributes to the find/find_all function\n",
    "paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})\n",
    "paragraph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "afc21ba4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<h1>HTML Webpage</h1>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# You can nest find/find_all calls\n",
    "body = soup.find('body')\n",
    "div = body.find('div')\n",
    "header = div.find('h1')\n",
    "header"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "a14114bd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<p><i>Some italicized text</i></p>,\n",
       " <p id=\"paragraph-id\"><b>Some bold text</b></p>]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# We can search specific strings in our find/find_all calls\n",
    "\n",
    "import re\n",
    "\n",
    "para = soup.find_all('p', string=re.compile('Some'))\n",
    "para"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "27fd9a5f",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<h2>A Header</h2>, <h2>Another header</h2>]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "head = soup.find_all('h2', string=re.compile('(H|h)eader'))\n",
    "head"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3b0969d2",
   "metadata": {},
   "source": [
    "## Select (CSS Selector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "6e739102",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "content = soup.select('div p')\n",
    "content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "ed6a8300",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<p><i>Some italicized text</i></p>,\n",
       " <p id=\"paragraph-id\"><b>Some bold text</b></p>]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pg = soup.select('h2 ~ p')\n",
    "pg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "074f3d9b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<b>Some bold text</b>]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "bold = soup.select('p#paragraph-id b')\n",
    "bold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "f26127d4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<p><i>Some italicized text</i></p>, <p id=\"paragraph-id\"><b>Some bold text</b></p>]\n"
     ]
    }
   ],
   "source": [
    "paras = soup.select('body > p')\n",
    "print(paras)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "a7083144",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[<i>Some italicized text</i>]\n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "for para in paras:\n",
    "    print(para.select(\"i\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "8176ba6b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<div align=\"middle\">\n",
       " <h1>HTML Webpage</h1>\n",
       " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n",
       " </div>]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Grab by element with specific property\n",
    "soup.select(\"[align=middle]\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4c641def",
   "metadata": {},
   "source": [
    "## Get different properties of the HTML"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "40e797ea",
   "metadata": {},
   "source": [
    "### Getting Strings from HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "6aa807ce",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'A Header'"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# use .string\n",
    "soup.find('h2').string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "48a94d75",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "HTML Webpage\n",
      "Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# If multiple child elements use get_text\n",
    "div = soup.find('div')\n",
    "print(div.get_text())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5ce848c7",
   "metadata": {},
   "source": [
    "### Getting Links from HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "205c4bdf",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://keithgalli.github.io/web-scraping/webpage.html'"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get a specific property from an element\n",
    "link = soup.find('a')\n",
    "link['href']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1bac422",
   "metadata": {},
   "source": [
    "### Subsetting to get what you want from HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "39f5689d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'paragraph-id'"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paragraphs = soup.select(\"p#paragraph-id\")\n",
    "paragraphs[0]['id']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b46c858c",
   "metadata": {},
   "source": [
    "## Code Navigation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "64b2fb25",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<body>\n",
       " <div align=\"middle\">\n",
       " <h1>HTML Webpage</h1>\n",
       " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n",
       " </div>\n",
       " <h2>A Header</h2>\n",
       " <p><i>Some italicized text</i></p>\n",
       " <h2>Another header</h2>\n",
       " <p id=\"paragraph-id\"><b>Some bold text</b></p>\n",
       " </body>,\n",
       " <html>\n",
       " <head>\n",
       " <title>HTML Example</title>\n",
       " </head>\n",
       " <body>\n",
       " <div align=\"middle\">\n",
       " <h1>HTML Webpage</h1>\n",
       " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n",
       " </div>\n",
       " <h2>A Header</h2>\n",
       " <p><i>Some italicized text</i></p>\n",
       " <h2>Another header</h2>\n",
       " <p id=\"paragraph-id\"><b>Some bold text</b></p>\n",
       " </body>\n",
       " </html>,\n",
       " <html>\n",
       " <head>\n",
       " <title>HTML Example</title>\n",
       " </head>\n",
       " <body>\n",
       " <div align=\"middle\">\n",
       " <h1>HTML Webpage</h1>\n",
       " <p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n",
       " </div>\n",
       " <h2>A Header</h2>\n",
       " <p><i>Some italicized text</i></p>\n",
       " <h2>Another header</h2>\n",
       " <p id=\"paragraph-id\"><b>Some bold text</b></p>\n",
       " </body>\n",
       " </html>]"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Know the terms: Parent, Sibling, Child\n",
    "soup.body.find(\"div\").find_parents()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "fbaf64b1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<body>\n",
       "<div align=\"middle\">\n",
       "<h1>HTML Webpage</h1>\n",
       "<p>Link to more interesting example: <a href=\"https://keithgalli.github.io/web-scraping/webpage.html\">keithgalli.github.io/web-scraping/webpage.html</a></p>\n",
       "</div>\n",
       "<h2>A Header</h2>\n",
       "<p><i>Some italicized text</i></p>\n",
       "<h2>Another header</h2>\n",
       "<p id=\"paragraph-id\"><b>Some bold text</b></p>\n",
       "</body>"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.body.find(\"div\").find_parent()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "c4627f9d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.body.find(\"div\").find_previous_siblings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "f6c923d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "soup.body.find(\"div\").find_previous_sibling()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "4ce37173",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<h2>A Header</h2>,\n",
       " <p><i>Some italicized text</i></p>,\n",
       " <h2>Another header</h2>,\n",
       " <p id=\"paragraph-id\"><b>Some bold text</b></p>]"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.body.find(\"div\").find_next_siblings()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "6135bb51",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<h2>A Header</h2>"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soup.body.find(\"div\").find_next_sibling()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b1aea48",
   "metadata": {},
   "source": [
    "# Exercises"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "811b3fba",
   "metadata": {},
   "source": [
    "## Loading the webpage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ae0a6a30",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<html>\n",
      " <head>\n",
      "  <title>\n",
      "   Keith Galli's Page\n",
      "  </title>\n",
      "  <style>\n",
      "   table {\n",
      "    border-collapse: collapse;\n",
      "  }\n",
      "  th {\n",
      "    padding:5px;\n",
      "  }\n",
      "  td {\n",
      "    border: 1px solid #ddd;\n",
      "    padding: 5px;\n",
      "  }\n",
      "  tr:nth-child(even) {\n",
      "    background-color: #f2f2f2;\n",
      "  }\n",
      "  th {\n",
      "    padding-top: 12px;\n",
      "    padding-bottom: 12px;\n",
      "    text-align: left;\n",
      "    background-color: #add8e6;\n",
      "    color: black;\n",
      "  }\n",
      "  .block {\n",
      "  width: 100px;\n",
      "  /*float: left;*/\n",
      "    display: inline-block;\n",
      "    zoom: 1;\n",
      "  }\n",
      "  .column {\n",
      "  float: left;\n",
      "  height: 200px;\n",
      "  /*width: 33.33%;*/\n",
      "  padding: 5px;\n",
      "  }\n",
      "\n",
      "  .row::after {\n",
      "    content: \"\";\n",
      "    clear: both;\n",
      "    display: table;\n",
      "  }\n",
      "  </style>\n",
      " </head>\n",
      " <body>\n",
      "  <h1>\n",
      "   Welcome to my page!\n",
      "  </h1>\n",
      "  <img src=\"./images/selfie1.jpg\" width=\"300px\"/>\n",
      "  <h2>\n",
      "   About me\n",
      "  </h2>\n",
      "  <p>\n",
      "   Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!\n",
      "  </p>\n",
      "  <p>\n",
      "   Here is a link to my channel:\n",
      "   <a href=\"https://www.youtube.com/kgmit\">\n",
      "    youtube.com/kgmit\n",
      "   </a>\n",
      "  </p>\n",
      "  <p>\n",
      "   I grew up in the great state of New Hampshire here in the USA. From an early age I always loved math. Around my senior year of high school, my brother first introduced me to programming. I found it a creative way to apply the same type of logical thinking skills that I enjoyed with math. This influenced me to study computer science in college and ultimately create a YouTube channel to share some things that I have learned along the way.\n",
      "  </p>\n",
      "  <h3>\n",
      "   Hobbies\n",
      "  </h3>\n",
      "  <p>\n",
      "   Believe it or not, I don't code 24/7. I love doing all sorts of active things. I like to play ice hockey &amp; table tennis as well as run, hike, skateboard, and snowboard. In addition to sports, I am a board game enthusiast. The two that I've been playing the most recently are\n",
      "   <i>\n",
      "    Settlers of Catan\n",
      "   </i>\n",
      "   and\n",
      "   <i>\n",
      "    Othello\n",
      "   </i>\n",
      "   .\n",
      "  </p>\n",
      "  <h3>\n",
      "   Fun Facts\n",
      "  </h3>\n",
      "  <ul class=\"fun-facts\">\n",
      "   <li>\n",
      "    Owned my dream car in high school\n",
      "    <a href=\"#footer\">\n",
      "     <sup>\n",
      "      1\n",
      "     </sup>\n",
      "    </a>\n",
      "   </li>\n",
      "   <li>\n",
      "    Middle name is Ronald\n",
      "   </li>\n",
      "   <li>\n",
      "    Never had been on a plane until college\n",
      "   </li>\n",
      "   <li>\n",
      "    Dunkin Donuts coffee is better than Starbucks\n",
      "   </li>\n",
      "   <li>\n",
      "    A favorite book series of mine is\n",
      "    <i>\n",
      "     Ender's Game\n",
      "    </i>\n",
      "   </li>\n",
      "   <li>\n",
      "    Current video game of choice is\n",
      "    <i>\n",
      "     Rocket League\n",
      "    </i>\n",
      "   </li>\n",
      "   <li>\n",
      "    The band that I've seen the most times live is the\n",
      "    <i>\n",
      "     Zac Brown Band\n",
      "    </i>\n",
      "   </li>\n",
      "  </ul>\n",
      "  <h2>\n",
      "   Social Media\n",
      "  </h2>\n",
      "  I encourage you to check out my content on all social media platforms\n",
      "  <br/>\n",
      "  <ul class=\"socials\">\n",
      "   <li class=\"social instagram\">\n",
      "    <b>\n",
      "     Instagram:\n",
      "    </b>\n",
      "    <a href=\"https://www.instagram.com/keithgalli/\">\n",
      "     https://www.instagram.com/keithgalli/\n",
      "    </a>\n",
      "   </li>\n",
      "   <li class=\"social twitter\">\n",
      "    <b>\n",
      "     Twitter:\n",
      "    </b>\n",
      "    <a href=\"https://twitter.com/keithgalli\">\n",
      "     https://twitter.com/keithgalli\n",
      "    </a>\n",
      "   </li>\n",
      "   <li class=\"social linkedin\">\n",
      "    <b>\n",
      "     LinkedIn:\n",
      "    </b>\n",
      "    <a href=\"https://www.linkedin.com/in/keithgalli/\">\n",
      "     https://www.linkedin.com/in/keithgalli/\n",
      "    </a>\n",
      "   </li>\n",
      "   <li class=\"social tiktok\">\n",
      "    <b>\n",
      "     TikTok:\n",
      "    </b>\n",
      "    <a href=\"https://www.tiktok.com/@keithgalli\">\n",
      "     https://www.tiktok.com/@keithgalli\n",
      "    </a>\n",
      "   </li>\n",
      "  </ul>\n",
      "  <h2>\n",
      "   Photos\n",
      "  </h2>\n",
      "  Here are a few photos from a trip to italy I took last year\n",
      "  <div class=\"row\">\n",
      "   <div class=\"column\">\n",
      "    <img alt=\"Lake Como\" src=\"images/italy/lake_como.jpg\" style=\"height:100%\"/>\n",
      "   </div>\n",
      "   <div class=\"column\">\n",
      "    <img alt=\"Pontevecchio, Florence\" src=\"images/italy/pontevecchio.jpg\" style=\"height:100%\"/>\n",
      "   </div>\n",
      "   <div class=\"column\">\n",
      "    <img alt=\"Riomaggiore, Cinque de Terre\" src=\"images/italy/riomaggiore.jpg\" style=\"height:100%\"/>\n",
      "   </div>\n",
      "  </div>\n",
      "  <div>\n",
      "  </div>\n",
      "  <h2>\n",
      "   Table\n",
      "  </h2>\n",
      "  My MIT hockey stats :)\n",
      "  <br/>\n",
      "  <table class=\"hockey-stats\">\n",
      "   <thead>\n",
      "    <tr>\n",
      "     <th class=\"season\" data-sort=\"\">\n",
      "      S\n",
      "     </th>\n",
      "     <th class=\"team\" data-sort=\"team\">\n",
      "      Team\n",
      "     </th>\n",
      "     <th class=\"league\" data-sort=\"league\">\n",
      "      League\n",
      "     </th>\n",
      "     <th class=\"regular gp\" data-sort=\"gp\">\n",
      "      GP\n",
      "     </th>\n",
      "     <th class=\"regular g\" data-sort=\"g\">\n",
      "      G\n",
      "     </th>\n",
      "     <th class=\"regular a\" data-sort=\"a\">\n",
      "      A\n",
      "     </th>\n",
      "     <th class=\"regular tp\" data-sort=\"tp\">\n",
      "      TP\n",
      "     </th>\n",
      "     <th class=\"regular pim\" data-sort=\"pim\">\n",
      "      PIM\n",
      "     </th>\n",
      "     <th class=\"regular pm\" data-sort=\"pm\">\n",
      "      +/-\n",
      "     </th>\n",
      "     <th class=\"separator\">\n",
      "     </th>\n",
      "     <th class=\"postseason\">\n",
      "      POST\n",
      "     </th>\n",
      "     <th class=\"postseason gp\" data-sort=\"playoffs-gp\">\n",
      "      GP\n",
      "     </th>\n",
      "     <th class=\"postseason g\" data-sort=\"playoffs-g\">\n",
      "      G\n",
      "     </th>\n",
      "     <th class=\"postseason a\" data-sort=\"playoffs-a\">\n",
      "      A\n",
      "     </th>\n",
      "     <th class=\"postseason tp\" data-sort=\"playoffs-tp\">\n",
      "      TP\n",
      "     </th>\n",
      "     <th class=\"postseason pim\" data-sort=\"playoffs-pim\">\n",
      "      PIM\n",
      "     </th>\n",
      "     <th class=\"postseason pm\" data-sort=\"playoffs-pm\">\n",
      "      +/-\n",
      "     </th>\n",
      "    </tr>\n",
      "   </thead>\n",
      "   <tbody>\n",
      "    <tr class=\"team-continent-NA\">\n",
      "     <td class=\"season sorted\">\n",
      "      2014-15\n",
      "     </td>\n",
      "     <td class=\"team\">\n",
      "      <i>\n",
      "       <img src=\"images/flag.png\"/>\n",
      "      </i>\n",
      "      <span class=\"txt-blue\">\n",
      "       <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2014-2015?tab=stats\">\n",
      "        MIT (Mass. Inst. of Tech.)\n",
      "       </a>\n",
      "      </span>\n",
      "     </td>\n",
      "     <td class=\"league\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2014-2015\">\n",
      "       ACHA II\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"regular gp\">\n",
      "      17\n",
      "     </td>\n",
      "     <td class=\"regular g\">\n",
      "      3\n",
      "     </td>\n",
      "     <td class=\"regular a\">\n",
      "      9\n",
      "     </td>\n",
      "     <td class=\"regular tp\">\n",
      "      12\n",
      "     </td>\n",
      "     <td class=\"regular pim\">\n",
      "      20\n",
      "     </td>\n",
      "     <td class=\"regular pm\">\n",
      "     </td>\n",
      "     <td class=\"separator\">\n",
      "      |\n",
      "     </td>\n",
      "     <td class=\"postseason\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2014-2015\">\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"postseason gp\">\n",
      "     </td>\n",
      "     <td class=\"postseason g\">\n",
      "     </td>\n",
      "     <td class=\"postseason a\">\n",
      "     </td>\n",
      "     <td class=\"postseason tp\">\n",
      "     </td>\n",
      "     <td class=\"postseason pim\">\n",
      "     </td>\n",
      "     <td class=\"postseason pm\">\n",
      "     </td>\n",
      "    </tr>\n",
      "    <tr class=\"team-continent-NA\">\n",
      "     <td class=\"season sorted\">\n",
      "      2015-16\n",
      "     </td>\n",
      "     <td class=\"team\">\n",
      "      <i>\n",
      "       <img src=\"images/flag.png\"/>\n",
      "      </i>\n",
      "      <span class=\"txt-blue\">\n",
      "       <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2015-2016?tab=stats\">\n",
      "        MIT (Mass. Inst. of Tech.)\n",
      "       </a>\n",
      "      </span>\n",
      "     </td>\n",
      "     <td class=\"league\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2015-2016\">\n",
      "       ACHA II\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"regular gp\">\n",
      "      9\n",
      "     </td>\n",
      "     <td class=\"regular g\">\n",
      "      1\n",
      "     </td>\n",
      "     <td class=\"regular a\">\n",
      "      1\n",
      "     </td>\n",
      "     <td class=\"regular tp\">\n",
      "      2\n",
      "     </td>\n",
      "     <td class=\"regular pim\">\n",
      "      2\n",
      "     </td>\n",
      "     <td class=\"regular pm\">\n",
      "     </td>\n",
      "     <td class=\"separator\">\n",
      "      |\n",
      "     </td>\n",
      "     <td class=\"postseason\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2015-2016\">\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"postseason gp\">\n",
      "     </td>\n",
      "     <td class=\"postseason g\">\n",
      "     </td>\n",
      "     <td class=\"postseason a\">\n",
      "     </td>\n",
      "     <td class=\"postseason tp\">\n",
      "     </td>\n",
      "     <td class=\"postseason pim\">\n",
      "     </td>\n",
      "     <td class=\"postseason pm\">\n",
      "     </td>\n",
      "    </tr>\n",
      "    <tr class=\"team-continent-NA\">\n",
      "     <td class=\"season sorted\">\n",
      "      2016-17\n",
      "     </td>\n",
      "     <td class=\"team\">\n",
      "      <i>\n",
      "       <img src=\"images/flag.png\"/>\n",
      "      </i>\n",
      "      <span class=\"txt-blue\">\n",
      "       <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2016-2017?tab=stats\">\n",
      "        MIT (Mass. Inst. of Tech.)\n",
      "       </a>\n",
      "      </span>\n",
      "     </td>\n",
      "     <td class=\"league\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-ii/stats/2016-2017\">\n",
      "       ACHA II\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"regular gp\">\n",
      "      12\n",
      "     </td>\n",
      "     <td class=\"regular g\">\n",
      "      5\n",
      "     </td>\n",
      "     <td class=\"regular a\">\n",
      "      5\n",
      "     </td>\n",
      "     <td class=\"regular tp\">\n",
      "      10\n",
      "     </td>\n",
      "     <td class=\"regular pim\">\n",
      "      8\n",
      "     </td>\n",
      "     <td class=\"regular pm\">\n",
      "      0\n",
      "     </td>\n",
      "     <td class=\"separator\">\n",
      "      |\n",
      "     </td>\n",
      "     <td class=\"postseason\">\n",
      "     </td>\n",
      "     <td class=\"postseason gp\">\n",
      "     </td>\n",
      "     <td class=\"postseason g\">\n",
      "     </td>\n",
      "     <td class=\"postseason a\">\n",
      "     </td>\n",
      "     <td class=\"postseason tp\">\n",
      "     </td>\n",
      "     <td class=\"postseason pim\">\n",
      "     </td>\n",
      "     <td class=\"postseason pm\">\n",
      "     </td>\n",
      "    </tr>\n",
      "    <tr class=\"team-continent-EU\">\n",
      "     <td class=\"season sorted\">\n",
      "      2017-18\n",
      "     </td>\n",
      "     <td class=\"team\">\n",
      "      Did not play\n",
      "     </td>\n",
      "     <td class=\"league\">\n",
      "      <a href=\"https://www.eliteprospects.com/stats\">\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"regular gp\">\n",
      "     </td>\n",
      "     <td class=\"regular g\">\n",
      "     </td>\n",
      "     <td class=\"regular a\">\n",
      "     </td>\n",
      "     <td class=\"regular tp\">\n",
      "     </td>\n",
      "     <td class=\"regular pim\">\n",
      "     </td>\n",
      "     <td class=\"regular pm\">\n",
      "     </td>\n",
      "     <td class=\"separator\">\n",
      "      |\n",
      "     </td>\n",
      "     <td class=\"postseason\">\n",
      "      <a href=\"https://www.eliteprospects.com/stats\">\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"postseason gp\">\n",
      "     </td>\n",
      "     <td class=\"postseason g\">\n",
      "     </td>\n",
      "     <td class=\"postseason a\">\n",
      "     </td>\n",
      "     <td class=\"postseason tp\">\n",
      "     </td>\n",
      "     <td class=\"postseason pim\">\n",
      "     </td>\n",
      "     <td class=\"postseason pm\">\n",
      "     </td>\n",
      "    </tr>\n",
      "    <tr class=\"team-continent-NA\">\n",
      "     <td class=\"season sorted\">\n",
      "      2018-19\n",
      "     </td>\n",
      "     <td class=\"team\">\n",
      "      <i>\n",
      "       <img src=\"images/flag.png\"/>\n",
      "      </i>\n",
      "      <span class=\"txt-blue\">\n",
      "       <a href=\"https://www.eliteprospects.com/team/10263/mit-mass.-inst.-of-tech./2018-2019?tab=stats\">\n",
      "        MIT (Mass. Inst. of Tech.)\n",
      "       </a>\n",
      "      </span>\n",
      "     </td>\n",
      "     <td class=\"league\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-iii/stats/2018-2019\">\n",
      "       ACHA III\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"regular gp\">\n",
      "      8\n",
      "     </td>\n",
      "     <td class=\"regular g\">\n",
      "      5\n",
      "     </td>\n",
      "     <td class=\"regular a\">\n",
      "      10\n",
      "     </td>\n",
      "     <td class=\"regular tp\">\n",
      "      15\n",
      "     </td>\n",
      "     <td class=\"regular pim\">\n",
      "      8\n",
      "     </td>\n",
      "     <td class=\"regular pm\">\n",
      "     </td>\n",
      "     <td class=\"separator\">\n",
      "      |\n",
      "     </td>\n",
      "     <td class=\"postseason\">\n",
      "      <a href=\"https://www.eliteprospects.com/league/acha-iii/stats/2018-2019\">\n",
      "      </a>\n",
      "     </td>\n",
      "     <td class=\"postseason gp\">\n",
      "     </td>\n",
      "     <td class=\"postseason g\">\n",
      "     </td>\n",
      "     <td class=\"postseason a\">\n",
      "     </td>\n",
      "     <td class=\"postseason tp\">\n",
      "     </td>\n",
      "     <td class=\"postseason pim\">\n",
      "     </td>\n",
      "     <td class=\"postseason pm\">\n",
      "     </td>\n",
      "    </tr>\n",
      "   </tbody>\n",
      "  </table>\n",
      "  <h2>\n",
      "   Mystery Message Challenge!\n",
      "  </h2>\n",
      "  <p>\n",
      "   If you scrape the links below grabbing the &lt;p&gt; tag with id=\"secret-word\", you'll discover a secret message :)\n",
      "  </p>\n",
      "  <div width=\"50%\">\n",
      "   <div align=\"left\" class=\"block\">\n",
      "    <ul>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_1.html\">\n",
      "       File 1\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_2.html\">\n",
      "       File 2\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_3.html\">\n",
      "       File 3\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_4.html\">\n",
      "       File 4\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_5.html\">\n",
      "       File 5\n",
      "      </a>\n",
      "     </li>\n",
      "    </ul>\n",
      "   </div>\n",
      "   <div align=\"center\" class=\"block\">\n",
      "    <ul>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_6.html\">\n",
      "       File 6\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_7.html\">\n",
      "       File 7\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_8.html\">\n",
      "       File 8\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_9.html\">\n",
      "       File 9\n",
      "      </a>\n",
      "     </li>\n",
      "     <li>\n",
      "      <a href=\"challenge/file_10.html\">\n",
      "       File 10\n",
      "      </a>\n",
      "     </li>\n",
      "    </ul>\n",
      "   </div>\n",
      "  </div>\n",
      "  <h2>\n",
      "   Footnotes\n",
      "  </h2>\n",
      "  <p id=\"footer\">\n",
      "   1. This was actually a minivan that I named Debora. Maybe not my dream car, but I loved her nonetheless.\n",
      "  </p>\n",
      " </body>\n",
      "</html>\n"
     ]
    }
   ],
   "source": [
    "# Load the webpage content\n",
    "r = requests.get(\"https://keithgalli.github.io/web-scraping/webpage.html\")\n",
    "\n",
    "# Convert to a beautiful soup object\n",
    "wp = bs(r.content)\n",
    "\n",
    "# Print out our html\n",
    "print(wp.prettify())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "40b80738",
   "metadata": {},
   "source": [
    "## Question 1: Grab all of the social links from the web page in 4 ways"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "192ba3e2",
   "metadata": {},
   "source": [
    "Link to the web page: https://keithgalli.github.io/web-scraping/webpage.html"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9d71711d",
   "metadata": {},
   "source": [
    "### Method 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "751d8c00",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.instagram.com/keithgalli/',\n",
       " 'https://twitter.com/keithgalli',\n",
       " 'https://www.linkedin.com/in/keithgalli/',\n",
       " 'https://www.tiktok.com/@keithgalli']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links = wp.select('ul.socials a')\n",
    "actual_links = [link['href'] for link in links]\n",
    "actual_links"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31499b56",
   "metadata": {},
   "source": [
    "### Method 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "0360ec41",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.instagram.com/keithgalli/',\n",
       " 'https://twitter.com/keithgalli',\n",
       " 'https://www.linkedin.com/in/keithgalli/',\n",
       " 'https://www.tiktok.com/@keithgalli']"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ulist = wp.find('ul', attrs={'class': 'socials'})\n",
    "links = ulist.find_all(\n",
    "    \"a\")  #adding this step because find doesn't give the output as a list\n",
    "actual_links = [link['href'] for link in links]\n",
    "actual_links"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f1add700",
   "metadata": {},
   "source": [
    "### Method 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "edcb5393",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.instagram.com/keithgalli/',\n",
       " 'https://twitter.com/keithgalli',\n",
       " 'https://www.linkedin.com/in/keithgalli/',\n",
       " 'https://www.tiktok.com/@keithgalli']"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links = wp.select(\"li.social a\")\n",
    "actual_links = [link['href'] for link in links]\n",
    "actual_links"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "03cbef4a",
   "metadata": {},
   "source": [
    "### Method 4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "31f34d16",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['https://www.instagram.com/keithgalli/',\n",
       " 'https://twitter.com/keithgalli',\n",
       " 'https://www.linkedin.com/in/keithgalli/',\n",
       " 'https://www.tiktok.com/@keithgalli']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "links = wp.select(\"body ul li.social a\")\n",
    "actual_links = [link['href'] for link in links]\n",
    "actual_links"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eafed54f",
   "metadata": {},
   "source": [
    "## Scraping the MIT Hockey Stats table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "4771e50b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "3f3482c0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>S</th>\n",
       "      <th>Team</th>\n",
       "      <th>League</th>\n",
       "      <th>GP</th>\n",
       "      <th>G</th>\n",
       "      <th>A</th>\n",
       "      <th>TP</th>\n",
       "      <th>PIM</th>\n",
       "      <th>+/-</th>\n",
       "      <th></th>\n",
       "      <th>POST</th>\n",
       "      <th>GP</th>\n",
       "      <th>G</th>\n",
       "      <th>A</th>\n",
       "      <th>TP</th>\n",
       "      <th>PIM</th>\n",
       "      <th>+/-</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2014-15</td>\n",
       "      <td>MIT (Mass. Inst. of Tech.)</td>\n",
       "      <td>ACHA II</td>\n",
       "      <td>17</td>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "      <td>12</td>\n",
       "      <td>20</td>\n",
       "      <td></td>\n",
       "      <td>|</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2015-16</td>\n",
       "      <td>MIT (Mass. Inst. of Tech.)</td>\n",
       "      <td>ACHA II</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td></td>\n",
       "      <td>|</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2016-17</td>\n",
       "      <td>MIT (Mass. Inst. of Tech.)</td>\n",
       "      <td>ACHA II</td>\n",
       "      <td>12</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>8</td>\n",
       "      <td>0</td>\n",
       "      <td>|</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2017-18</td>\n",
       "      <td>Did not play</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td>|</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2018-19</td>\n",
       "      <td>MIT (Mass. Inst. of Tech.)</td>\n",
       "      <td>ACHA III</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>15</td>\n",
       "      <td>8</td>\n",
       "      <td></td>\n",
       "      <td>|</td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "      <td></td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "         S                        Team    League  GP  G   A  TP PIM +/-     \\\n",
       "0  2014-15  MIT (Mass. Inst. of Tech.)   ACHA II  17  3   9  12  20      |   \n",
       "1  2015-16  MIT (Mass. Inst. of Tech.)   ACHA II   9  1   1   2   2      |   \n",
       "2  2016-17  MIT (Mass. Inst. of Tech.)   ACHA II  12  5   5  10   8   0  |   \n",
       "3  2017-18                Did not play                                   |   \n",
       "4  2018-19  MIT (Mass. Inst. of Tech.)  ACHA III   8  5  10  15   8      |   \n",
       "\n",
       "  POST GP G A TP PIM +/-  \n",
       "0                         \n",
       "1                         \n",
       "2                         \n",
       "3                         \n",
       "4                         "
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "table = wp.select('table.hockey-stats')[0]\n",
    "columns = table.find_all('th')\n",
    "column_names = [c.string for c in columns]\n",
    "\n",
    "table_rows = table.find('tbody').find_all('tr')\n",
    "\n",
    "l = []  #creating an empty list\n",
    "for tr in table_rows:\n",
    "    td = tr.find_all('td')\n",
    "    row = [str(tr.get_text()).strip() for tr in td]\n",
    "    l.append(row)\n",
    "\n",
    "# print(l[0])\n",
    "\n",
    "df = pd.DataFrame(l, columns=column_names)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7a4c3c04",
   "metadata": {},
   "source": [
    "## Grab all fun facts that contain the word 'is'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "3eb91400",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Middle name is Ronald',\n",
       " 'Dunkin Donuts coffee is better than Starbucks',\n",
       " \"A favorite book series of mine is Ender's Game\",\n",
       " 'Current video game of choice is Rocket League',\n",
       " \"The band that I've seen the most times live is the Zac Brown Band\"]"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "facts = wp.select('ul.fun-facts li')\n",
    "facts_with_is = [fact.find(string=re.compile('is')) for fact in facts]\n",
    "facts_with_is = [\n",
    "    fact.find_parent().get_text() for fact in facts_with_is if fact\n",
    "]\n",
    "facts_with_is"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1041b000",
   "metadata": {},
   "source": [
    "## Download an Image from a web page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "f4dbd182",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the webpage content\n",
    "url = \"https://keithgalli.github.io/web-scraping/\"\n",
    "r = requests.get(url + \"webpage.html\")\n",
    "\n",
    "# Convert to a beautiful soup object\n",
    "webpage = bs(r.content)\n",
    "\n",
    "images = wp.select(\"div.row div.column img\")\n",
    "image_url = images[0]['src']\n",
    "full_url = url + image_url\n",
    "\n",
    "img_data = requests.get(full_url).content\n",
    "with open('lake_como.jpg', 'wb') as handler:\n",
    "    handler.write(img_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "874e04d5",
   "metadata": {},
   "source": [
    "**Image is Downloaded**"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fb197803",
   "metadata": {},
   "source": [
    "## Solve the mystery challenge!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "803a3b24",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Make\n",
      "sure\n",
      "to\n",
      "smash\n",
      "that\n",
      "like\n",
      "button\n",
      "and\n",
      "subscribe\n",
      "!!!\n"
     ]
    }
   ],
   "source": [
    "files = webpage.select(\"div.block a\")\n",
    "relative_files = [f['href'] for f in files]\n",
    "\n",
    "\n",
    "url = \"https://keithgalli.github.io/web-scraping/\"\n",
    "for f in relative_files:\n",
    "    full_url = url + f\n",
    "    page = requests.get(full_url)\n",
    "    bs_page = bs(page.content)\n",
    "    secret_word_element = bs_page.find(\"p\", attrs={\"id\": \"secret-word\"})\n",
    "    secret_word = secret_word_element.string\n",
    "    print(secret_word)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}