# Importing Necessary Libraries

In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as bs

## Loading our First Page

In [5]:
# Load the webpage content. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html', timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object. The parser is named explicitly so the
# parse tree does not depend on which optional parsers happen to be installed.
soup = bs(r.content, 'html.parser')

# prettify() returns a string, so print it to render the indentation
print(soup.prettify())

<html>
 <head>
 <title>
 HTML Example
 </title>
 </head>
 <body>
 <div align="middle">
 <h1>
 HTML Webpage
 </h1>
 <p>
 Link to more interesting example:
 <a href="https://keithgalli.github.io/web-scraping/webpage.html">
 keithgalli.github.io/web-scraping/webpage.html
 </a>
 </p>
 </div>
 <h2>
 A Header
 </h2>
 <p>
 <i>
 Some italicized text
 </i>
 </p>
 <h2>
 Another header
 </h2>
 <p id="paragraph-id">
 <b>
 Some bold text
 </b>
 </p>
 </body>
</html>



## Start using BeautifulSoup to Scrape

In [8]:
# find() returns only the first matching tag
first_header = soup.find(name='h2')
first_header

<h2>A Header</h2>

In [9]:
# find_all() returns a list with every matching tag
headers = soup.find_all(name='h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [10]:
# A list of tag names matches any of them; find() still returns a single tag
first_header = soup.find(name=["h1", "h2"])
first_header

<h1>HTML Webpage</h1>

In [11]:
# Swapping the order of the names in the list still yields the <h1> here
first_header = soup.find(name=["h2", "h1"])
first_header

<h1>HTML Webpage</h1>

In [12]:
# Collect every <h1> and <h2> tag in one call
headers = soup.find_all(name=["h1", "h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [16]:
# Attribute filters can be given to find/find_all; here as a keyword argument
paragraph = soup.find_all('p', id='paragraph-id')
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [19]:
# find/find_all can be called on any tag, which narrows the search step by step:
# <body> -> first <div> inside it -> first <h1> inside that div
body = soup.find(name='body')
div = body.find(name='div')
header = div.find(name='h1')
header

<h1>HTML Webpage</h1>

In [21]:
# find/find_all accept a compiled regular expression through `string`

import re

some_pattern = re.compile('Some')
para = soup.find_all('p', string=some_pattern)
para

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [22]:
# The alternation accepts both the capitalised and the lowercase spelling
header_pattern = re.compile('(H|h)eader')
head = soup.find_all('h2', string=header_pattern)
head

[<h2>A Header</h2>, <h2>Another header</h2>]

## Select (CSS Selector)

In [24]:
# 'div p' is a descendant selector: <p> tags nested inside a <div>
content = soup.select('div p')
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [25]:
# 'h2 ~ p' is the general-sibling selector: <p> tags that come after an <h2>
# under the same parent
pg = soup.select('h2 ~ p')
pg

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [28]:
# <b> tags nested inside the <p> whose id is "paragraph-id"
bold = soup.select('p#paragraph-id b')
bold

[<b>Some bold text</b>]

In [40]:
# 'body > p' is the child selector: only <p> tags that are direct children of
# <body>, so the paragraph nested inside the <div> is not returned
paras = soup.select('body > p')
print(paras)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [41]:
# select() also works on an individual tag and searches only inside that tag.
# The loop variable is not called `para`, so the result list an earlier cell
# stored under that name is not overwritten with a single tag.
for p_tag in paras:
    print(p_tag.select("i"))

[<i>Some italicized text</i>]
[]


In [35]:
# Attribute selector: grab the elements whose `align` attribute equals "middle"
soup.select("[align=middle]")

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

## Get different properties of the HTML

### Getting Strings from HTML

In [45]:
# .string returns the text held by the tag
soup.find(name='h2').string

'A Header'

In [49]:
# When a tag has several child elements, get_text() gathers all of their text
div = soup.find(name='div')
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



### Getting Links from HTML

In [50]:
# Attributes of a tag are read with dictionary-style indexing
link = soup.find(name='a')
link['href']

'https://keithgalli.github.io/web-scraping/webpage.html'

### Subsetting to get what you want from HTML

In [51]:
# select() returns a list, so index into it to reach the tag and then read
# its `id` attribute with dictionary-style access
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

## Code Navigation

In [61]:
# Know the terms: Parent, Sibling, Child
# find_parents() lists the elements enclosing the first <div>, innermost first
soup.body.find("div").find_parents()

[<body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>
 <h2>A Header</h2>
 <p><i>Some italicized text</i></p>
 <h2>Another header</h2>
 <p id="paragraph-id"><b>Some bold text</b></p>
 </body>,
 <html>
 <head>
 <title>HTML Example</title>
 </head>
 <body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>
 <h2>A Header</h2>
 <p><i>Some italicized text</i></p>
 <h2>Another header</h2>
 <p id="paragraph-id"><b>Some bold text</b></p>
 </body>
 </html>,
 <html>
 <head>
 <title>HTML Example</title>
 </head>
 <body>
 <div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.gi

In [62]:
# find_parent() returns only the immediate parent of the <div> (<body> here)
soup.body.find("div").find_parent()

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [63]:
# Tags at the same level that come before the <div>; the list is empty here
soup.body.find("div").find_previous_siblings()

[]

In [64]:
# Single-result variant; the <div> has no previous sibling, so nothing is displayed
soup.body.find("div").find_previous_sibling()

In [59]:
# Tags at the same level that come after the <div>, in document order
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [60]:
# Single-result variant: only the sibling tag immediately after the <div>
soup.body.find("div").find_next_sibling()

<h2>A Header</h2>

# Exercises

## Loading the webpage

In [2]:
# Load the webpage content. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from silently parsing an HTTP error page.
r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html", timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object. The parser is named explicitly so the
# parse tree does not depend on which optional parsers happen to be installed.
wp = bs(r.content, "html.parser")

# prettify() returns a string, so print it to render the indentation
print(wp.prettify())

<html>
 <head>
 <title>
 Keith Galli's Page
 </title>
 <style>
 table {
 border-collapse: collapse;
 }
 th {
 padding:5px;
 }
 td {
 border: 1px solid #ddd;
 padding: 5px;
 }
 tr:nth-child(even) {
 background-color: #f2f2f2;
 }
 th {
 padding-top: 12px;
 padding-bottom: 12px;
 text-align: left;
 background-color: #add8e6;
 color: black;
 }
 .block {
 width: 100px;
 /*float: left;*/
 display: inline-block;
 zoom: 1;
 }
 .column {
 float: left;
 height: 200px;
 /*width: 33.33%;*/
 padding: 5px;
 }

 .row::after {
 content: "";
 clear: both;
 display: table;
 }
 </style>
 </head>
 <body>
 <h1>
 Welcome to my page!
 </h1>
 <img src="./images/selfie1.jpg" width="300px"/>
 <h2>
 About me
 </h2>
 <p>
 Hi, my name is Keith and I am a YouTuber who focuses on content related to programming, data science, and machine learning!
 </p>
 <p>
 Here is a link to my channel:
 <a href="https://www.youtube.com/kgmit">
 youtube.com/kgmit
 </a>
 </p>
 <p>
 I grew up in the great state of New Hampshire here 

## Question 1: Grab all of the social links from the web page in 4 ways

Link to the web page: https://keithgalli.github.io/web-scraping/webpage.html

### Method 1

In [5]:
# Anchors nested inside the <ul class="socials"> list
links = wp.select('ul.socials a')
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Method 2

In [19]:
# find() returns a single tag rather than a list, so find_all() is called on
# that tag to collect the anchors inside it
ulist = wp.find('ul', class_='socials')
links = ulist.find_all("a")
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Method 3

In [22]:
# Anchors nested inside each <li class="social"> item
links = wp.select("li.social a")
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

### Method 4

In [31]:
# Same items, reached through the full path body -> ul -> li.social -> a
links = wp.select("body ul li.social a")
actual_links = [anchor['href'] for anchor in links]
actual_links

['https://www.instagram.com/keithgalli/',
 'https://twitter.com/keithgalli',
 'https://www.linkedin.com/in/keithgalli/',
 'https://www.tiktok.com/@keithgalli']

## Scraping the MIT Hockey Stats table

In [28]:
import pandas as pd

In [48]:
# Locate the stats table and read the header cells as column names
table = wp.select('table.hockey-stats')[0]
columns = table.find_all('th')
column_names = [c.string for c in columns]

table_rows = table.find('tbody').find_all('tr')

# One list of stripped cell texts per table row. Rows and cells use distinct
# names so the inner loop no longer reuses the row variable for cells.
rows = [
    [cell.get_text().strip() for cell in table_row.find_all('td')]
    for table_row in table_rows
]

df = pd.DataFrame(rows, columns=column_names)
df

Unnamed: 0,S,Team,League,GP,G,A,TP,PIM,+/-,Unnamed: 10,POST,GP.1,G.1,A.1,TP.1,PIM.1,+/-.1
0,2014-15,MIT (Mass. Inst. of Tech.),ACHA II,17.0,3.0,9.0,12.0,20.0,,|,,,,,,,
1,2015-16,MIT (Mass. Inst. of Tech.),ACHA II,9.0,1.0,1.0,2.0,2.0,,|,,,,,,,
2,2016-17,MIT (Mass. Inst. of Tech.),ACHA II,12.0,5.0,5.0,10.0,8.0,0.0,|,,,,,,,
3,2017-18,Did not play,,,,,,,,|,,,,,,,
4,2018-19,MIT (Mass. Inst. of Tech.),ACHA III,8.0,5.0,10.0,15.0,8.0,,|,,,,,,,


## Grab all fun facts that contain the word 'is'

In [61]:
import re

facts = wp.select('ul.fun-facts li')

# \b word boundaries restrict the match to the standalone word "is"; a bare
# 'is' pattern would also match it inside other words such as "this".
is_word = re.compile(r'\bis\b')

# First matching text node per fact, or None when the fact has no match
matching_strings = [fact.find(string=is_word) for fact in facts]

# Report the full text of the element that directly contains each match
facts_with_is = [
    match.find_parent().get_text() for match in matching_strings if match
]
facts_with_is

['Middle name is Ronald',
 'Dunkin Donuts coffee is better than Starbucks',
 "A favorite book series of mine is Ender's Game",
 'Current video game of choice is Rocket League',
 "The band that I've seen the most times live is the Zac Brown Band"]

## Download an Image from a web page

In [63]:
# Load the webpage content; fail early on an HTTP error response
url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(urljoin(url, "webpage.html"), timeout=10)
r.raise_for_status()

# Convert to a beautiful soup object, naming the parser explicitly
webpage = bs(r.content, "html.parser")

# Query the page parsed in this cell (`webpage`) rather than relying on a
# soup object left over from another cell
images = webpage.select("div.row div.column img")
image_url = images[0]['src']

# urljoin resolves the image's src against the base URL
full_url = urljoin(url, image_url)

# Check the status so an error page is never written to disk as a .jpg
img_response = requests.get(full_url, timeout=10)
img_response.raise_for_status()
img_data = img_response.content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)

**The image has been downloaded and saved as `lake_como.jpg`.**

## Solve the mystery challenge!

In [78]:
# Collect the relative links to the challenge files
files = webpage.select("div.block a")
relative_files = [anchor['href'] for anchor in files]

url = "https://keithgalli.github.io/web-scraping/"
for relative_file in relative_files:
    # Resolve each relative link against the base URL
    full_url = urljoin(url, relative_file)

    # Fail early on an HTTP error instead of parsing an error page
    page = requests.get(full_url, timeout=10)
    page.raise_for_status()
    bs_page = bs(page.content, "html.parser")

    # Each file's word is read from the <p id="secret-word"> element
    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)

Make
sure
to
smash
that
like
button
and
subscribe
!!!
