<html>
<head>
<title>Title</title>
<link rel="icon" type="icon" href="http://a" />
<link rel="icon" type="icon" href="http://b" />
<script src="https://c.js"></script>
</head>
<body>
<div>
<p>Click <b>here</b> now.</p>
<span>Frozen</span>
</div>
<table style="width:100%">
<tr>
<td>Kristen</td>
<td>Bell</td>
</tr>
<tr>
<td>Idina</td>
<td>Menzel</td>
</tr>
</table>
<img src="http://ia.media-imdb.com/images.png"/>
</body>
</html>
<a href="http://github.com">GitHub</a>
<a></a>
- tag name
href
- attribute (name)
"http://github.com"
- attribute (value)
GitHub
- content
html
head
title
link
link
script
body
div
p
b
span
table
tr
td
td
tr
td
td
img
html
head
title
link
link
script
body
div
p
b
span
table
tr
td
td
tr
td
td
img
span {
color: #ffffff;
}
.num {
color: #a8660d;
}
table.data {
width: auto;
}
#firstname {
background-color: yellow;
}
<span class="bigname" id="shiny">Shiny</span>
<span></span>
- tag name
bigname
- class (optional)
shiny
- id (optional)
span
.bigname
span.bigname
#shiny
Prefix | Matches |
---|---|
none | tag |
. | class |
# | id |
rvest
read_html()
html_nodes()
# Attach rvest so that read_html() and the html_*() helpers are available.
library(rvest)
# Download the page at this URL and parse it; the result prints as an
# {xml_document}.
frozen <- read_html("http://www.imdb.com/title/tt2294629/")
# Typing the name at top level auto-prints a summary of the parsed document.
frozen
## {xml_document}
## <html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset= ...
## [2] <body id="styleguide-v2" class="fixed">\n<script>\n if (typeof ue ...
# Select every element matching the CSS selector "em" (no prefix = tag name).
# The result is an xml_nodeset, even when only one node matches.
itals <- html_nodes(frozen, "em")
# Auto-print the matched nodes.
itals
## {xml_nodeset (1)}
## [1] <em class="nobr">Written by\n<a href="/search/title?plot_author=DeAl ...
# Print the node set again as a reference point for the accessors below.
itals
## {xml_nodeset (1)}
## [1] <em class="nobr">Written by\n<a href="/search/title?plot_author=DeAl ...
# Text content of each node, with the markup removed.
html_text(itals)
## [1] "Written by\nDeAlan Wilson for ComedyE.com"
# Tag name of each node.
html_name(itals)
## [1] "em"
# Child nodes of each node (here the <a> nested inside the <em>).
html_children(itals)
## {xml_nodeset (1)}
## [1] <a href="/search/title?plot_author=DeAlan%20Wilson%20for%20ComedyE.c ...
# Value of one named attribute per node.
html_attr(itals, "class")
## [1] "nobr"
# All attributes of each node: a list with one named character vector per node.
html_attrs(itals)
## [[1]]
## class
## "nobr"
Select the span elements with class = "itemprop"
# Attach rvest and parse the page so this snippet can be run on its own.
library(rvest)
frozen <- read_html("http://www.imdb.com/title/tt2294629/")
# "span.itemprop" = tag name "span" combined with class "itemprop".
cast <- html_nodes(frozen, "span.itemprop")
# Extract the text of every match. The printed result shows that this selector
# is broader than the cast list: it also returns genres, directors, keywords
# and studios.
html_text(cast)
## [1] "Animation" "Adventure"
## [3] "Comedy" "Chris Buck"
## [5] "Jennifer Lee" "Jennifer Lee"
## [7] "Hans Christian Andersen" "Kristen Bell"
## [9] "Idina Menzel" "Jonathan Groff"
## [11] "Kristen Bell" "Idina Menzel"
## [13] "Jonathan Groff" "Josh Gad"
## [15] "Santino Fontana" "Alan Tudyk"
## [17] "Ciarán Hinds" "Chris Williams"
## [19] "Stephen J. Anderson" "Maia Wilson"
## [21] "Edie McClurg" "Robert Pine"
## [23] "Maurice LaMarche" "Livvy Stubenrauch"
## [25] "Eva Bella" "female protagonist"
## [27] "sister sister relationship" "snowman"
## [29] "sister love" "magic"
## [31] "Walt Disney Animation Studios" "Walt Disney Pictures"
# Open the installed vignette whose topic is "selectorgadget".
vignette("selectorgadget")
html_nodes()
Use SelectorGadget to find a CSS selector combination that identifies just the cast member names
# Narrow the match: only span.itemprop elements that are descendants of the
# element whose id is "titleCast".
cast2 <- html_nodes(frozen, "#titleCast span.itemprop")
# Text of each matched node; the printed result contains only the 15 names.
html_text(cast2)
## [1] "Kristen Bell" "Idina Menzel" "Jonathan Groff"
## [4] "Josh Gad" "Santino Fontana" "Alan Tudyk"
## [7] "Ciarán Hinds" "Chris Williams" "Stephen J. Anderson"
## [10] "Maia Wilson" "Edie McClurg" "Robert Pine"
## [13] "Maurice LaMarche" "Livvy Stubenrauch" "Eva Bella"
# Alternative selector: elements with class "itemprop" nested inside another
# element that also has class "itemprop".
cast3 <- html_nodes(frozen, ".itemprop .itemprop")
# Text of each matched node; the printed result matches that of cast2.
html_text(cast3)
## [1] "Kristen Bell" "Idina Menzel" "Jonathan Groff"
## [4] "Josh Gad" "Santino Fontana" "Alan Tudyk"
## [7] "Ciarán Hinds" "Chris Williams" "Stephen J. Anderson"
## [10] "Maia Wilson" "Edie McClurg" "Robert Pine"
## [13] "Maurice LaMarche" "Livvy Stubenrauch" "Eva Bella"
html_nodes() and html_text()
# Download and parse the cost-of-living page.
sterling <- read_html("http://www.bestplaces.net/cost_of_living/city/virginia/sterling")
# Inside the element with id "mainContent_dgCostOfLiving", select the <td> that
# is the second child of a <tr> which is itself a second child, i.e. a single
# table cell.
col <- html_nodes(sterling, css = "#mainContent_dgCostOfLiving tr:nth-child(2) td:nth-child(2)")
# Text of that cell; it is returned as a character string, not a number.
html_text(col)
## [1] "136"
# or use a piped operation
# Same selection and extraction, written as a pipeline without the
# intermediate `col` variable.
sterling %>%
html_nodes(css = "#mainContent_dgCostOfLiving tr:nth-child(2) td:nth-child(2)") %>%
html_text()
## [1] "136"
# Collect every <table> node on the cost-of-living page.
tables <- html_nodes(sterling, css = "table")
tables %>%
  # get the second table
  # (`[[` is used instead of nth(): nth() comes from dplyr, which this
  # tutorial never attaches, so the call would fail with only rvest loaded)
  .[[2]] %>%
  # convert to data frame, using the first row as column names
  html_table(header = TRUE)
## COST OF LIVING Sterling, Virginia United States
## 1 Overall 136.0 100
## 2 Grocery 113.9 100
## 3 Health 101.0 100
## 4 Housing 203.0 100
## 5 Utilities 107.0 100
## 6 Transportation 108.0 100
## 7 Miscellaneous 98.0 100
Extract the climate statistics of your hometown as a data frame with useful column names
# Download and parse the climate page.
sterling_climate <- read_html("http://www.bestplaces.net/climate/city/virginia/sterling")
# Collect every <table> node on the page.
climate <- html_nodes(sterling_climate, css = "table")
# Pick the second table first and convert only that one. Converting the whole
# node set and then indexing would parse tables that are immediately thrown
# away, and would fail if any unrelated table could not be converted.
# header = TRUE uses the first row as column names; fill = TRUE pads short rows.
html_table(climate[[2]], header = TRUE, fill = TRUE)
## CLIMATE Sterling, Virginia United States
## 1 Rainfall (in.) 42.0447 39.2
## 2 Snowfall (in.) 21.5351 25.8
## 3 Precipitation Days 74.1000 102
## 4 Sunny Days 197.0000 205
## 5 Avg. July High 87.4170 86.1
## 6 Avg. Jan. Low 23.9660 22.6
## 7 Comfort Index (higher=better) 47.0000 54
## 8 UV Index 4.0000 4.3
## 9 Elevation ft. 457.0000 1,443
# Piped version: select all tables, keep the second, convert it to a data frame.
sterling_climate %>%
  html_nodes(css = "table") %>%
  # `[[` replaces nth(): nth() is a dplyr function and dplyr is never attached
  # in this tutorial, so it is not available with only rvest loaded.
  .[[2]] %>%
  html_table(header = TRUE)
## CLIMATE Sterling, Virginia United States
## 1 Rainfall (in.) 42.0447 39.2
## 2 Snowfall (in.) 21.5351 25.8
## 3 Precipitation Days 74.1000 102
## 4 Sunny Days 197.0000 205
## 5 Avg. July High 87.4170 86.1
## 6 Avg. Jan. Low 23.9660 22.6
## 7 Comfort Index (higher=better) 47.0000 54
## 8 UV Index 4.0000 4.3
## 9 Elevation ft. 457.0000 1,443