- Adapted from R for data science by Garrett Grolemund and Hadley Wickham.
2016-03-02
library(rvest)
src <- html("http://en.wikipedia.org/wiki/Table_(information)")
node <- html_node(src, css = ".wikitable")
".wikitable" is a CSS selector which says: "grab nodes (aka elements) with a class of wikitable".html_table() converts a single <table> node to a data frame.html_table(node) #> First name Last name Age #> 1 Tinu Elejogun 14 #> 2 Blaszczyk Kostrzewski 25 #> 3 Lily McGarrett 16 #> 4 Olatunkboh Chijiaku 22 #> 5 Adrienne Anthoula 22 #> 6 Axelia Athanasios 22 #> 7 Jon-Kabat Zinn 22
html("http://en.wikipedia.org/wiki/Table_(information)") %>%
html_node(".wikitable") %>% html_table()
#> First name Last name Age
#> 1 Tinu Elejogun 14
#> 2 Blaszczyk Kostrzewski 25
#> 3 Lily McGarrett 16
#> 4 Olatunkboh Chijiaku 22
#> 5 Adrienne Anthoula 22
#> 6 Axelia Athanasios 22
#> 7 Jon-Kabat Zinn 22
Navigate this page and try the following:
Easy: Grab the table at the bottom of the page (hint: instead of grabbing a node by class with html_node(".class"), you can grab by id with html_node("#id"))
Medium: Grab the actual mean, max, and min temperature.
Hard: Grab the weather history graph and write the figure to disk (download.file() may be helpful here).
See here for a solution (thanks Hadley Wickham for the example)
<table> data?domain <- "http://www.sec.gov"
susp <- paste0(domain, "/litigation/suspensions.shtml")
hrefs <- html(susp) %>% html_nodes("p+ table a") %>% html_attr(name = "href")
tail(hrefs)
#> [1] "/litigation/suspensions/2016/34-76961.pdf"
#> [2] "/litigation/suspensions/2016/34-76961-o.pdf"
#> [3] "/litigation/suspensions/2016/34-76939.pdf"
#> [4] "/litigation/suspensions/2016/34-76939-o.pdf"
#> [5] "/litigation/suspensions/2016/34-76860.pdf"
#> [6] "/litigation/suspensions/2016/34-76860-o.pdf"
# download all the pdfs! hrefs <- hrefs[!is.na(hrefs)] pdfs <- paste0(domain, hrefs) mapply(download.file, pdfs, basename(pdfs))
Nativigate to Wikipedia's list of data structures use SelectorGadget + rvest to do the following:
See here for a solution.
html("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>%
html_node("table")
#> Error in eval(expr, envir, enclos): No matches
<table>?html("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>%
html_node("body") %>% as.character() %>% cat()
#> <body>
#> A Simple Table made with JavaScript
#> <p/>
#> <script><![CDATA[
#> function tableCreate(){
#> var body = document.body,
#> tbl = document.createElement('table');
#>
#> for(var i = 0; i < 3; i++){
#> var tr = tbl.insertRow();
#> for(var j = 0; j < 3; j++){
#> var td = tr.insertCell();
#> td.appendChild(document.createTextNode("Cell"));
#> }
#> }
#> body.appendChild(tbl);
#> }
#> tableCreate();
#> ]]></script>
#> </body>
rdom can construct the DOM:
library(rdom)
rdom("http://bl.ocks.org/cpsievert/raw/2a9fb8f504cd56e9e8e3/") %>%
html_node("table") %>% html_table()
X1 X2 X3 1 Cell Cell Cell 2 Cell Cell Cell 3 Cell Cell Cell
You can give rdom() CSS Selectors directly to avoid sending the entire DOM from phantomjs to R
rdom("http://www.techstars.com/companies/stats/", "table") %>%
html_table()
GET.library(httr)
response <- GET("https://api.github.com/users/hadley")
content(response)[c("name", "company")]
#> $name
#> [1] "Hadley Wickham"
#>
#> $company
#> [1] "RStudio"
POST, PUT, DELETE, etc…
response$header[1:3] #> $server #> [1] "GitHub.com" #> #> $date #> [1] "Wed, 02 Mar 2016 04:12:22 GMT" #> #> $`content-type` #> [1] "application/json; charset=utf-8"
XML is a markup language that looks very similar to HTML.
<mariokart>
<driver name="Bowser" occupation="Koopa">
<vehicle speed="55" weight="25"> Wario Bike </vehicle>
<vehicle speed="40" weight="67"> Piranha Prowler </vehicle>
</driver>
<driver name="Peach" occupation="Princess">
<vehicle speed="54" weight="29"> Royal Racer </vehicle>
<vehicle speed="50" weight="34"> Wild Wing </vehicle>
</driver>
</mariokart>
XML2R is a framework to simplify acquistion of tabular/relational XML.
library(XML2R)
obs <- XML2Obs("https://gist.githubusercontent.com/cpsievert/85e340814cb855a60dc4/raw/651b7626e34751c7485cff2d7ea3ea66413609b8/mariokart.xml")
table(names(obs))
#> #> mariokart//driver mariokart//driver//vehicle #> 2 4
obs #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "55" "25" " Wario Bike " #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "40" "67" " Piranha Prowler " #> #> $`mariokart//driver` #> name occupation #> [1,] "Bowser" "Koopa" #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "54" "29" " Royal Racer " #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "50" "34" " Wild Wing " #> #> $`mariokart//driver` #> name occupation #> [1,] "Peach" "Princess"
collapse_obs(obs) # group into table(s) by observational name/unit #> $`mariokart//driver` #> name occupation #> [1,] "Bowser" "Koopa" #> [2,] "Peach" "Princess" #> #> $`mariokart//driver//vehicle` #> speed weight XML_value #> [1,] "55" "25" " Wario Bike " #> [2,] "40" "67" " Piranha Prowler " #> [3,] "54" "29" " Royal Racer " #> [4,] "50" "34" " Wild Wing "
obs <- add_key(obs, parent = "mariokart//driver", recycle = "name") collapse_obs(obs) #> $`mariokart//driver` #> name occupation #> [1,] "Bowser" "Koopa" #> [2,] "Peach" "Princess" #> #> $`mariokart//driver//vehicle` #> speed weight XML_value name #> [1,] "55" "25" " Wario Bike " "Bowser" #> [2,] "40" "67" " Piranha Prowler " "Bowser" #> [3,] "54" "29" " Royal Racer " "Peach" #> [4,] "50" "34" " Wild Wing " "Peach"
Now (if I want) I can merge the tables into a single table…
tabs <- collapse_obs(obs) merge(tabs[[1]], tabs[[2]], by = "name") #> name occupation speed weight XML_value #> 1 Bowser Koopa 55 25 Wario Bike #> 2 Bowser Koopa 40 67 Piranha Prowler #> 3 Peach Princess 54 29 Royal Racer #> 4 Peach Princess 50 34 Wild Wing
[
{
"driver": "Bowser",
"occupation": "Koopa",
"vehicles": [
{
"model": "Wario Bike",
"speed": 55,
"weight": 25
},
{
"model": "Piranha Prowler",
"speed": 40,
"weight": 67
}
]
},
{
"driver": "Peach",
"occupation": "Princess",
"vehicles": [
{
"model": "Royal Racer",
"speed": 54,
"weight": 29
},
{
"model": "Wild Wing",
"speed": 50,
"weight": 34
}
]
}
]
library(jsonlite)
mario <- fromJSON("http://bit.ly/mario-json")
str(mario) # nested data.frames?!?
#> 'data.frame': 2 obs. of 3 variables:
#> $ driver : chr "Bowser" "Peach"
#> $ occupation: chr "Koopa" "Princess"
#> $ vehicles :List of 2
#> ..$ :'data.frame': 2 obs. of 3 variables:
#> .. ..$ model : chr "Wario Bike" "Piranha Prowler"
#> .. ..$ speed : int 55 40
#> .. ..$ weight: int 25 67
#> ..$ :'data.frame': 2 obs. of 3 variables:
#> .. ..$ model : chr "Royal Racer" "Wild Wing"
#> .. ..$ speed : int 54 50
#> .. ..$ weight: int 29 34
mario$driver #> [1] "Bowser" "Peach" mario$vehicles #> [[1]] #> model speed weight #> 1 Wario Bike 55 25 #> 2 Piranha Prowler 40 67 #> #> [[2]] #> model speed weight #> 1 Royal Racer 54 29 #> 2 Wild Wing 50 34
How do we get two tables (with a common id) like the XML example?
# this mapply statement is essentially equivalent to add_key
vehicles <- Map(function(x, y) cbind(x, driver = y),
mario$vehicles, mario$driver)
Reduce(rbind, vehicles)
#> model speed weight driver
#> 1 Wario Bike 55 25 Bowser
#> 2 Piranha Prowler 40 67 Bowser
#> 3 Royal Racer 54 29 Peach
#> 4 Wild Wing 50 34 Peach
mario[!grepl("vehicle", names(mario))]
#> driver occupation
#> 1 Bowser Koopa
#> 2 Peach Princess
# install dependencies and run first example (press ESC to quit)
if (!require("shiny")) install.packages("shiny")
if (!require("leaflet")) install.packages("leaflet")
runGitHub("rstudio/shiny-examples", subdir = "063-superzip-example")
library(shiny)
library(ggplot2)
ui <- fluidPage(
numericInput(
inputId = "size",
label = "Choose a point size",
value = 3, min = 1, max = 10
),
plotOutput("plotId")
)
server <- function(input, output) {
output$plotId <- renderPlot({
ggplot(mtcars, aes(wt, mpg)) +
geom_point(size = input$size)
})
}
shinyApp(ui, server)
ui <- fluidPage(
sidebarPanel(
selectInput(
inputId = "x", label = "Choose an x variable", choices = names(mtcars)
),
selectInput(
inputId = "y", label = "Choose an y variable", choices = names(mtcars)
)
),
mainPanel(
plotOutput("plotId")
)
)
server <- function(input, output) {
output$plotId <- renderPlot({
ggplot(mtcars, aes_string(input$x, input$y)) +
geom_point()
})
}
shinyApp(ui, server)
colour.