[SOLVED] Scrape tables of unequal lengths and fill with NA when html node is not present in R

Issue

This Content is from Stack Overflow. Question asked by bandcar

I have two tables of different lengths that I would like to scrape and put in the same data frame. The first only has four columns while the second has 7.

I would like for the table to show NA for nodes that are not present on that page. So in the first photo, it would show NA for the columns adultWorldChamp, M1WorldChamp, and M2WorldChamp. I’m just not really sure how to do this. Any guidance?

library(rvest); library(tidyverse)
library(RSelenium); library(netstat)

# web.archive.org capture-listing URLs (note the `*` in place of a timestamp)
# for the two category pages that the loop below visits.
links2 = c("https://web.archive.org/web/20220000000000*/https://www.bjjcompsystem.com/tournaments/1869/categories/2053146",
"https://web.archive.org/web/20220000000000*/https://www.bjjcompsystem.com/tournaments/1869/categories/2053225")

# Start server
# rsDriver() starts a Selenium server and a Firefox client; free_port()
# (netstat) supplies the port so the call does not depend on a fixed one.
remote_driver = rsDriver(browser = 'firefox',
                         # FALSE spelled out: `F` is an ordinary variable that can be reassigned
                         verbose = FALSE,
                         port = free_port())
rd = remote_driver$client
# NOTE(review): rsDriver() may already have opened the client, in which case
# this starts a second browser session -- confirm against the RSelenium docs.
rd$open()
# Initial navigation before the loop, then maximise the browser window.
rd$navigate('https://web.archive.org/web/20220000000000*/https://www.bjjcompsystem.com/tournaments/1869/categories/2053147&')
rd$maxWindowSize()

## Empty data frame that accumulates the rows scraped from every page
all.ranks = data.frame()

# Start scraping! ----
# Loop over every entry of links2 instead of a hard-coded 1:2, so the loop
# stays correct if links are added or removed (and does nothing when empty).
for (i in seq_along(links2)){
  rd$navigate(links2[i])
  Sys.sleep(10)  # fixed wait for the page to finish loading
  # Click the last link inside the captures-range element to open that capture.
  date = rd$findElement(using = 'css', '.captures-range-info a:last-of-type')
  date$clickElement()
  Sys.sleep(10)  # fixed wait for the capture to finish loading
  
  # Get pg source to read html data
  html = read_html(rd$getPageSource()[[1]])

  # One column per selector. NOTE(review): data.frame() stops with
  # "arguments imply differing number of rows" when a selector matches
  # nothing on a page while the others match rows -- this is the failure
  # the question is about; padding with NA is not attempted here.
  rank = data.frame(rank = html %>%
                      html_nodes('.prioriry-number') %>%
                      html_text(),
                    name = html %>%
                      html_elements(xpath = '//tbody//th') %>%
                      html_text(),
                    adultWorldChamp = html %>%
                      html_nodes('td:nth-child(4)') %>%
                      html_text(),
                    # NOTE(review): same selector as adultWorldChamp above --
                    # possibly a copy-paste slip; verify against the page.
                    M1WorldChamp = html %>%
                      html_nodes('td:nth-child(4)') %>%
                      html_text() ,
                    M2WorldChamp = html %>%
                      html_nodes('td:nth-child(5)') %>%
                      html_text(),
                    # NOTE(review): same XPath as `name` above, so this column
                    # duplicates it -- verify the intended selector.
                    gym = html %>%
                      html_elements(xpath = '//tbody//th') %>%
                      html_text(),
                    grand_slam_pts = html %>%
                      html_nodes('td:nth-child(3)') %>%
                      html_text(),
                    overall_pts = html %>%
                      html_nodes('.text-center+ td') %>%
                      html_text())

  # Append this page's rows to the running result.
  all.ranks = rbind(all.ranks, rank)
}

Table with only four columns

Table with 7 columns



Solution

This can be achieved with bind_rows() from dplyr, which fills columns that are missing from one of the tables with NA:

library(tidyverse)
library(rvest)
library(magrittr)

# First category page: its first HTML table has four columns.
df_1 <- "https://www.bjjcompsystem.com/tournaments/1869/categories/2053147&" %>% 
  read_html() %>% 
  html_table() %>%   # list with one tibble per <table> on the page
  getElement(1) %>%  # keep the first table
  set_colnames(c("name", "team", "grand_slam", "overall")) %>% 
  na.omit()          # drop rows that contain any NA

# Second category page: same layout plus three extra columns
# (named adult, m1, m2 here).
df_2 <- "https://www.bjjcompsystem.com/tournaments/1869/categories/2053225" %>% 
  read_html() %>% 
  html_table() %>% 
  getElement(1) %>% 
  set_colnames(c("name", "team", "adult", "m1", "m2", "grand_slam", "overall")) %>% 
  na.omit()

# Stack the two tables. bind_rows() matches columns by name and fills the
# columns that df_1 lacks (adult, m1, m2) with NA for df_1's rows.
bind_rows(df_1, df_2)

# A tibble: 55 × 7
   name                                      team          grand_slam overall adult m1    m2   
   <chr>                                     <chr>              <int>   <int> <chr> <chr> <chr>
 1 "1\nMark Luebking"                        CheckMat             108     111 NA    NA    NA   
 2 "2\nLio Alexander Duarte"                 Brazilian To…         36      49 NA    NA    NA   
 3 "3\nOscar Alberto Borrego"                Patrick Alme…         12      18 NA    NA    NA   
 4 "4\nLuis Alberto Ogaz Varela"             Soul Fighter…          0     108 NA    NA    NA   
 5 "5\nGeorge Christopher A. Baron"          Aloisio Silv…          0       4 NA    NA    NA   
 6 "6\nOmar Eduardo Alfaro Solis"            LEAD BJJ               0       0 NA    NA    NA   
 7 "7\nLucas John Wagner"                    Alliance               0       0 NA    NA    NA   
 8 "8\nFrancisco J. O'Ryan Lesser"           Cohab Chile            0       0 NA    NA    NA   
 9 "9\nBryan Nguyen"                         Logic                  0       0 NA    NA    NA   
10 "1\nJoe Hisataka Frederick Chiaki Scovel" SF Empire Ji…        189     189 No    No    Yes  
11 "2\nDiego de Araujo Saraiva"              Nova União           224     809 No    No    No   
12 "3\nDavid Dimopoulos Said"                Alliance Int…        191     151 No    No    No   
13 "4\nOsvaldo Augusto Honorio Moizinho"     Ares BJJ              57     137 No    No    No   
14 "5\nDiego Asenjo de Paula"                Fabin Rosa B…         47     269 No    No    No   
15 "6\nJohnson Clay Telford"                 G13 BJJ USA           36     225 No    No    No   
# … with 40 more rows


This question was asked on Stack Overflow by bandcar and answered by Tom Hoel. It is licensed under the terms of CC BY-SA 2.5, CC BY-SA 3.0, and CC BY-SA 4.0.

people found this article helpful. What about you?