Issue
This Content is from Stack Overflow. Question asked by bandcar
I have two tables of different lengths that I would like to scrape and put in the same data frame. The first only has four columns while the second has 7.
I would like for the table to show NA for nodes that are not present on that page. So in the first photo, it would show NA for the columns adultWorldChamp, M1WorldChamp, and M2WorldChamp. I’m just not really sure how to do this. Any guidance?
library(rvest); library(tidyverse)
library(RSelenium); library(netstat)
# Wayback Machine calendar URLs for the two ranking pages to scrape.
links2 = c("https://web.archive.org/web/20220000000000*/https://www.bjjcompsystem.com/tournaments/1869/categories/2053146",
           "https://web.archive.org/web/20220000000000*/https://www.bjjcompsystem.com/tournaments/1869/categories/2053225")

# Start server
# The port comes from netstat's free_port(), so it is not hard-coded.
remote_driver = rsDriver(browser = 'firefox',
                         # Spelled out: `F` is an ordinary variable that can
                         # be reassigned, `FALSE` is a reserved word.
                         verbose = FALSE,
                         port = free_port())

# Client object used for all browser interaction below.
rd = remote_driver$client
rd$open()
rd$navigate('https://web.archive.org/web/20220000000000*/https://www.bjjcompsystem.com/tournaments/1869/categories/2053147&')
rd$maxWindowSize()
## Empty data frame that accumulates the rows scraped from every page
all.ranks = data.frame()

# Start scraping! ----
# Iterate over every entry of links2 rather than a hard-coded 1:2, so the loop
# stays correct when links are added or removed.
for (i in seq_along(links2)) {
  rd$navigate(links2[i])
  Sys.sleep(10)  # fixed pause before looking for the capture link

  # Click the last link inside the element with class "captures-range-info".
  date = rd$findElement(using = 'css', '.captures-range-info a:last-of-type')
  date$clickElement()
  Sys.sleep(10)  # fixed pause before reading the page source

  # Get pg source to read html data
  html = read_html(rd$getPageSource()[[1]])

  # One column per selector; each selector yields a character vector.
  # NOTE(review): data.frame() needs all vectors to have compatible lengths.
  # A selector that matches nothing on a page gives character(0), which
  # presumably makes this call fail on pages that lack those columns.
  # NOTE(review): `name` and `gym` use the same XPath, and `adultWorldChamp`
  # and `M1WorldChamp` use the same CSS selector, so each pair holds identical
  # values. This looks like a copy-paste slip; confirm the intended selectors
  # against the page markup.
  rank = data.frame(
    # NOTE(review): 'prioriry' is kept as written; presumably it mirrors the
    # class name used by the site - verify before "correcting" the spelling.
    rank = html %>%
      html_nodes('.prioriry-number') %>%
      html_text(),
    name = html %>%
      html_elements(xpath = '//tbody//th') %>%
      html_text(),
    adultWorldChamp = html %>%
      html_nodes('td:nth-child(4)') %>%
      html_text(),
    M1WorldChamp = html %>%
      html_nodes('td:nth-child(4)') %>%
      html_text(),
    M2WorldChamp = html %>%
      html_nodes('td:nth-child(5)') %>%
      html_text(),
    gym = html %>%
      html_elements(xpath = '//tbody//th') %>%
      html_text(),
    grand_slam_pts = html %>%
      html_nodes('td:nth-child(3)') %>%
      html_text(),
    overall_pts = html %>%
      html_nodes('.text-center+ td') %>%
      html_text())

  # Append this page's rows to the running result.
  all.ranks = rbind(all.ranks, rank)
}
Solution
This can be achieved with bind_rows() from dplyr, which matches columns by name and fills any column missing from one table with NA.
library(tidyverse)
library(rvest)
library(magrittr)
# First page: take the first table found and give it four column names.
df_1 <- "https://www.bjjcompsystem.com/tournaments/1869/categories/2053147&" %>%
  read_html() %>%
  html_table() %>%       # list with one tibble per <table> on the page
  getElement(1) %>%      # keep the first table
  set_colnames(c("name", "team", "grand_slam", "overall")) %>%
  na.omit()              # drop rows that contain NA

# Second page: same steps, but this table has three extra columns.
df_2 <- "https://www.bjjcompsystem.com/tournaments/1869/categories/2053225" %>%
  read_html() %>%
  html_table() %>%
  getElement(1) %>%
  set_colnames(c("name", "team", "adult", "m1", "m2", "grand_slam", "overall")) %>%
  na.omit()

# Stack the two tables. bind_rows() matches columns by name, so the columns
# that exist only in df_2 (adult, m1, m2) are filled with NA for df_1's rows.
bind_rows(df_1, df_2)
# A tibble: 55 × 7
name team grand_slam overall adult m1 m2
<chr> <chr> <int> <int> <chr> <chr> <chr>
1 "1\nMark Luebking" CheckMat 108 111 NA NA NA
2 "2\nLio Alexander Duarte" Brazilian To… 36 49 NA NA NA
3 "3\nOscar Alberto Borrego" Patrick Alme… 12 18 NA NA NA
4 "4\nLuis Alberto Ogaz Varela" Soul Fighter… 0 108 NA NA NA
5 "5\nGeorge Christopher A. Baron" Aloisio Silv… 0 4 NA NA NA
6 "6\nOmar Eduardo Alfaro Solis" LEAD BJJ 0 0 NA NA NA
7 "7\nLucas John Wagner" Alliance 0 0 NA NA NA
8 "8\nFrancisco J. O'Ryan Lesser" Cohab Chile 0 0 NA NA NA
9 "9\nBryan Nguyen" Logic 0 0 NA NA NA
10 "1\nJoe Hisataka Frederick Chiaki Scovel" SF Empire Ji… 189 189 No No Yes
11 "2\nDiego de Araujo Saraiva" Nova União 224 809 No No No
12 "3\nDavid Dimopoulos Said" Alliance Int… 191 151 No No No
13 "4\nOsvaldo Augusto Honorio Moizinho" Ares BJJ 57 137 No No No
14 "5\nDiego Asenjo de Paula" Fabin Rosa B… 47 269 No No No
15 "6\nJohnson Clay Telford" G13 BJJ USA 36 225 No No No
# … with 40 more rows
This question was asked on Stack Overflow by bandcar and answered by Tom Hoel. It is licensed under the terms of CC BY-SA 2.5 - CC BY-SA 3.0 - CC BY-SA 4.0.