0

enter image description here

enter image description here

# dput() output of the input data: a tibble (class "tbl_df") with 6 rows and
# two character columns, `Property` (listing name) and `Airbnb_link` (listing URL).
# NOTE(review): presumably this object is assigned to `Airbnb_link`, since the
# scraping code reads `Airbnb_link$Airbnb_link` -- confirm.
structure(list(Property = c("1B - Anantara", "1B - Mag540", "1B- Downtown Views", 
"1B- Tiara Tanzanite", "1B-Address JBR", "1B-Al Samar 1"), Airbnb_link = c("https://www.airbnb.co.in/rooms/552037226634913505?preview_for_ml=true&source_impression_id=p3_1654086364_RJjGWicrEoR%2FB%2Bgu", 
"https://www.airbnb.co.in/rooms/54045333?preview_for_ml=true&source_impression_id=p3_1644216409_ftDpMWrY34gbixtv", 
"https://www.airbnb.co.in/rooms/54360731?preview_for_ml=true&source_impression_id=p3_1649243904_EjWEoEoKTYpW1zaT", 
"https://www.airbnb.co.in/rooms/565630731118783569?preview_for_ml=true&source_impression_id=p3_1649245563_mMhnLLQhlqTS26sb", 
"https://www.airbnb.co.in/rooms/53245239?preview_for_ml=true&source_impression_id=p3_1644215345_i3xkL5TcGvenCy2j", 
"https://www.airbnb.co.in/rooms/582870857307525571?preview_for_ml=true&source_impression_id=p3_1649244680_HqkKrgHn26RnnioX"
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))

I run the same code every time, but sometimes it scrapes the complete data and sometimes it returns `character(0)` for every property, and I don't know why. I have shown two instances where it gives NA for all properties. At other times the same code returns a result for every property, and sometimes it returns results for some properties and leaves NA for the remaining ones.

# rvest parses the HTML and RSelenium drives the browser; dplyr is loaded as
# well, although no dplyr verb is used in this snippet.
library(rvest)
library(dplyr)
library(RSelenium)
# Start a Selenium server with a Chrome client on port 5234 and keep the
# client handle for the browser commands below.
rD <-  rsDriver(browser="chrome",port=5234L,chromever="105.0.5195.19")
remDr <- rD$client

# For each listing URL: load it in the browser, parse the current page source
# and store the text of every `span._s65ijh7` node in the Review_count column.
# NOTE(review): the page source is read right after navigate() with no wait or
# polling, so the span may not be rendered yet -- the likely reason the result
# is sometimes character(0); confirm by inspecting the captured page source.
# NOTE(review): sapply() returns a list instead of a character vector whenever
# the per-URL results do not all have length one (e.g. character(0)).
Airbnb_link$Review_count <- sapply(Airbnb_link$Airbnb_link,function(url) {
  remDr$navigate(url)
  remDr$getPageSource()[[1]] %>% 
    read_html() %>% 
    html_nodes("span._s65ijh7") %>% 
    html_text2()
})
r2evans
  • 141,215
  • 6
  • 77
  • 149
  • Can anyone help explain why this is happening? – shubham tiwari Sep 20 '22 at 18:28
  • How do you know that each individual page has fully loaded and rendered before your `getPageSource()` calls? For debugging I'd collect the page sources or `read_html()` outputs in a list to be able to check what's actually in there. But without a delay or some sort of wait and/or polling mechanism, it's surprising that you sometimes do get valid(?) results. I'd also be cautious about using apply instead of a simple for loop in such cases: even if it technically works (*if* it works), it kind of hints that it would be OK to run in parallel too (that would definitely not work with a single global web driver). – margusl Sep 21 '22 at 14:27

1 Answer

0

I ran the code below many times and obtained the same result each time:

library(rvest)
library(dplyr)
library(RSelenium)
# Start a Selenium server and a Chrome session on a randomised port
# (4444 plus a Poisson draw with mean 1000), so that a port still held by an
# earlier run is unlikely to be picked again.
port <- as.integer(4444L + rpois(n = 1, lambda = 1000))
rd <- rsDriver(chromever = "105.0.5195.52", browser = "chrome", port = port)

# rsDriver() has already opened the browser session behind rd$client, so no
# additional remDr$open() call is made here: it would start a second browser
# session and leave the first one orphaned.
# When scraping is finished, release the resources with
#   remDr$close(); rd$server$stop()
remDr <- rd$client

# The six Airbnb listing URLs to visit, as a plain character vector
# (one element per listing page).
Airbnb_link <- c("https://www.airbnb.co.in/rooms/552037226634913505?preview_for_ml=true&source_impression_id=p3_1654086364_RJjGWicrEoR%2FB%2Bgu", 
                 "https://www.airbnb.co.in/rooms/54045333?preview_for_ml=true&source_impression_id=p3_1644216409_ftDpMWrY34gbixtv", 
                 "https://www.airbnb.co.in/rooms/54360731?preview_for_ml=true&source_impression_id=p3_1649243904_EjWEoEoKTYpW1zaT", 
                 "https://www.airbnb.co.in/rooms/565630731118783569?preview_for_ml=true&source_impression_id=p3_1649245563_mMhnLLQhlqTS26sb", 
                 "https://www.airbnb.co.in/rooms/53245239?preview_for_ml=true&source_impression_id=p3_1644215345_i3xkL5TcGvenCy2j", 
                 "https://www.airbnb.co.in/rooms/582870857307525571?preview_for_ml=true&source_impression_id=p3_1649244680_HqkKrgHn26RnnioX")


# Visit every URL in `Airbnb_link` and collect, per page, the text of all
# nodes matching `node_css`. The result is `list_Val`, a list with one
# character vector per URL (character(0) when nothing was found in time).
#
# The pages may still be rendering when navigate() returns, so the page source
# is polled until the node shows up or `max_wait_sec` has elapsed, instead of
# sleeping for a fixed time and hoping the page is ready.
node_css <- "span._s65ijh7"
max_wait_sec <- 10   # upper bound on the wait for a single page
poll_sec <- 0.5      # pause between two looks at the page source

nb_Link <- length(Airbnb_link)
# Preallocate the result; seq_len() below also makes an empty URL vector a
# no-op rather than iterating over c(1, 0).
list_Val <- vector("list", nb_Link)

for (i in seq_len(nb_Link)) {
  print(i)
  remDr$navigate(Airbnb_link[i])

  deadline <- Sys.time() + max_wait_sec
  repeat {
    found <- remDr$getPageSource()[[1]] %>%
      read_html() %>%
      html_nodes(node_css) %>%
      html_text2()
    if (length(found) > 0 || Sys.time() >= deadline) {
      break
    }
    Sys.sleep(poll_sec)
  }

  if (length(found) == 0) {
    # Make the failure visible instead of silently storing an empty result.
    warning(
      "No '", node_css, "' node found within ", max_wait_sec,
      " s for link ", i, ": ", Airbnb_link[i],
      call. = FALSE
    )
  }
  list_Val[[i]] <- found
}
Emmanuel Hamel
  • 1,769
  • 7
  • 19