2

Im sorry if this is a brainf*rt question - its probably a simple error handling. This code breaks when one of the variables hits a blank (in this case in the 'num_views' variable) - Is there a way to return an 'NA' for any blank values? I would be so grateful for any advice

The error response is: Error: All columns in a tibble must be vectors. Column num_views is a function.

# Custom functions
parse_listing <- function(listing){
  
  # Review content
  address <- listings %>% html_nodes(xpath = '//p[@data-testid="listing-description"]') %>% html_text2()
  link <- listings %>% html_nodes(xpath = '//div[@data-testid="search-result"]/div/div/a[2]') %>% html_attr('href') %>% paste("https://www.zoopla.co.uk", ., sep="")
  prop_type <- listings %>% html_nodes(xpath = '//h2[@data-testid="listing-title"]') %>% html_text2()
  price <- listings %>% html_nodes(xpath = '//div[@data-testid="listing-price"]/p[@size="6"]') %>% html_text2() %>% str_remove_all("[£,]")
  est_agent <- listings %>% html_nodes(xpath = '//div[@data-testid="search-result"]//a/img') %>% html_attr('alt') %>% str_remove('Marketed by ')
  date_listed <- listings %>% html_nodes(xpath = '//span[@data-testid="date-published"]') %>% html_text2() %>% str_remove('Listed on ')  %>% dmy()
  num_views <- possibly(listings %>% html_nodes(xpath = '//span[@data-testid="number-of-views"]') %>% html_text2() %>% str_remove(' views'), otherwise = NULL)
  
  tibble(address, link, prop_type, price, est_agent, date_listed, num_views)
}

# Script
link <- 'https://www.zoopla.co.uk/for-sale/property/wd3/?page_size=25&q=wd3&radius=0&results_sort=most_popular&search_source=refine&pn=7'
page <- read_html(link)
listings <- page %>% html_nodes(xpath = '//div[@data-testid="search-result"]')

wd3p7 <- map_dfr(listings, parse_listing)
Robert Chestnutt
  • 302
  • 3
  • 13
  • 1
    We may use `possibly/safely` wrapped on the `parse_listing` i.e. `p_parse_listing <- possibly(parse_listing, otherwise = NA)` and then use `map_dfr(listing, p_parse_lsting)` – akrun Feb 22 '22 at 16:57
  • Your input argument to `parse_listing` is `listing` whereas within the function, it uses `listings`. Please do the correction – akrun Feb 22 '22 at 17:02

1 Answers1

2

Wrap with a tryCatch or possibly/safely (from purrr) to return the desired value when there is an error

library(purrr)
library(rvest)
library(dplyr)
library(lubridate)
 parse_listing <- function(listing){
  
    # Review content
    address <- possibly(function(x)  {
                   x %>%
                    html_nodes(xpath = '//p[@data-testid="listing-description"]') %>%
                      html_text2()
              }, otherwise = NA_character_)(listing)
              
   link <- possibly(function(x) {
        x %>% 
          html_nodes(xpath = '//div[@data-testid="search-result"]/div/div/a[2]') %>%
           html_attr('href') %>% 
           paste("https://www.zoopla.co.uk", ., sep="")
           }, otherwise = NA_character_)(listing)
           
   prop_type <- possibly(function(x) {
        x %>% 
         html_nodes(xpath = '//h2[@data-testid="listing-title"]') %>% 
         html_text2()
         }, otherwise = NA_character_)(listing)
         
   price <- possibly(function(x) {
     x %>% 
      html_nodes(xpath = '//div[@data-testid="listing-price"]/p[@size="6"]') %>%
       html_text2() %>% 
       str_remove_all("[£,]")
        }, otherwise = NA_character_)(listing)
   est_agent <- possibly(function(x) {
    x %>% 
     html_nodes(xpath = '//div[@data-testid="search-result"]//a/img') %>%
      html_attr('alt') %>% 
      str_remove('Marketed by ')
      }, otherwise = NA_character_)(listing)  
   date_listed <- possibly(function(x) {
      x %>% 
       html_nodes(xpath = '//span[@data-testid="date-published"]') %>% 
       html_text2() %>% 
       str_remove('Listed on ')  %>% 
       dmy()
         }, otherwise = NA_character_)(listing)  
   num_views <- possibly(function(x) {
      x %>% 
       html_nodes(xpath = '//span[@data-testid="number-of-views"]') %>% 
       html_text2() %>% 
       str_remove(' views')
       }, otherwise = NA_character_)(listing)   
   type.convert(as_tibble(do.call(qpcR:::cbind.na, dplyr::lst(address, link, prop_type, price, est_agent, date_listed, num_views))), as.is = TRUE)
   
    }

-testing

link <- 'https://www.zoopla.co.uk/for-sale/property/wd3/?page_size=25&q=wd3&radius=0&results_sort=most_popular&search_source=refine&pn=7'
page <- read_html(link)
listings <- page %>% html_nodes(xpath = '//div[@data-testid="search-result"]')
 wd3p7 <- map_dfr(setNames(listings, listings), parse_listing, .id = "listing")

-output

> wd3p7
# A tibble: 625 × 8
   listing                                         address               link                prop_type     price est_agent        date_listed num_views
   <chr>                                           <chr>                 <chr>               <chr>         <int> <chr>                  <int>     <int>
 1 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 2 bed flat … 400000 Gibbs Gillespie…       19039        40
 2 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed prope… 315000 Trend & Thomas,…       19033        34
 3 "<div data-testid=\"search-result\" class=\"ea… Springwell Lane, Ric… https://www.zoopla… 2 bed flat … 375000 Purplebricks, H…       19044        32
 4 "<div data-testid=\"search-result\" class=\"ea… Rectory Road, Rickma… https://www.zoopla… 1 bed flat … 315000 Trend & Thomas,…       18897        30
 5 "<div data-testid=\"search-result\" class=\"ea… Penn House, 30 High … https://www.zoopla… Studio for … 270000 Gibbs Gillespie…       18982        25
 6 "<div data-testid=\"search-result\" class=\"ea… The Forge, Bury Lane… https://www.zoopla… 2 bed flat … 425000 Robsons, WD3           19005        25
 7 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed prope… 299950 Trend & Thomas,…       18792        26
 8 "<div data-testid=\"search-result\" class=\"ea… The Forge, Bury Lane… https://www.zoopla… 2 bed flat … 450000 Savills - Rickm…       18960        19
 9 "<div data-testid=\"search-result\" class=\"ea… Homestead Road, Rick… https://www.zoopla… 1 bed flat … 279000 Trend & Thomas,…       18654        18
10 "<div data-testid=\"search-result\" class=\"ea… High Street, Rickman… https://www.zoopla… 1 bed flat … 270000 Trend & Thomas,…       18463        18
# … with 615 more rows
akrun
  • 874,273
  • 37
  • 540
  • 662
  • Thanks Akrun, it still doesnt work :( I think what I need to do is run the code by listing. There are 25 listings per page, and I think RVest is scraping all the variables individually and then column binding them. Then this serves to break it as there should be 25 on each page, and 25 values for each variable. However, in this case there is only 17 values in the `num_views` variable. This appears to be where the problem lies. What I probably need is to loop through each listing on each page, then loop through the pages. Thanks for your help – Robert Chestnutt Feb 23 '22 at 12:51
  • 1
    @RobertChestnutt thanks. I updated the post. YOu can call each of them with `possibly`. Also, the `num_views` length is less than the others. So, can use `cbind.na` to pad `NA` at the end before converting to `tibble` – akrun Feb 23 '22 at 18:47