1

I've been working on a project where the goal is to take a two-column CSV of street addresses and zip codes, read it into R, then perform a Zillow query for each one (GetSearchResults, specifically), parse the output, and store the parsed output in a dataframe to be written to a CSV (and placed right next to the existing data).

caveat: I can only call one address/zip combo at a time through the zillow API, so anything that violates that is off the table immediately.

As of this point, I have about 85% of the work done. I have i) a bit of code that can, one-by-one, query those address/zip combos from a dataframe as well as ii) a tentative way of putting that input back into a dataframe,

library(ZillowR)
library(rvest)
library(dplyr)
library(DT)

# this commented section is what I would use instead of creating the dataframe manually below, just for clarity
# data1 = read.csv('Addresses.csv', header = F, colClasses = 'character')$V1
# data2 = read.csv('Addresses.csv', header = F, colClasses = 'character')$V2
# data = data.frame(street = data1, city.state = as.character(data2))
# per comments, should add a "stringsAsFactors = FALSE" in the dataframe part

data = data.frame(
    street = c('77 Huntington Ave',
             '85 Prospect St',
             '219 Lincoln St'),
    city.state = c(rep('01752', 3)))

get.zillowdata = function(df, address, city.state){
    require(ZillowR)
    set_zillow_web_service_id('API KEY')
    results = do.call(rbind, lapply(1:nrow(df), function(i){
        z = tryCatch({
        zdata = GetDeepSearchResults(address = df$street[i],
                  citystatezip = df$city.state[i],
                  zws_id = getOption("ZillowR-zws_id"),
                  url = "http://www.zillow.com/webservice/GetDeepSearchResults.htm")
        return(zdata)
    },

    error = function(cond) {
      message(paste("No Data Available:", df$street[i], df$city.state[i]))
      return(NA) # Choose a return value in case of error
    },

    warning = function(cond) {
      message(paste("Zdata caused a warning:", df$street[i], df$city.state[i]))
      return(NA) # Choose a return value in case of warning
    },
    # print processing message to screen
    finally = {
      message(paste("Processed Address:", df$street[i], df$city.state[i]))
      message(paste(i, "of", nrow(df), 'processed'))
      }
    )
    }))

if(nrow(results)==nrow(df)){
    results = cbind(df, results)

    print(paste('Original data had', nrow(df), 'rows. Returning a dataframe with', nrow(results),
    'rows. Returned dataframe has', sum(is.na(results$amount)), 'missing zdata values.'))

  return(results)
}
    else(print("Error: nrows(df) do not match nrows(zdata)"))
}

get.zillowdata(data)
` 

and also iii) a parser for the XMLnode response that you get when you perform a query through the Zillow API which picks out specific child values (zestimate, square footage, lot size, etc; whatever you specify)

library(ZillowR)
library(XML)
library(RCurl)

set_zillow_web_service_id('API KEY')
output123 = GetDeepSearchResults(address = 'STREET ADDRESS', citystatezip = '0ZIP CODE', zws_id = getOption("ZillowR-zws_id"), url = "http://www.zillow.com/webservice/GetSearchResults.htm")

results <- xmlToList(output123$response[["results"]])

getValRange <- function(x, hilo) {
  ifelse(hilo %in% unlist(dimnames(x)), x["text",hilo][[1]], NA)
}

out <- apply(results, MAR=2, function(property) {
  zpid <- property$zpid
  links <- unlist(property$links)
  address <- unlist(property$address)
  z <- property$zestimate
  zestdf <- list(
    amount=ifelse("text" %in% names(z$amount), z$amount$text, NA),
    lastupdated=z$"last-updated",
    valueChange=ifelse(length(z$valueChange)==0, NA, z$valueChange),
    valueLow=getValRange(z$valuationRange, "low"),
    valueHigh=getValRange(z$valuationRange, "high"),
    percentile=z$percentile)
  list(id=zpid, links, address, zestdf)
})

data <- as.data.frame(do.call(rbind, lapply(out, unlist)),
                      row.names=seq_len(length(out)))

But I'm a little stuck at this point. How should I put these together so that I can include the parsing at the end of the api call part and make sure that both of them get iterated over the full list of addresses/zips? My code right now isn't in any particular order, so feel free to move things around if you decide to tackle this, and if anyone needs additional information, I'm happy to clarify!

Thanks very much in advance.

Sean
  • 21
  • 4
  • I'm confused about what your question is. It sounds like you can make the API calls, and put the results in a data frame. And you can parse those results, to get the bits you want. First get the data, then parse the data. What's the issue? – Gregor Thomas Jan 26 '18 at 20:47
  • I just need some way of ordering the code I have so that everything gets looped correctly. Like, one loop should be: get an address/zip combo from the dataframe -> make an API call -> parse the output -> write parsed output to a dataframe -> grab the next address/zip combo -> etc. I'm just unclear on how to incorporate the two bits of code I have in such a way that the parsing bit gets incorporated into each loop. – Sean Jan 26 '18 at 21:08
  • I can't work on this more now, but I can take a look later. Could you add a few working inputs? Three rows is plenty, but I assume the `data` you provide now with `"ADDRESS1"` doesn't actually work. – Gregor Thomas Jan 26 '18 at 21:10
  • sure, can do; should be updated in the next couple minutes. and again, I wouldn't be constructing the dataframe manually. it's just for demonstration purposes that I have the "ADDRESS1:3" part in there. – Sean Jan 26 '18 at 21:16
  • one other caveat is that the address and zip code parts of the Zillow query MUST be in character vector form, as I learned the tedious way; hence the seemingly out-of-place code in the commented out section. – Sean Jan 26 '18 at 21:30
  • I haven't gotten the API to work for me (maybe it takes time for key to activate?), but essentially your problem is that you've worked so hard to make your functions take and return data frames. This is normally good practice at the macro level, since you can do only one API call at a time it makes no sense here - there's no vectorization to take advantage of. – Gregor Thomas Jan 27 '18 at 05:21
  • I'd recommend rewriting your functions into smaller pieces that take one thing at a time. Then you can write a wrapper that complete processes one row at a time. And then it's easy to iterate that function over your data frame with `apply` or `for` or whatever. – Gregor Thomas Jan 27 '18 at 05:22
  • NM - my problem was the factor/character thing. *Why would you give me factor data? Put a `stringsAsFactors = FALSE` in your `data.frame` call to fix it, don't just warn people about it!* – Gregor Thomas Jan 27 '18 at 05:25
  • I am having trouble with the XML. My current `xml2` package doesn't have `xmlToList` – Gregor Thomas Jan 27 '18 at 05:38
  • Thanks for all your input so far, I very much appreciate it. Definitely am still trying to do too much at once because I'm not an incredibly experienced programmer, but will try to figure out a way to make things simpler along the "one at a time" advice. And sorry about the factor bit, I had made a mental note to change that in my code several times but completely forgot :\ the XML part I think is also my bad; I was experimenting with a couple different libraries and accidentally left the "2" in there. xmlToList is a function in the regular "XML" library. will change that in the code above – Sean Jan 27 '18 at 16:05
  • also, if you still need a working API key, I'd be happy to email mine to you if yours still isn't usable. – Sean Jan 27 '18 at 16:10
  • Thanks, but I got the API working (it was just the character/factor issue). I'll take another look this evening and try use the `xml` package. – Gregor Thomas Jan 27 '18 at 16:27
  • hey, just wanted to check in briefly; were you able to get past the xmlToList part? – Sean Jan 28 '18 at 21:37
  • Sorry man, I can't tell what's going on. I can get data `z` for a single address, I can do `xmlToList` on the `$response`, but I can't even get started on your `apply` - I see no `zpid`, no `zestimate`.... – Gregor Thomas Jan 31 '18 at 03:20
  • @Sean, did you get this to work . Would help if you could share the code – Vaibhav Singh Sep 04 '18 at 05:23

1 Answers1

0

This is a semi simplified answer to your question(I mean I removed some of the information that api provides, such as high low estimates but you can add them back it following the logic for the other provided info in the code below). First loading in the required packages:

ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg)) 
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
#install and load the following packages 
packages <- c("ZillowR", "rjson", "httr", "XML")
ipak(packages)

Next set your Zillow API Key:

set_zillow_web_service_id('API KEY')

Next defining a dataset with only addresses, and then defining and adding columns of street address, and zip code to it, so we can use them later with Zillow api:

df <-data.frame("Address" = c("5435 Andrea Boulevard, Sacramento, CA 95842","8434 Walerga Road Apt 421, Antelope, CA 95843"
       ,"7152 Flanders Way, Sacramento, CA 95842","1076 Edmonton Drive, Sacramento, CA 95833"    
       ,"9906 Burline Street, Sacramento, CA 95827","3634 Sapphire Drive Apt 1, Auburn, CA 95602"),stringsAsFactors = FALSE)
df$StreetCity<-gsub("(.*),.*", "\\1", df$Address)
df$Street<-gsub("(.*),.*", "\\1", df$StreetCity)
df$zip<-substr(df$Address,nchar(df$Address)-4,nchar(df$Address))
df
#                                         Address                          StreetCity                    Street   zip
# 1   5435 Andrea Boulevard, Sacramento, CA 95842   5435 Andrea Boulevard, Sacramento     5435 Andrea Boulevard 95842
# 2 8434 Walerga Road Apt 421, Antelope, CA 95843 8434 Walerga Road Apt 421, Antelope 8434 Walerga Road Apt 421 95843
# 3       7152 Flanders Way, Sacramento, CA 95842       7152 Flanders Way, Sacramento         7152 Flanders Way 95842
# 4     1076 Edmonton Drive, Sacramento, CA 95833     1076 Edmonton Drive, Sacramento       1076 Edmonton Drive 95833
# 5     9906 Burline Street, Sacramento, CA 95827     9906 Burline Street, Sacramento       9906 Burline Street 95827
# 6   3634 Sapphire Drive Apt 1, Auburn, CA 95602   3634 Sapphire Drive Apt 1, Auburn 3634 Sapphire Drive Apt 1 95602

Now lets hit the API:

getValRange <- function(x, hilo) {
  ifelse(hilo %in% unlist(dimnames(x)), x["text",hilo][[1]], NA)
}

Out_df_All <-NULL
Out_df <-NULL

for (i in 1:nrow(df)){
  print(i)
  xml =GetSearchResults(address = as.character(df[i,]$Street), citystatezip = as.character(df[i,]$zip),rentzestimate = TRUE)
  if (!is.null(xml$response[["results"]])){
    results <- xmlToList(xml$response[["results"]])
    if (is.matrix(results)){
      out <- apply(results, MAR=2, function(property) {
        zpid <- property$zpid
        links <- unlist(property$links)["mapthishome"]
        address <- unlist(property$address)
        z <-property$zestimate
        zestdf <- list(
          zestimate=ifelse("text" %in% names(z$amount), z$amount$text, NA)
        ) 

        rz <- property$rentzestimate
        rentalzestdf <- list(
          rentzestimate=ifelse("text" %in% names(rz$amount), rz$amount$text, NA)
        ) 
        list(zpid=zpid, links, address, zestdf, rentalzestdf)
      }


      )
      data <- as.data.frame(do.call(rbind, lapply(out, unlist)[1]),row.names=seq_len(length(out)))
      data[is.na(data)] <- 0
      Out_df<-data.frame("Address"=df[i,"Address"],data)
      Out_df_All<-rbind(Out_df,Out_df_All)
    }

    else {
      data<-as.data.frame(do.call(rbind, lapply(xml$response[["results"]][1], unlist)[1]),row.names=seq_len(length(xml$response[["results"]][1])))
      data<-data[which(grepl("text.value",colnames(data)))]
      colnames(data)<-gsub("children.","",colnames(data))
      colnames(data)<-gsub(".text.value","",colnames(data))
      colnames(data)<-gsub("address.","",colnames(data))
      colnames(data)<-gsub("links.","",colnames(data))
      colnames(data)<-gsub(".amount","",colnames(data))
      data <- data[,c("zpid","mapthishome","street","zipcode","city","state","latitude","longitude")]
      data$zestimate <- ifelse("zestimate" %in% colnames(data), data$zestimate, NA)
      data$rentzestimate <- ifelse("rentzestimate" %in% colnames(data), data$rentzestimate, NA)
      data[is.na(data)] <- 0
      Out_df<-data.frame("Address"=df[i,"Address"],data)
      Out_df_All<-rbind(Out_df,Out_df_All)

    }
  }
}
View(Out_df_All)

enter image description here

Shirin Yavari
  • 626
  • 4
  • 6