
I am trying to use rvest to web-scrape the NASDAQ closing prices for the last 3 months so I can play around with the data.

The problem is that I can't seem to find the correct XPath for it to return the table. I've tried quite a few, using Chrome's 'Inspect Element' as well as the 'SelectorGadget' plug-in for Chrome to find XPaths.

It seems most people have done this with Python, but I am much more comfortable in R, and specifically with rvest for web scraping, so I'm hoping I'm not alone!

I've posted my code below. I believe the problem is in identifying the XPath. Here is an example of one of the web pages: http://finance.yahoo.com/q/hp?s=CSV

Once I get one page to work, I hope to put it in a loop, which is below my problem code.

Thank you!

library("rvest")
library("data.table")
library("xlsx")


#Problem Code

company <- 'CSV'
url <- paste0("http://finance.yahoo.com/q/hp?s=", company)
page <- html(url)  # read_html() in newer rvest versions
select_table <- '//table'  # this is the line I think is incorrect
fnames <- html_nodes(page, xpath = select_table) %>% html_table(fill = TRUE)
STOCK <- fnames[[1]]
STOCKS <- rbind(STOCK, STOCKS)  # note: STOCKS must already exist (see the loop below)



#---------------------------------------------------------------------
#Loop for use later

companylist <- read.csv('companylist.csv')  # a list of all company tickers on the NASDAQ
STOCKS <- data.frame(Date=character(), Open=character(), High=character(), Low=character(),
                     Close=character(), Volume=character(), AdjClose=character())
for (i in seq_len(nrow(companylist))) {
  company <- companylist[i, 1]
  url <- paste0("http://finance.yahoo.com/q/hp?s=", company)
  page <- html(url)
  select_table <- '//*[@id="yfncsumtab"]/tbody/tr[2]/td[1]/table[4]'
  fnames <- html_nodes(page, xpath = select_table) %>% html_table(fill = TRUE)
  STOCK <- fnames[[1]]
  STOCKS <- rbind(STOCK, STOCKS)
}
View(STOCKS)
bpheazye
If your goal is only to have the prices, have a look at the `quantmod` package, which allows you to request a lot of data. – etienne Nov 02 '15 at 17:01
@etienne That is exactly what I was looking for. Wish I knew about that package before! Thanks. – bpheazye Nov 02 '15 at 17:18
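For reference, a minimal sketch of the `quantmod` route suggested in the comment above; the 90-day window for "the last 3 months" and the Yahoo source are assumptions on my part, not from the thread:

library(quantmod)

# fetch roughly the last 3 months of daily data for one ticker;
# auto.assign = FALSE makes getSymbols() return the xts object directly
prices <- getSymbols("CSV", src = "yahoo",
                     from = Sys.Date() - 90, to = Sys.Date(),
                     auto.assign = FALSE)
head(prices)   # Open, High, Low, Close, Volume, Adjusted columns
Cl(prices)     # just the closing prices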

1 Answer


Do you want to grab stock prices?

https://gist.github.com/jaehyeon-kim/356cf62b61248193db25#file-downloadstockdata

# assumes codes are known beforehand
codes <- c("ABT", "ABBV", "ACE", "ACN", "ACT", "ADBE", "ADT", "AES", "AET",
           "AFL", "AMG", "A", "GAS", "APD", "ARG", "AKAM", "AA")
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:",
               codes, "&output=csv")
paths <- paste0(codes, ".csv")
# only fetch files that are not already present in the working directory
missing <- !(paths %in% dir("."))
missing

# simple error handling in case a file doesn't exist
downloadFile <- function(url, path, ...) {
      # remove the file if it already exists
      if(file.exists(path)) file.remove(path)
      # download the file
      tryCatch(
            download.file(url, path, ...), error = function(c) {
                  # remove the file if there was an error
                  if(file.exists(path)) file.remove(path)
                  # create the error message from the file name
                  c$message <- paste(basename(path), "failed")
                  message(c$message)
            }
      )
}
# Map is a wrapper of mapply
Map(downloadFile, urls[missing], paths[missing])

You can try this as well...

library(knitr)
library(lubridate)
library(stringr)
library(plyr)
library(dplyr)

The script begins by creating a folder to save the data files.


# create data folder
dataDir <- paste0("data","_","2014-11-20-Download-Stock-Data-1")
if(file.exists(dataDir)) { 
      unlink(dataDir, recursive = TRUE)
      dir.create(dataDir)
} else {
      dir.create(dataDir)
}

After creating the URLs and file paths, the files are downloaded using the `Map` function, a wrapper of `mapply`. Note that, in case the download breaks with an error (e.g. when a file doesn't exist), `download.file` is wrapped in another function that includes an error handler (`tryCatch`).


# assumes codes are known beforehand
codes <- c("MSFT", "TCHC") # codes <- c("MSFT", "1234") for testing
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:",
               codes,"&output=csv")
paths <- paste0(dataDir,"/",codes,".csv") # back slash on windows (\\)

# simple error handling in case a file doesn't exist
downloadFile <- function(url, path, ...) {
      # remove file if exists already
      if(file.exists(path)) file.remove(path)
      # download file
      tryCatch(            
            download.file(url, path, ...), error = function(c) {
                  # remove file if error
                  if(file.exists(path)) file.remove(path)
                  # create the error message from the file name
                  c$message <- paste(basename(path), "failed")
                  message(c$message)
            }
      )
}
# wrapper of mapply
Map(downloadFile, urls, paths)
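As a quick check of the error path (using the invalid ticker from the testing comment in the code above; the output is hypothetical), the handler removes any partial file and prints a message instead of stopping the run:

# hypothetical test: "1234" is not a valid NASDAQ ticker, so
# download.file() errors and the tryCatch() handler runs instead
badUrl  <- "http://www.google.com/finance/historical?q=NASDAQ:1234&output=csv"
badPath <- paste0(dataDir, "/1234.csv")
downloadFile(badUrl, badPath)
# expected message: "1234.csv failed", with no stray 1234.csv left behind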


Finally, the files are read back using `llply` and combined with `rbind_all`. Note that, because the merged data contains records for multiple stocks, a `Code` column is created from each file name.



# read all csv files and merge
files <- dir(dataDir, full.names = TRUE)
dataList <- llply(files, function(file){
      data <- read.csv(file, stringsAsFactors = FALSE)
      # get the 4-letter code from the file path
      pattern <- "/[A-Z][A-Z][A-Z][A-Z]"
      code <- substr(str_extract(file, pattern), 2, nchar(str_extract(file, pattern)))
      # the first column's name is garbled (a byte-order mark), so rename all columns
      names(data) <- c("Date","Open","High","Low","Close","Volume")
      data$Date <- dmy(data$Date)
      data$Open <- as.numeric(data$Open)
      data$High <- as.numeric(data$High)
      data$Low <- as.numeric(data$Low)
      data$Close <- as.numeric(data$Close)
      data$Volume <- as.integer(data$Volume)
      data$Code <- code
      data
}, .progress = "text")

data <- rbind_all(dataList)
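To tie this back to the original question, the merged data frame can then be cut down to the last three months by filtering on the `Date` column built above; a small sketch, with the 90-day window as my assumption:

# keep only rows from roughly the last 3 months (assumes `data` from above;
# Date is already a Date class thanks to dmy())
recent <- data %>% filter(Date >= Sys.Date() - 90)
head(recent)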
ASH
Any idea how I can add to this code to select a specific range of dates? The website has the ability to choose dates, but I'm not sure how to alter that via code. Thanks for your help! – bpheazye Dec 07 '15 at 04:41
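On the date-range question in the last comment: the old Google Finance CSV endpoint also accepted `startdate` and `enddate` query parameters. This is a sketch from memory of that endpoint, not something confirmed in the thread, so treat the parameter names and date format as assumptions:

# hedged sketch: build URLs with an explicit date range; the startdate/
# enddate parameters are assumed from the old Google Finance endpoint
from <- format(Sys.Date() - 90, "%b %d, %Y")   # e.g. "Sep 08, 2015"
to   <- format(Sys.Date(), "%b %d, %Y")
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:", codes,
               "&startdate=", URLencode(from, reserved = TRUE),
               "&enddate=",   URLencode(to, reserved = TRUE),
               "&output=csv")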