3

I'm trying to code a function in R (a statistical programming language) that would allow me to automate the calculation of a linear regression (`lm`).

The problem: the regression is fitted through the `step` function, so the coefficients it selects cannot be known in advance. I need to:

  1. Automate identifying the coefficients selected by the step function.

  2. Look up (VLOOKUP-style) and cross-multiply the estimates, i.e. the second column of the results shown by e.g. `View(OpenCoefs)`, with the last row (last day) of the respective columns of the original data frame `sp`.

The desired solution would be a function where I would just type `run()` and it would return the "y" for each regression, namely the forecast of the S&P 500 index for the following day (Open, Low, High, Close).

The code retrieves data from the Yahoo Finance website, so it's operational if you run it.

Here's the code.

# Download the daily S&P 500 (^GSPC) price history as CSV.
sp_url <- "http://ichart.finance.yahoo.com/table.csv?s=%5EGSPC&a=03&b=1&c=1940&d=03&e=1&f=2014&g=d&ignore=.csv"
sp <- read.csv(sp_url)

# The adjusted close is not used; drop the column.
sp[["Adj.Close"]] <- NULL

# Reverse the row order (presumably so rows run oldest to newest; confirm
# against the ordering the feed returns).
sp <- sp[nrow(sp):1, ]

sp <- as.data.frame(sp)


# Period-over-period growth rate of a numeric series: x[i] / x[i - 1] - 1.
#
# The first element has no predecessor and is NA. Inputs with fewer than two
# elements yield an all-NA vector of the same length.
growth_rate <- function(x) {
  n <- length(x)
  if (n < 2L) {
    return(rep(NA_real_, n))
  }
  c(NA_real_, x[-1L] / x[-n] - 1)
}

# Add a Gr_<column> growth-rate column for each price/volume column.
# Whole columns are computed at once instead of assigning into the data
# frame one row at a time, which copies the frame on every iteration.
for (col in c("Open", "High", "Low", "Close", "Volume")) {
  sp[[paste0("Gr_", col)]] <- growth_rate(sp[[col]])
}

# Row index of each observation, stored as a column of sp.
nRows_in_sp <- seq_len(nrow(sp))
sp[["nRows_in_sp"]] <- nRows_in_sp

# Trailing-window statistic over a numeric vector.
#
# x          Numeric vector, one value per row.
# fun        Summary function applied to each window (e.g. mean, sd).
# first_row  First row that receives a computed value.
# lookback   Number of rows before the current one included in the window;
#            each window is x[(i - lookback):i], i.e. lookback + 1 values.
# fill       Value written to rows 2 .. (first_row - 1).
#
# Row 1 is always NA. Returns a numeric vector the same length as x.
rolling_stat <- function(x, fun, first_row, lookback = 100L,
                         fill = NA_real_) {
  stopifnot(lookback >= 0L, first_row > lookback)
  n <- length(x)
  out <- rep(NA_real_, n)

  # Rows after the first but before the first computed window.
  warmup <- seq_len(min(n, first_row - 1L))[-1L]
  out[warmup] <- fill

  if (n >= first_row) {
    rows <- first_row:n
    out[rows] <- vapply(
      rows,
      function(i) fun(x[(i - lookback):i]),
      numeric(1)
    )
  }
  out
}

# Column order below (Open, Close, Low, High) fixes the order in which the
# derived columns are appended to sp.
price_cols <- c("Open", "Close", "Low", "High")

# Rolling mean of each price level: 0 for rows 2..1000, then the mean of the
# current row and the 100 rows before it. Every column, including
# Close_Rollin, is written under its exact name; a column name with a stray
# leading space would silently create a second column and leave the intended
# one NA.
for (col in price_cols) {
  sp[[paste0(col, "_Rollin")]] <-
    rolling_stat(sp[[col]], mean, first_row = 1001L, fill = 0)
}

# Rolling mean of each growth rate, with the same window and warm-up rule.
for (col in price_cols) {
  sp[[paste0(col, "_GR_Rollin")]] <-
    rolling_stat(sp[[paste0("Gr_", col)]], mean, first_row = 1001L, fill = 0)
}

# Rolling standard deviation of each price level: NA for the first 100 rows,
# then the sd of the current row and the 100 rows before it.
for (col in price_cols) {
  sp[[paste0(col, "_SD_Rollin")]] <-
    rolling_stat(sp[[col]], sd, first_row = 101L)
}


# Number of observations.
N <- nrow(sp)

# One-period lag: shift x down by one position, padding the front with NA.
# Returns a vector the same length as x (also for empty or length-1 input).
lag_one <- function(x) {
  c(NA, x)[seq_along(x)]
}

# Columns to lag, in the order their "<name>lag" columns are appended to sp.
lag_sources <- c(
  "Open", "High", "Low", "Close",
  "Gr_Open", "Gr_High", "Gr_Low", "Gr_Close", "Gr_Volume",
  "Open_GR_Rollin", "Low_GR_Rollin", "High_GR_Rollin", "Close_GR_Rollin",
  "Open_SD_Rollin", "Low_SD_Rollin", "High_SD_Rollin", "Close_SD_Rollin"
)

for (src in lag_sources) {
  lag_name <- paste0(src, "lag")
  lagged <- lag_one(sp[[src]])
  sp[[lag_name]] <- lagged
  # Also keep a free-standing vector under the same name: the regression
  # formulas later in this script refer to the lags by bare name without a
  # `data` argument.
  assign(lag_name, lagged)
}




# Lagged predictors offered to every stepwise regression.
lag_predictors <- c(
  "Openlag", "Lowlag", "Highlag", "Closelag",
  "Gr_Openlag", "Gr_Lowlag", "Gr_Highlag", "Gr_Closelag", "Gr_Volumelag",
  "Open_GR_Rollinlag", "Low_GR_Rollinlag",
  "High_GR_Rollinlag", "Close_GR_Rollinlag",
  "Open_SD_Rollinlag", "Low_SD_Rollinlag",
  "High_SD_Rollinlag", "Close_SD_Rollinlag"
)

# Regress `response` on all `predictors` and refine the model by stepwise
# selection in both directions.
#
# response    Name of the response column in `data`.
# predictors  Character vector of candidate predictor column names.
# data        Data frame holding the response and predictor columns.
#
# Rows with a missing response or predictor are removed up front. step()
# requires every candidate model to be fitted to the same rows; relying on
# lm()'s default NA handling can abort the search once dropping a term
# changes which rows are complete.
#
# The formula, the data and the step() call all live in this function so the
# refits performed by step() can resolve `model_data`.
#
# Returns the selected "lm" object.
fit_stepwise <- function(response, predictors, data) {
  model_data <- data[complete.cases(data[, c(response, predictors)]), ]
  full_formula <- reformulate(predictors, response = response)
  full_model <- lm(full_formula, data = model_data)
  step(full_model, direction = "both", test = "F")
}

# Keep the selected models so that fitted values and predict() are available
# without re-multiplying coefficients by hand.
OpenStep <- fit_stepwise("Open", lag_predictors, sp)
LowStep <- fit_stepwise("Low", lag_predictors, sp)
HighStep <- fit_stepwise("High", lag_predictors, sp)
CloseStep <- fit_stepwise("Close", lag_predictors, sp)

# Coefficient tables (estimate, std. error, t value, p value) of the
# selected models.
OpenCoefs <- coefficients(summary(OpenStep))
LowCoefs <- coefficients(summary(LowStep))
HighCoefs <- coefficients(summary(HighStep))
CloseCoefs <- coefficients(summary(CloseStep))


# Open each coefficient table in the interactive data viewer, one call per
# table.
View(OpenCoefs)

View(LowCoefs)

View(HighCoefs)

View(CloseCoefs)

# Inspect the full data frame with all derived columns as well.
View(sp)
Joshua Ulrich
  • 173,410
  • 32
  • 338
  • 418
Pedro9
  • 186
  • 1
  • 11
  • 1
    Look at the `predict` function. It will give what a model will evaluate (predict) for a given set of inputs. If you just want to predict for the last row, use `newdata=sp[nrow(sp),]`. – Brian Diggs Feb 04 '13 at 20:42

1 Answers1

6

Your code is so bad, I had to take pity on you. :) Here's a refactored version of your code:

library(quantmod)
# Fetch ^GSPC and return it as a value instead of assigning it into the
# workspace under its symbol name.
sp <- getSymbols("^GSPC", auto.assign = FALSE)

# Drop the adjusted column, then strip the "GSPC." prefix from the remaining
# column names.
sp$GSPC.Adjusted <- NULL
colnames(sp) <- sub("^GSPC\\.", "", colnames(sp))

# Discrete rate of change of each price/volume column.
sp$Gr_Open <- ROC(sp$Open, type = "discrete")
sp$Gr_High <- ROC(sp$High, type = "discrete")
sp$Gr_Low <- ROC(sp$Low, type = "discrete")
sp$Gr_Close <- ROC(sp$Close, type = "discrete")
sp$Gr_Volume <- ROC(sp$Volume, type = "discrete")

# Window length shared by every running statistic below.
N <- 100

# Running mean of the price levels.
sp$Open_Rollin <- runMean(sp$Open, n = N)
sp$High_Rollin <- runMean(sp$High, n = N)
sp$Low_Rollin <- runMean(sp$Low, n = N)
sp$Close_Rollin <- runMean(sp$Close, n = N)

# Running mean of the growth rates.
sp$Open_GR_Rollin <- runMean(sp$Gr_Open, n = N)
sp$High_GR_Rollin <- runMean(sp$Gr_High, n = N)
sp$Low_GR_Rollin <- runMean(sp$Gr_Low, n = N)
sp$Close_GR_Rollin <- runMean(sp$Gr_Close, n = N)

# Running standard deviation of the price levels.
sp$Open_SD_Rollin <- runSD(sp$Open, n = N)
sp$High_SD_Rollin <- runSD(sp$High, n = N)
sp$Low_SD_Rollin <- runSD(sp$Low, n = N)
sp$Close_SD_Rollin <- runSD(sp$Close, n = N)

# Lag every column by one period and tag the copies with a "lag" suffix.
spLag <- lag(sp, k = 1)
colnames(spLag) <- paste0(colnames(sp), "lag")

# Put current and lagged columns side by side, then drop every row that
# contains a missing value.
sp <- merge(sp, spLag)
sp <- na.omit(sp)

There's no need to answer your first question in order to answer your second question. You don't have to cross-multiply coefficients with data by hand; you can simply access the fitted values from the model. That requires that you preserve the model object, though...

# Full model for Open: lagged levels, lagged growth rates, lagged rolling
# growth means and lagged rolling standard deviations.
f <- Open ~
  Openlag + Lowlag + Highlag + Closelag +
  Gr_Openlag + Gr_Lowlag + Gr_Highlag + Gr_Closelag + Gr_Volumelag +
  Open_GR_Rollinlag + Low_GR_Rollinlag +
  High_GR_Rollinlag + Close_GR_Rollinlag +
  Open_SD_Rollinlag + Low_SD_Rollinlag +
  High_SD_Rollinlag + Close_SD_Rollinlag

# Open: fit the full model, then run stepwise selection in both directions.
OpenLM <- lm(formula = f, data = sp)
OpenStep <- step(object = OpenLM, direction = "both", test = "F")

# High, Low and Close reuse the Open model with only the response swapped.
HighLM <- update(OpenLM, formula. = High ~ .)
HighStep <- step(object = HighLM, direction = "both", test = "F")

LowLM <- update(OpenLM, formula. = Low ~ .)
LowStep <- step(object = LowLM, direction = "both", test = "F")

CloseLM <- update(OpenLM, formula. = Close ~ .)
CloseStep <- step(object = CloseLM, direction = "both", test = "F")

# Last fitted value of each selected model. fitted() returns in-sample
# values, so each number is the model's estimate for the last row of sp (the
# date printed), not a forecast for a later date; predict() with `newdata`
# would be needed for that.
tail(fitted(OpenStep), n = 1L)
# 2013-02-01 
#    1497.91 
tail(fitted(HighStep), n = 1L)
# 2013-02-01 
#    1504.02 
tail(fitted(LowStep), n = 1L)
# 2013-02-01 
#   1491.934 
tail(fitted(CloseStep), n = 1L)
# 2013-02-01 
#   1499.851
Joshua Ulrich
  • 173,410
  • 32
  • 338
  • 418
  • In my initial post, I mentioned that I was a noob to programming and quantitative analysis. It was edited out :). I appreciate the tip about the predict function, but it's about the journey. I'm sure I'll learn a lot from your comments. Thanks in advance – Pedro9 Feb 04 '13 at 20:56
  • @Pedro9 -- Joshua could see how hard you're working at this, which I'm sure is why he was willing to help. Welcome to SO, and enjoy the journey! – Josh O'Brien Feb 04 '13 at 22:05
  • @Pedro9: Josh O'Brien is correct. I could see that you really put forth some effort, so I wanted to show you how I would do it. – Joshua Ulrich Feb 04 '13 at 22:19