## Unit root testing: library(fUnitRoots) help(UnitrootTests) gdp=read.csv("NIPATableGDPannual.csv",header=T) gdp[1,] gdp=read.csv("NIPATableGDPannual.csv",skip=4, header=F) gdp gdp.annual=gdp[2,3:length(gdp[2,])] gdp.annual gdp.annual=t(gdp.annual) gdp.annual ### let us keep the year too gdp.annual=gdp[1:2,3:length(gdp[2,])] gdp.annual gdp.annual=t(gdp.annual) gdp.annual plot(gdp.annual[,1], gdp.annual[,2]) gdp.return=returns(gdp.annual[,2])[-1] ### Annual GDP: adfTest(gdp.return,lags=5) ## We assume AR(5) Title: Augmented Dickey-Fuller Test Test Results: PARAMETER: Lag Order: 5 STATISTIC: Dickey-Fuller: -0.8671 P VALUE: 0.3364 ## Seems that the null hypothesis of existence of a unit root cannot be rejected Description: Mon Jun 22 14:53:45 2009 by user: ion ### LOOK AT: adfTest(log(gdp$VALUE),lags=4) ## What does this do and what does the result mean? ## A better analysis (more caferful). ord=ar(gdp.return) ord > ord Call: ar(x = gdp.return) Coefficients: 1 2 0.7493 -0.2455 Order selected 2 sigma^2 estimated as 0.003028 ## Seems that an order 2 is estimated adfTest(gdp.return,lags=2) ## same as we did before ## What about the difference series ar(diff(gdp.return)) ## 5 adfTest(diff(gdp.return),lags=5) ## This gives a p-value much smaller that means that the unit root is rejected for the difference series. Means that the I in ARIMA is only 1 ############# #SEASONAL time series: jnj.data=read.csv("JNJmonthlySeasonal.csv",header=T) ## JNJ is Johnson and Johnson and data is monthly ## jnj.data=read.csv("BritishAIRwaysmonthlySeasonal.csv",header=T) jnj.prices= jnj.data$Adj.Close[length(jnj.data$Adj.Close):1] ## recall that data from yahoo is presented with the most recent date on top plot(jnj.prices,type="l") ## Plot of data. note the huge increase in share value - cannot see anything jnj.logprices=log(jnj.prices) plot(jnj.logprices,type="l") ## this is to remove the exponential increase and see better points(jnj.logprices,pch=16)## can't really see much plot(jnj.logprices[1:100],type="l") points(jnj.logprices[1:100],pch=16,col="red") acf(jnj.logprices,lag.max=16) ## The airplane model jnj.return=diff(jnj.logprices) acf(jnj.return,lag.max=16) m1=arima(jnj.return,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12)) m1 tsdiag(m1) # Model checking Box.test(m1$resid,lag=10,type='Ljung') ## recall H0 means uncorrelated resid. Having uncorrelated residuals is good f1=predict(m1,8) # prediction names(f1) f1 ##Book looks at Quaterly data so let us do that too: jnj.prices= jnj.data$Adj.Close[length(jnj.data$Adj.Close):1] quaterly.sequence=seq(1,length(jnj.prices),by=3) jnj.prices=jnj.prices[quaterly.sequence] ## Now repeat the code above for this new data. jnj.logprices=log(jnj.prices) plot(jnj.logprices,type="l") points(jnj.logprices,pch=16) acf(jnj.logprices,lag.max=16) ## figure 2.14.a) page 74 jnj.return=diff(jnj.logprices) acf(jnj.return,lag.max=16) ## similar with 2.14.b) ## seasonal difference: sjnj.logprices= diff(jnj.logprices,4) acf(sjnj.logprices,lag.max=16) ## Plot of 2.14 c) sjnj.return=diff(jnj.return,4) ## yearly = 4 quarters acf(sjnj.return,lag.max=16) ## Plot of 2.14 d) ## last shows negative at lag 4 and positive at lag 12 ##define the airline model: m1=arima(jnj.return,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=4)) m1 tsdiag(m1) # Model checking f1=predict(m1,8) # prediction for 2 years names(f1) f1 ## Let us see how good the model is: construct the model without the last 8 observations then predict them: m2=arima(jnj.return[1:(length(jnj.return)-8)],order=c(0,1,1),seasonal=list(order=c(0,1,1),period=4)) f2=predict(m2,8) plot((length(jnj.return)-20):length(jnj.return),jnj.return[(length(jnj.return)-20):length(jnj.return)],type="l") ## We need to add this: ylim=c(-0.2,0.2) to actually see the confidence limits. points((length(jnj.return)-7):length(jnj.return),f2$pred,col="red") lines((length(jnj.return)-7):length(jnj.return),f2$pred+1.96*f2$se,col="green") lines((length(jnj.return)-7):length(jnj.return),f2$pred-1.96*f2$se,col="green") ## TRY it yourself with your own seasonal time series ##************************************ ## For this part we will set apart 12 months from the data, build models based on the inital kept data and see how well they forecast. #Other data: ## From federal reserves monthly value of Total Consumer Loans Owned by Commercial Banks in Billions from 1942 to present comercialloan=read.csv("frb_g19.csv", skip=4,header=T) names(comercialloan) ##Constructing the date comercialloan.year = c(paste("19",comercialloan$Year[1:684], sep=""), paste("200",comercialloan$Year[685:length(comercialloan$Year)], sep="")) x=paste(comercialloan$Month, comercialloan.year,sep="/") ## It actually needs the day too (idiot) x=paste(1,x,sep="/") comercialloan.date= strptime(x, format="%d/%b/%Y") plot(comercialloan.date,comercialloan$Total,type="l") plot(comercialloan.date,log(comercialloan$Total),type="l") plot(comercialloan.date,returns(comercialloan$Total),col="red") lines(comercialloan.date,returns(comercialloan$Total)) #Unit root testing: ar(comercialloan$Total) adfTest(comercialloan$Total,lags=1) ## clearly has a unit root as indicated by the coeff in the AR model as well ar(log(comercialloan$Total)) adfTest(log(comercialloan$Total),lags=1) ## same story #Now work with the returns which should eliminate the unit root if there is only one. comercialloanreturns=returns(comercialloan$Total)[-1] ar(comercialloanreturns) adfTest(comercialloanreturns,lags=25) ## pvalue is small indicating that there is no unit root anymore. #usual plots acf(comercialloanreturns,lag.max=16) pacf(comercialloanreturns,lag.max=16) ###### NOW model building. Create data for model building and forecasting: cloan2008=comercialloanreturns[(length(comercialloanreturns)-11):length(comercialloanreturns)] comercialloanreturns.full=comercialloanreturns comercialloanreturns = comercialloanreturns[1:(length(comercialloanreturns)-12)] ar(comercialloanreturns) adfTest(comercialloanreturns,lags=25) ## So let us try a ARMA model for this (previous one was an AR. ## determine order: library(tseries) eacf.tables(comercialloanreturns) ## Not really clear #ANOTHER version of eacf is already codded in TSA package: library(TSA) eacf(comercialloanreturns) ## using AIC: max.p=15 max.q=15 model.aic=matrix(,nrow=max.p+1,ncol=max.q+1) for (p in 0:max.p){ for (q in 0:max.q){ model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic } } ## takes forever print(model.aic) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [1,] -4657.262 -4890.967 -4976.687 -4974.805 -5021.694 -5030.672 -5039.359 -5038.321 -5047.222 -5049.435 -5095.592 -5099.314 -5157.729 -5203.573 -5211.352 -5209.732 [2,] -4997.330 -5003.260 -5040.721 -5044.258 -5071.176 -5074.251 -5072.388 -5086.325 -5094.289 -5092.493 -5109.636 -5107.674 -5199.325 -5208.412 -5202.920 -5207.669 [3,] -4998.554 -5049.896 -5040.889 -5107.332 -5073.407 -5072.306 -5089.550 -5094.937 -5138.237 -5228.122 -5107.774 -5144.472 -5207.987 -5207.512 -5204.709 -5206.364 [4,] -5009.659 -5045.046 -5044.077 -5074.932 -5072.976 -5071.194 -5098.436 -5208.589 -5138.920 NA NA NA NA NA NA NA [5,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [6,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [7,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [8,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [9,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [10,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [11,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [12,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [13,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [14,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [15,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [16,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA #looking at this table let us try several more models in the range p+1=3:15 q+1=6:15 for (p in 3:3){ for (q in 13:max.q){ model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic } } ## takes forever for (p in 5:max.p){ for (q in 7:max.q){ model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic } } print(model.aic) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [1,] -4657.262 -4890.967 -4976.687 -4974.805 -5021.694 -5030.672 -5039.359 -5038.321 -5047.222 -5049.435 -5095.592 -5099.314 -5157.729 -5203.573 -5211.352 -5209.732 [2,] -4997.330 -5003.260 -5040.721 -5044.258 -5071.176 -5074.251 -5072.388 -5086.325 -5094.289 -5092.493 -5109.636 -5107.674 -5199.325 -5208.412 -5202.920 -5207.669 [3,] -4998.554 -5049.896 -5040.889 -5107.332 -5073.407 -5072.306 -5089.550 -5094.937 -5138.237 -5228.122 -5107.774 -5144.472 -5207.987 -5207.512 -5204.709 -5206.364 [4,] -5009.659 -5045.046 -5044.077 -5074.932 -5072.976 -5071.194 -5098.436 -5208.589 -5138.920 NA NA -5142.733 NA -5205.544 NA NA [5,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [6,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [7,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [8,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [9,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [10,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [11,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [12,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [13,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [14,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [15,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [16,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA model.aic[is.na(model.aic)]=10000 # to eliminate the NA values and replace them with a large value print(model.aic) which.min(model.aic) ## gives the index of the element in the matrix as if it were a long vector which(model.aic==min(model.aic),arr.ind=T) row col [1,] 3 10 ## Seems that an ARMA(2,9) would be appropriate ## An MA 14 seems also to have low AIC ## A better model1= arima(comercialloanreturns,order=c(25,0,0)) model2= arima(comercialloanreturns,order=c(0,0,14)) model3= arima(comercialloanreturns,order=c(2,0,9)) ## What about seasonality? Since this is monthly data perhaps we can try an airplane type model with 12 seasonal lag: model4= arima(comercialloanreturns,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12)) #NOW let us compare the three: plot((length(comercialloanreturns.full)-20):length(comercialloanreturns.full),comercialloanreturns.full[(length(comercialloanreturns.full)-20):length(comercialloanreturns.full)],type="l") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model1,12)$pred,col="green") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model2,12)$pred,col="blue") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model3,12)$pred,col="red") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model4,12)$pred,col="magenta") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model1,12)$pred,col="green") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model2,12)$pred,col="blue") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model3,12)$pred,col="red") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model4,12)$pred,col="magenta") ## What about errors? sum((cloan2008-predict(model1,12)$pred)^2) [1] 0.001971101 #AR sum((cloan2008-predict(model2,12)$pred)^2) [1] 0.002397539 #MA sum((cloan2008-predict(model3,12)$pred)^2) [1] 0.003233350 #ARMA sum((cloan2008-predict(model4,12)$pred)^2) [1] 0.002007989 # Seasonal #last year data same models: #model1: #sum((cloan2008-predict(model1,12)$pred)^2) #[1] 0.0006412673 #AR #sum((cloan2008-predict(model2,12)$pred)^2) #[1] 0.00053344 #MA #sum((cloan2008-predict(model3,12)$pred)^2) #[1] 0.000849362 #ARMA #sum((cloan2008-predict(model4,12)$pred)^2) #[1] 0.000775068 #Seasonal ### All small. Usually in such a case the more parcimonius model is chosen.