## Unit root testing: library(Rmetrics) help(UnitrootTests) gdp=read.csv("GDPannualreal.csv",header=T) gdp.return=returns(gdp$VALUE)[-1] adfTest(gdp.return,lags=5) ## We assume AR(5) Title: Augmented Dickey-Fuller Test Test Results: PARAMETER: Lag Order: 5 STATISTIC: Dickey-Fuller: -1.5463 P VALUE: 0.1203 ## Seems that the null hypothesis of existence of a unit root cannot be rejected Description: Mon Jun 16 11:51:14 2008 by user: Ion ### LOOK AT: adfTest(log(gdp$VALUE),lags=4) ## What does this do and what does the result mean? ## A better analysis (more caferful). ord=ar(gdp.return) > ord Call: ar(x = gdp.return) Coefficients: 1 2 3 4 5 0.5495 0.0237 -0.1332 -0.2981 0.1827 Order selected 5 sigma^2 estimated as 0.001619 ## Seems that an order 5 is estimated adfTest(gdp.return,lags=5) ## same as we did before ## What about the difference series ar(diff(gdp.return)) ## also 5 adfTest(diff(gdp.return),lags=5) ## This gives a p-value much smaller that means that the unit root is rejected for the difference series. Means that the I in ARIMA is only 1 ############# #SEASONAL time series: jnj.data=read.csv("JNJmonthlySeasonal.csv",header=T) ## jnj.data=read.csv("BritishAIRwaysmonthlySeasonal.csv",header=T) jnj.prices= jnj.data$Adj.Close[length(jnj.data$Adj.Close):1] ## recall that data from yahoo is presented with the most recent date on top plot(jnj.prices,type="l") ## Plot of data. note the huge increase in share value - cannot see anything jnj.logprices=log(jnj.prices) plot(jnj.logprices,type="l") ## this is to remove the exponential increase and see better points(jnj.logprices,pch=16)## can't really see much plot(jnj.logprices[1:100],type="l") points(jnj.logprices[1:100],pch=16,col="red") acf(jnj.logprices,lag.max=16) jnj.return=diff(jnj.logprices) acf(jnj.return,lag.max=16) m1=arima(jnj.return,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12)) m1 tsdiag(m1) # Model checking Box.test(m1$resid,lag=10,type='Ljung') ## recall H0 means uncorrelated resid (good) f1=predict(m1,8) # prediction names(f1) f1 ##Book looks at Quaterly data so let us do that too: jnj.prices= jnj.data$Adj.Close[length(jnj.data$Adj.Close):1] quaterly.sequence=seq(1,length(jnj.prices),by=3) jnj.prices=jnj.prices[quaterly.sequence] ## Now repeat the code above for this new data. jnj.logprices=log(jnj.prices) plot(jnj.logprices,type="l") points(jnj.logprices,pch=16) acf(jnj.logprices,lag.max=16) ## figure 2.14.a) page 74 jnj.return=diff(jnj.logprices) acf(jnj.return,lag.max=16) ## similar with 2.14.b) ## seasonal difference: sjnj.logprices= diff(jnj.logprices,4) acf(sjnj.logprices,lag.max=16) ## Plot of 2.14 c) sjnj.return=diff(jnj.return,4) ## yearly = 4 quarters acf(sjnj.return,lag.max=16) ## Plot of 2.14 d) ## last shows negative at lag 4 and positive at lag 12 ##define the airline model: m1=arima(jnj.return,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=4)) m1 tsdiag(m1) # Model checking f1=predict(m1,8) # prediction for 2 years names(f1) f1 ## Let us see how good the model is: construct the model without the last 8 observations then predict them: m2=arima(jnj.return[1:(length(jnj.return)-8)],order=c(0,1,1),seasonal=list(order=c(0,1,1),period=4)) f2=predict(m2,8) plot((length(jnj.return)-20):length(jnj.return),jnj.return[(length(jnj.return)-20):length(jnj.return)],type="l") points((length(jnj.return)-7):length(jnj.return),f2$pred,col="red") lines((length(jnj.return)-7):length(jnj.return),f2$pred+1.96*f2$se,col="green") lines((length(jnj.return)-7):length(jnj.return),f2$pred-1.96*f2$se,col="green") ## TRY it yourself with your own seasonal time series ##************************************ ## For this part we will set apart 12 months from the data, build models based on the inital kept data and see how well they forecast. #Other data: ## From federal reserves monthly value of Total Consumer Loans Owned by Commercial Banks in Billions from 1942 to present comercialloan=read.csv("FRB_G19.csv", skip=5,header=T) names(comercialloan) plot(comercialloan[,1],comercialloan[,2],type="l") plot(comercialloan[,1],log(comercialloan[,2]),type="l") plot(comercialloan[,1],returns(comercialloan[,2]),col="red") lines(comercialloan[,1],returns(comercialloan[,2])) #Unit root testing: ar(comercialloan[,2]) adfTest(comercialloan[,2],lags=1) ## clearly has a unit root as indicated by the coeff in the AR model as well ar(log(comercialloan[,2])) adfTest(log(comercialloan[,2]),lags=1) ## same story #Now work with the returns which should eliminate the unit root if there is only one. comercialloanreturns=returns(comercialloan[,2])[-1] ar(comercialloanreturns) adfTest(comercialloanreturns,lags=25) ## pvalue is small indicating that there is no unit root anymore. #usual plots acf(comercialloanreturns,lag.max=16) pacf(comercialloanreturns,lag.max=16) ###### NOW model building. Create data for model building and forecasting: cloan2008=comercialloanreturns[(length(comercialloanreturns)-11):length(comercialloanreturns)] comercialloanreturns.full=comercialloanreturns comercialloanreturns = comercialloanreturns[1:(length(comercialloanreturns)-12)] ar(comercialloanreturns) adfTest(comercialloanreturns,lags=25) ## So let us try a ARMA model for this (previous one was an AR. ## determine order: library(tseries) eacf.tables(comercialloanreturns) ## Not really clear #ANOTHER version of eacf is already codded in TSA package: library(TSA) eacf(comercialloanreturns) ## using AIC: max.p=15 max.q=15 model.aic=matrix(,nrow=max.p+1,ncol=max.q+1) for (p in 0:max.p){ for (q in 0:max.q){ model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic } } ## takes forever print(model.aic) [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [1,] -4579.610 -4810.313 -4896.645 -4894.656 -4941.883 -4949.437 -4958.285 -4957.564 -4965.640 -4970.253 -5012.913 -5017.241 -5077.807 -5116.024 -5123.726 -5122.169 [2,] -4919.428 -4927.051 -4962.512 -4966.046 -4990.833 -4994.029 -4992.348 -5005.798 -5012.557 -5010.634 -5027.899 -5025.900 -5114.947 -5120.824 -5115.721 -5120.130 [3,] -4921.476 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [4,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [5,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [6,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [7,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [8,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [9,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [10,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [11,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [12,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [13,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [14,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA [15,] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA #looking at this table let us try several more models in the range p+1=3:15 q+1=6:15 for (p in 2:max.p){ for (q in 7:max.q){ model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic } } ## takes forever for (p in 3:max.p){ for (q in 11:max.q){ model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic } } model.aic[is.na(model.aic)]=10000 # to eliminate the NA values and replace them with a large value which.min(model.aic) ## gives the index of the element in the matrix as if it were a long vector which(model.aic==min(model.aic),arr.ind=T) row col [1,] 3 10 ## Seems that an ARMA(2,9) would be appropriate ## An MA 14 seems also to have low AIC ## A better model1= arima(comercialloanreturns,order=c(25,0,0)) model2= arima(comercialloanreturns,order=c(0,0,14)) model3= arima(comercialloanreturns,order=c(2,0,9)) ## What about seasonality? Since this is monthly data perhaps we can try an airplane type model with 12 seasonal lag: model4= arima(comercialloanreturns,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12)) #NOW let us compare the three: plot((length(comercialloanreturns.full)-20):length(comercialloanreturns.full),comercialloanreturns.full[(length(comercialloanreturns.full)-20):length(comercialloanreturns.full)],type="l") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model1,12)$pred,col="green") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model2,12)$pred,col="blue") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model3,12)$pred,col="red") lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model4,12)$pred,col="magenta") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model1,12)$pred,col="green") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model2,12)$pred,col="blue") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model3,12)$pred,col="red") points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model4,12)$pred,col="magenta") ## What about errors? #model1: sum((cloan2008-predict(model1,12)$pred)^2) [1] 0.0006412673 sum((cloan2008-predict(model2,12)$pred)^2) [1] 0.00053344 sum((cloan2008-predict(model3,12)$pred)^2) [1] 0.000849362 sum((cloan2008-predict(model4,12)$pred)^2) [1] 0.000775068 ### All small. Usually in such a case the more parcimonius model is chosen.