## Unit root testing:
library(fUnitRoots)
help(UnitrootTests)


gdp=read.csv("NIPATableGDPannual.csv",header=T)
gdp[1,]

gdp=read.csv("NIPATableGDPannual.csv",skip=4, header=F)
gdp
gdp.annual=gdp[2,3:length(gdp[2,])]
gdp.annual
gdp.annual=t(gdp.annual)
gdp.annual

### let us keep the year too

gdp.annual=gdp[1:2,3:length(gdp[2,])]
gdp.annual
gdp.annual=t(gdp.annual)
gdp.annual

plot(gdp.annual[,1], gdp.annual[,2])
gdp.return=returns(gdp.annual[,2])[-1]

### Annual GDP:

adfTest(gdp.return,lags=5)  ## We assume AR(5)

Title:
 Augmented Dickey-Fuller Test

Test Results:
  PARAMETER:
    Lag Order: 5
  STATISTIC:
    Dickey-Fuller: -0.8671
  P VALUE:
    0.3364        ## Seems that the null hypothesis of existence of a unit root cannot be rejected

Description:
 Mon Jun 22 14:53:45 2009 by user: ion
           

### LOOK AT: adfTest(log(gdp$VALUE),lags=4) ## What does this do and what does the result mean?


## A better analysis (more caferful).

ord=ar(gdp.return)

ord

> ord

Call:
ar(x = gdp.return)

Coefficients:
      1        2  
 0.7493  -0.2455  

Order selected 2  sigma^2 estimated as  0.003028            ## Seems that an order 2 is estimated

adfTest(gdp.return,lags=2)            ## same as we did before

## What about the difference series

ar(diff(gdp.return))    ##  5

adfTest(diff(gdp.return),lags=5)    ## This gives a p-value much smaller that means that the unit root is rejected for the difference series. Means that the I in ARIMA is only 1

#############
#SEASONAL time series:

jnj.data=read.csv("JNJmonthlySeasonal.csv",header=T)  
## JNJ is Johnson and Johnson and data is monthly
## jnj.data=read.csv("BritishAIRwaysmonthlySeasonal.csv",header=T)

jnj.prices= jnj.data$Adj.Close[length(jnj.data$Adj.Close):1] ## recall that data from yahoo is presented with the most recent date on top

plot(jnj.prices,type="l") ## Plot of data. note the huge increase in share value - cannot see anything

jnj.logprices=log(jnj.prices)

plot(jnj.logprices,type="l")        ## this is to remove the exponential increase and see better
points(jnj.logprices,pch=16)## can't really see much

plot(jnj.logprices[1:100],type="l")
points(jnj.logprices[1:100],pch=16,col="red")

acf(jnj.logprices,lag.max=16)


## The airplane model
jnj.return=diff(jnj.logprices)
acf(jnj.return,lag.max=16)

m1=arima(jnj.return,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12))
m1

tsdiag(m1) # Model checking
Box.test(m1$resid,lag=10,type='Ljung')  ##  recall H0 means uncorrelated resid. Having uncorrelated residuals is good

f1=predict(m1,8) # prediction
names(f1)
f1

##Book looks at Quaterly data so let us do that too:
jnj.prices= jnj.data$Adj.Close[length(jnj.data$Adj.Close):1]
quaterly.sequence=seq(1,length(jnj.prices),by=3)

jnj.prices=jnj.prices[quaterly.sequence]

## Now repeat the code above for this new data.

jnj.logprices=log(jnj.prices)
plot(jnj.logprices,type="l")
points(jnj.logprices,pch=16)

acf(jnj.logprices,lag.max=16)   ## figure 2.14.a) page 74

jnj.return=diff(jnj.logprices)
acf(jnj.return,lag.max=16)     ## similar with 2.14.b)


## seasonal difference:
sjnj.logprices= diff(jnj.logprices,4)
acf(sjnj.logprices,lag.max=16)    ## Plot of 2.14 c)
sjnj.return=diff(jnj.return,4)    ## yearly = 4 quarters
acf(sjnj.return,lag.max=16)      ##  Plot of 2.14 d)
## last shows  negative at lag 4 and positive at lag 12

##define the airline model:
m1=arima(jnj.return,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=4))
m1

tsdiag(m1) # Model checking
f1=predict(m1,8) # prediction for 2 years
names(f1)
f1

## Let us see how good the model is: construct the model without the last 8 observations then predict them:
m2=arima(jnj.return[1:(length(jnj.return)-8)],order=c(0,1,1),seasonal=list(order=c(0,1,1),period=4))
f2=predict(m2,8)


plot((length(jnj.return)-20):length(jnj.return),jnj.return[(length(jnj.return)-20):length(jnj.return)],type="l") ## We need to add this: ylim=c(-0.2,0.2)  to actually see the confidence limits.
points((length(jnj.return)-7):length(jnj.return),f2$pred,col="red")
lines((length(jnj.return)-7):length(jnj.return),f2$pred+1.96*f2$se,col="green")
lines((length(jnj.return)-7):length(jnj.return),f2$pred-1.96*f2$se,col="green")

## TRY it yourself with your own seasonal time series

##************************************
## For this part we will set apart 12 months from the data, build models based on the inital kept data and see how well they forecast.

#Other data:
## From federal reserves monthly value of Total Consumer Loans Owned by Commercial Banks in Billions from 1942 to present
comercialloan=read.csv("frb_g19.csv", skip=4,header=T)
names(comercialloan)

##Constructing the date
comercialloan.year = c(paste("19",comercialloan$Year[1:684], sep=""), paste("200",comercialloan$Year[685:length(comercialloan$Year)], sep=""))
x=paste(comercialloan$Month, comercialloan.year,sep="/")       ## It actually needs the day too (idiot)
x=paste(1,x,sep="/")
comercialloan.date= strptime(x, format="%d/%b/%Y")

plot(comercialloan.date,comercialloan$Total,type="l")

plot(comercialloan.date,log(comercialloan$Total),type="l")
plot(comercialloan.date,returns(comercialloan$Total),col="red")
lines(comercialloan.date,returns(comercialloan$Total))

#Unit root testing:
ar(comercialloan$Total)
adfTest(comercialloan$Total,lags=1)   ## clearly has a unit root as indicated by the coeff in the AR model as well

ar(log(comercialloan$Total))
adfTest(log(comercialloan$Total),lags=1)    ## same story

#Now work with the returns which should eliminate the unit root if there is only one.
comercialloanreturns=returns(comercialloan$Total)[-1]
ar(comercialloanreturns)
adfTest(comercialloanreturns,lags=25)       ## pvalue is small indicating that there is no unit root anymore.

#usual plots
acf(comercialloanreturns,lag.max=16)
pacf(comercialloanreturns,lag.max=16)

###### NOW model building. Create data for model building and forecasting:
cloan2008=comercialloanreturns[(length(comercialloanreturns)-11):length(comercialloanreturns)]
comercialloanreturns.full=comercialloanreturns
comercialloanreturns = comercialloanreturns[1:(length(comercialloanreturns)-12)]
ar(comercialloanreturns)
adfTest(comercialloanreturns,lags=25)

## So let us try a ARMA model for this (previous one was an AR.
## determine order:
library(tseries)
eacf.tables(comercialloanreturns)  ## Not really clear 

#ANOTHER version of eacf is already codded in TSA package:
library(TSA)
eacf(comercialloanreturns)

## using AIC:

max.p=15
max.q=15
model.aic=matrix(,nrow=max.p+1,ncol=max.q+1)
for (p in 0:max.p){
  for (q in 0:max.q){
  model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic
  }
}                   ## takes forever
print(model.aic)
           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]      [,8]      [,9]     [,10]     [,11]     [,12]     [,13]     [,14]     [,15]     [,16]
 [1,] -4657.262 -4890.967 -4976.687 -4974.805 -5021.694 -5030.672 -5039.359 -5038.321 -5047.222 -5049.435 -5095.592 -5099.314 -5157.729 -5203.573 -5211.352 -5209.732
 [2,] -4997.330 -5003.260 -5040.721 -5044.258 -5071.176 -5074.251 -5072.388 -5086.325 -5094.289 -5092.493 -5109.636 -5107.674 -5199.325 -5208.412 -5202.920 -5207.669
 [3,] -4998.554 -5049.896 -5040.889 -5107.332 -5073.407 -5072.306 -5089.550 -5094.937 -5138.237 -5228.122 -5107.774 -5144.472 -5207.987 -5207.512 -5204.709 -5206.364
 [4,] -5009.659 -5045.046 -5044.077 -5074.932 -5072.976 -5071.194 -5098.436 -5208.589 -5138.920        NA        NA        NA        NA        NA        NA        NA
 [5,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [6,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [7,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [8,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [9,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[10,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[11,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[12,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[13,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[14,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[15,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[16,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA


#looking at this table let us try several more models in the range  p+1=3:15 q+1=6:15

for (p in 3:3){
  for (q in 13:max.q){
  model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic
  }
}                   ## takes forever

for (p in 5:max.p){
  for (q in 7:max.q){
  model.aic[p+1,q+1]= arima(comercialloanreturns,order=c(p,0,q))$aic
  }
}   

print(model.aic)
           [,1]      [,2]      [,3]      [,4]      [,5]      [,6]      [,7]      [,8]      [,9]     [,10]     [,11]     [,12]     [,13]     [,14]     [,15]     [,16]
 [1,] -4657.262 -4890.967 -4976.687 -4974.805 -5021.694 -5030.672 -5039.359 -5038.321 -5047.222 -5049.435 -5095.592 -5099.314 -5157.729 -5203.573 -5211.352 -5209.732
 [2,] -4997.330 -5003.260 -5040.721 -5044.258 -5071.176 -5074.251 -5072.388 -5086.325 -5094.289 -5092.493 -5109.636 -5107.674 -5199.325 -5208.412 -5202.920 -5207.669
 [3,] -4998.554 -5049.896 -5040.889 -5107.332 -5073.407 -5072.306 -5089.550 -5094.937 -5138.237 -5228.122 -5107.774 -5144.472 -5207.987 -5207.512 -5204.709 -5206.364
 [4,] -5009.659 -5045.046 -5044.077 -5074.932 -5072.976 -5071.194 -5098.436 -5208.589 -5138.920        NA        NA -5142.733        NA -5205.544        NA        NA
 [5,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [6,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [7,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [8,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
 [9,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[10,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[11,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[12,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[13,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[14,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[15,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA
[16,]        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA        NA


model.aic[is.na(model.aic)]=10000 # to eliminate the NA values and replace them with a large value
print(model.aic)
which.min(model.aic)  ## gives the index of the element in the matrix as if it were a long vector
which(model.aic==min(model.aic),arr.ind=T)

     row col
[1,]   3  10           ## Seems that an ARMA(2,9) would be appropriate  ## An MA 14 seems also to have low AIC

## A better 

model1= arima(comercialloanreturns,order=c(25,0,0))
model2= arima(comercialloanreturns,order=c(0,0,14))
model3= arima(comercialloanreturns,order=c(2,0,9))

## What about seasonality? Since this is monthly data perhaps we can try an airplane type model with 12 seasonal lag:

model4= arima(comercialloanreturns,order=c(0,1,1),seasonal=list(order=c(0,1,1),period=12))

#NOW let us compare the three:

plot((length(comercialloanreturns.full)-20):length(comercialloanreturns.full),comercialloanreturns.full[(length(comercialloanreturns.full)-20):length(comercialloanreturns.full)],type="l")
lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model1,12)$pred,col="green")
lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model2,12)$pred,col="blue")
lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model3,12)$pred,col="red")
lines((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model4,12)$pred,col="magenta")

points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model1,12)$pred,col="green")
points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model2,12)$pred,col="blue")
points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model3,12)$pred,col="red")
points((length(comercialloanreturns.full)-11):length(comercialloanreturns.full),predict(model4,12)$pred,col="magenta")

## What about errors?

 sum((cloan2008-predict(model1,12)$pred)^2)
[1] 0.001971101                   #AR
 sum((cloan2008-predict(model2,12)$pred)^2)
[1] 0.002397539                   #MA
 sum((cloan2008-predict(model3,12)$pred)^2)
[1] 0.003233350                   #ARMA
 sum((cloan2008-predict(model4,12)$pred)^2)
[1] 0.002007989                   # Seasonal

#last year data same models:
#model1:
#sum((cloan2008-predict(model1,12)$pred)^2)
#[1] 0.0006412673            #AR
#sum((cloan2008-predict(model2,12)$pred)^2)
#[1] 0.00053344              #MA
#sum((cloan2008-predict(model3,12)$pred)^2)
#[1] 0.000849362             #ARMA
#sum((cloan2008-predict(model4,12)$pred)^2)
#[1] 0.000775068             #Seasonal

### All small. Usually in such a case the more parcimonius model is chosen.