library(dplyr)  # data function
library(data.table) # data format
library(snow)  #paralle computing
library(hydroGOF)  #RMSE


set.seed(524287)

Cartype<-'31'
FreewayNum<-'01'
DT<-fread("C:/Users/lwang2/Documents/R/20160420SpeedCount/01S_20160301_20160331.csv",na.strings = 'NULL')
DT <- DT[,which(unlist(lapply(DT, function(x)!all(is.na(x))))),with=F]
DT<-filter(DT,Category==Cartype)

drops<-c('Date','Category','Hour')
DT[,drops[1:length(drops)]:=NULL]


# replace missing values with Normal distribution Mean(col) sd(col)

#for(j in seq_len(ncol(DT)))
#{
#  ##Sys.sleep(0.1)
#  set(DT,which(is.na(DT[[j]])),j,abs(rnorm(5000,mean=mean(DT[[j]],na.rm=T),sd=sd(DT[[j]],na.rm = T))))
#  ##print(j) 
#  ##setTxtProgressBar(pb, i)
#}

################################ Replace NA by Each Column Min################################ 
for(j in seq_len(ncol(DT)))
{
  ##Sys.sleep(0.1)
  set(DT,which(is.na(DT[[j]])),j,min(DT[[j]],na.rm = T))
  ##print(j) 
  ##setTxtProgressBar(pb, i)
}


TrainDT<-DT[1:round(0.7*nrow(DT),0)]
TrainDT<-select(TrainDT,starts_with(FreewayNum))

PracticeDT<-DT[round(0.7*nrow(DT),0):nrow(DT)]
PracticeDT<-select(PracticeDT,starts_with(FreewayNum))

summaryDF<-data.frame(GantryID=character(),Cor=double(),RMSE=double(),ResidualMin=double(),ResidualQ1=double(),ResidualMedian=double(),ResidualMean=double(),ResidualQ3=double(),ResidualMax=double(),NaNum=double())



#######Function Here: Remove outlier & addq####################################

remove_outliers<-function(x,na.rm=T)
{
qnt<-quantile(x,probs = c(.25,.75),na.rm = na.rm)
H<-1.5*IQR(x,na.rm = na.rm)
y<-x
y[x<(qnt[1]-H)]<-NA
y[x>(qnt[2]+H)]<-NA
return(y)
}
addq<-function(x) paste0("`",x, "`")
##Test<-data.frame()
##GantryTestName<-data.frame(Intercept=numeric(),cbind(Test,t(as.matrix(lapply(names(TrainDT),addq)))))





######Model with shift 30 Mins: y8:30=x8:00
system.time(for (i in 1:2)
{
  print(i)
  TrainModel<-cbind(setnames(TrainDT[7:nrow(TrainDT),i,with=F],paste0(names(TrainDT[1,i,with=FALSE]),'_y')),TrainDT[1:(nrow(TrainDT)-6),1:length(TrainDT),with=F])
  PracticeModel<-cbind(setnames(PracticeDT[7:nrow(PracticeDT),i,with=F],paste0(names(TrainDT[1,i,with=FALSE]),'_y')),PracticeDT[1:(nrow(PracticeDT)-6),1:length(PracticeDT),with=F])
  resp<-grep('_y',names(TrainModel),value=T)
  pre<-grep(FreewayNum,names(TrainModel),value =T)
  pre<-pre[2:length(pre)]
  Model<-as.formula(paste(addq(resp),paste(lapply(pre, addq),collapse = '+'),sep = '~'))
  
  FitModel<-lm(Model,data=TrainModel)
  #Fitmodel<-lm(`01F0017S_y`~.,data=TrainModel)
  #Fitmodel<-lm(as.matrix(TrainDT[7:nrow(TrainDT),i,with=F])~as.matrix(TrainDT[1:(nrow(TrainDT)-6),1:length(TrainDT),with=F]),data=TrainDT)
  stepwise<-step(FitModel,sacle=0,direction = 'both')
  predictresidual<-PracticeModel[[1]]-predict(stepwise,PracticeModel)
  RMSE<-rmse(remove_outliers(predict(stepwise,PracticeModel)),remove_outliers(PracticeModel[[1]]),na.rm = T)
  Gantryname<-names(TrainDT[1,i,with=FALSE])
  write.csv(stepwise$coefficients,file = paste0(Gantryname,'_Coefficients','.csv'))
  write.csv(cbind(TrainModel[[1]],stepwise$fitted.values,stepwise$residuals),file = paste0(Gantryname,'_Residual','.csv'))
  write.csv(cbind(PracticeModel[[1]],predict(stepwise,PracticeModel),predictresidual),file = paste0(Gantryname,'_Predict','.csv'))
  SumResidual<-summary(remove_outliers(predictresidual))
  unlistResidual<-data.frame(matrix(unlist(SumResidual),ncol = 7,byrow = T))
  summaryDF<-rbind(summaryDF,cbind(Gantryname,cor(PracticeModel[[1]],predict(stepwise,PracticeModel)),RMSE,unlistResidual))
  ##summaryTest<-rbind(summaryTest,cbind(data.frame(t(as.matrix(unlist(stepwise$coefficients),ncol=length(stepwise$coefficients),byrow=T)))))
  
  #PredictData<-predict(stepwise,PracticeDT)
  if (i ==2)({
    colnames(summaryDF)<-c('GantryID','Cor','RMSE','ResidualMin','ResidualQ1','ResidualMedian','ResidualMean','ResidualQ3','ResidualMax','NaNum')
    write.csv(summaryDF,file = 'Summary.csv')
  })
  
})