fork download
  1. library(dplyr) # data function
  2. library(data.table) # data format
  3. library(snow) #paralle computing
  4. library(hydroGOF) #RMSE
  5.  
  6.  
  7. set.seed(524287)
  8.  
  9. Cartype<-'31'
  10. FreewayNum<-'01'
  11. DT<-fread("C:/Users/lwang2/Documents/R/20160420SpeedCount/01S_20160301_20160331.csv",na.strings = 'NULL')
  12. DT <- DT[,which(unlist(lapply(DT, function(x)!all(is.na(x))))),with=F]
  13. DT<-filter(DT,Category==Cartype)
  14.  
  15. drops<-c('Date','Category','Hour')
  16. DT[,drops[1:length(drops)]:=NULL]
  17.  
  18.  
  19. # replace missing values with Normal distribution Mean(col) sd(col)
  20.  
  21. #for(j in seq_len(ncol(DT)))
  22. #{
  23. # ##Sys.sleep(0.1)
  24. # set(DT,which(is.na(DT[[j]])),j,abs(rnorm(5000,mean=mean(DT[[j]],na.rm=T),sd=sd(DT[[j]],na.rm = T))))
  25. # ##print(j)
  26. # ##setTxtProgressBar(pb, i)
  27. #}
  28.  
  29. ################################ Replace NA by Each Column Min################################
  30. for(j in seq_len(ncol(DT)))
  31. {
  32. ##Sys.sleep(0.1)
  33. set(DT,which(is.na(DT[[j]])),j,min(DT[[j]],na.rm = T))
  34. ##print(j)
  35. ##setTxtProgressBar(pb, i)
  36. }
  37.  
  38.  
  39. TrainDT<-DT[1:round(0.7*nrow(DT),0)]
  40. TrainDT<-select(TrainDT,starts_with(FreewayNum))
  41.  
  42. PracticeDT<-DT[round(0.7*nrow(DT),0):nrow(DT)]
  43. PracticeDT<-select(PracticeDT,starts_with(FreewayNum))
  44.  
  45. summaryDF<-data.frame(GantryID=character(),Cor=double(),RMSE=double(),ResidualMin=double(),ResidualQ1=double(),ResidualMedian=double(),ResidualMean=double(),ResidualQ3=double(),ResidualMax=double(),NaNum=double())
  46.  
  47.  
  48.  
  49. #####################Snow Multiple Thread###########################
  50. clusterfun<-function(i){
  51. print(i)
  52. TrainModel<-cbind(setnames(TrainDT[7:nrow(TrainDT),i,with=F],paste0(names(TrainDT[1,i,with=FALSE]),'_y')),TrainDT[1:(nrow(TrainDT)-6),1:length(TrainDT),with=F])
  53. PracticeModel<-cbind(setnames(PracticeDT[7:nrow(PracticeDT),i,with=F],paste0(names(TrainDT[1,i,with=FALSE]),'_y')),PracticeDT[1:(nrow(PracticeDT)-6),1:length(PracticeDT),with=F])
  54. resp<-grep('_y',names(TrainModel),value=T)
  55. pre<-grep(FreewayNum,names(TrainModel),value =T)
  56. pre<-pre[2:length(pre)]
  57.  
  58. ###### Functions ####
  59. remove_outliers<-function(x,na.rm=T)
  60. {
  61. qnt<-quantile(x,probs = c(.25,.75),na.rm = na.rm)
  62. H<-1.5*IQR(x,na.rm = na.rm)
  63. y<-x
  64. y[x<(qnt[1]-H)]<-NA
  65. y[x>(qnt[2]+H)]<-NA
  66. return(y)
  67. }
  68. addq<-function(x) paste0("`",x, "`")
  69.  
  70.  
  71.  
  72. Model<-as.formula(paste(addq(resp),paste(lapply(pre, addq),collapse = '+'),sep = '~'))
  73.  
  74. FitModel<-lm(Model,data=TrainModel)
  75. #Fitmodel<-lm(`01F0017S_y`~.,data=TrainModel)
  76. #Fitmodel<-lm(as.matrix(TrainDT[7:nrow(TrainDT),i,with=F])~as.matrix(TrainDT[1:(nrow(TrainDT)-6),1:length(TrainDT),with=F]),data=TrainDT)
  77. stepwise<-step(FitModel,sacle=0,direction = 'both')
  78. predictresidual<-PracticeModel[[1]]-predict(stepwise,PracticeModel)
  79. RMSE<-rmse(remove_outliers(predict(stepwise,PracticeModel)),remove_outliers(PracticeModel[[1]]),na.rm = T)
  80. Gantryname<-names(TrainDT[1,i,with=FALSE])
  81. write.csv(stepwise$coefficients,file = paste0(Gantryname,'_Coefficients','.csv'))
  82. write.csv(cbind(TrainModel[[1]],stepwise$fitted.values,stepwise$residuals),file = paste0(Gantryname,'_Residual','.csv'))
  83. write.csv(cbind(PracticeModel[[1]],predict(stepwise,PracticeModel),predictresidual),file = paste0(Gantryname,'_Predict','.csv'))
  84. SumResidual<-summary(remove_outliers(predictresidual))
  85. unlistResidual<-data.frame(matrix(unlist(SumResidual),ncol = 7,byrow = T))
  86. summaryDF<-rbind(summaryDF,cbind(Gantryname,cor(PracticeModel[[1]],predict(stepwise,PracticeModel)),RMSE,unlistResidual))
  87. ##summaryTest<-rbind(summaryTest,cbind(data.frame(t(as.matrix(unlist(stepwise$coefficients),ncol=length(stepwise$coefficients),byrow=T)))))
  88.  
  89. #PredictData<-predict(stepwise,PracticeDT)
  90. if (i ==3)({
  91. colnames(summaryDF)<-c('GantryID','Cor','RMSE','ResidualMin','ResidualQ1','ResidualMedian','ResidualMean','ResidualQ3','ResidualMax','NaNum')
  92. write.csv(summaryDF,file = 'Summary.csv')
  93. })
  94.  
  95. }
  96.  
  97. cluster <- makeCluster(type="SOCK",c("localhost", "localhost", "localhost", "localhost"))
  98.  
  99. clusterEvalQ(cluster,c(library(data.table),library(hydroGOF)))
  100. clusterExport(cluster,c('TrainDT','DT','PracticeDT','FreewayNum','summaryDF'))
  101. system.time(parLapply(cluster,1:3,clusterfun))
  102.  
  103. stopCluster(cluster)
  104.  
  105.  
Success #stdin #stdout #stderr 0.45s 79168KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Error in library(dplyr) : there is no package called ‘dplyr’
Execution halted