fork download
  1. require(plyr)
  2. options(stringsAsFactors = FALSE)
  3. setwd("~/asus")
  4.  
  5. DEFAULT_DECAY <- -.05
  6.  
  7. SaleTrain <- read.csv("SaleTrain.csv")
  8. RepairTrain <- read.csv("RepairTrain.csv")
  9. mapping <- read.csv("Output_TargetID_Mapping.csv")
  10.  
  11. mapping$id <- 1:nrow(mapping)
  12.  
  13. # Fix goofy names in the source data
  14. names(SaleTrain)[3] <- "year_month_sale"
  15. names(RepairTrain)[3] <- "year_month_sale"
  16. names(RepairTrain)[4] <- "year_month_repair"
  17.  
  18. # Create derived variables
  19. repair_train <- transform(RepairTrain,
  20. year_repair = as.integer(substr(year_month_repair, 1, 4)),
  21. month_repair = as.integer(substr(year_month_repair, 6, 7)),
  22. year_sale = as.integer(substr(year_month_sale, 1, 4)),
  23. month_sale = as.integer(substr(year_month_sale, 6, 7)))
  24.  
  25. repair_train <- transform(repair_train,
  26. year_month_repair = year_repair * 100 + month_repair,
  27. year_month_sale = year_sale * 100 + month_sale,
  28. number_repair = pmax(number_repair, 0))
  29.  
  30. # Right now just projecting off the last six months in the experience period
  31. repair_train <- subset(repair_train, year_month_repair >= 200907)
  32.  
  33. # repair_train is at the individual repair level, roll it up to make predictions
  34. repair_agg <- aggregate(number_repair ~ module_category + component_category +
  35. year_month_repair, repair_train, sum)
  36. repair_agg$t <- repair_agg$year_month_repair - 200907
  37.  
  38. # Create a block_id for each module/component combination
  39. df_id <- unique(mapping[ , c("module_category", "component_category")])
  40. df_id$block_id <- 1:nrow(df_id)
  41. repair_agg <- merge(repair_agg, df_id)
  42.  
  43. # Function for fitting exponential decay models to repair counts
  44. linmod <- function(df) {
  45. lm(log(number_repair) ~ t, data = df)$coef
  46. }
  47.  
  48. # Compute a model for each module/component combination
  49. models <- ddply(repair_agg, .(block_id), linmod)
  50. avg <- with(repair_agg, tapply(number_repair, block_id, mean))
  51. ind <- models$t > -.001
  52. ind[is.na(ind)] <- FALSE
  53. models$t[ind] <- DEFAULT_DECAY
  54. models$"(Intercept)"[ind] <- log(avg[ind]) - 5*DEFAULT_DECAY
  55.  
  56. # Join model coefficients to test data and make predictions
  57. mapping <- merge(mapping, df_id, all.x = TRUE)
  58. mapping <- merge(mapping, models, all.x = TRUE )
  59. mapping <- rename(mapping, c("(Intercept)"="beta0", "t" = "beta1"))
  60. mapping$t <- with(mapping, (year - 2009) * 12 +(month - 7))
  61. mapping$pred <- round(with(mapping, round(exp(beta1*t + beta0), 1)), 0)
  62.  
  63. # NAs for model coefficients means we did not have enough non-zero
  64. # data for a fit, so 0 is the appropriate prediction
  65. mapping$pred[is.na(mapping$pred)] <- 0
  66.  
  67. # Two out of three zeros filter
  68. zero_check <- ddply(repair_agg, .(block_id), summarize,
  69. nonzero = sum(number_repair > 0 & year_month_repair >= 200910))
  70. mapping <- merge(mapping, zero_check, all.x = T)
  71. mapping$pred <- with(mapping, ifelse(!is.na(nonzero) & nonzero <= 1, 0, pred))
  72.  
  73. sub <- mapping[, c("id", "pred")]
  74. colnames(sub) <- c("id", "target")
  75. sub <- arrange(sub, id)
  76. write.csv(sub, "submission.csv", row.names=F)
  77.  
Runtime error #stdin #stdout #stderr 0.63s 22832KB
stdin
Standard input is empty
stdout
Standard output is empty
stderr
Loading required package: plyr
Error in setwd("~/asus") : cannot change working directory
Execution halted