require(plyr)
options(stringsAsFactors = FALSE)
setwd("~/asus")
DEFAULT_DECAY <- -.05
SaleTrain <- read.csv("SaleTrain.csv")
RepairTrain <- read.csv("RepairTrain.csv")
mapping <- read.csv("Output_TargetID_Mapping.csv")
mapping$id <- 1:nrow(mapping)
# Fix goofy names in the source data
names(SaleTrain)[3] <- "year_month_sale"
names(RepairTrain)[3] <- "year_month_sale"
names(RepairTrain)[4] <- "year_month_repair"
# Create derived variables
repair_train <- transform(RepairTrain,
year_repair = as.integer(substr(year_month_repair, 1, 4)),
month_repair = as.integer(substr(year_month_repair, 6, 7)),
year_sale = as.integer(substr(year_month_sale, 1, 4)),
month_sale = as.integer(substr(year_month_sale, 6, 7)))
repair_train <- transform(repair_train,
year_month_repair = year_repair * 100 + month_repair,
year_month_sale = year_sale * 100 + month_sale,
number_repair = pmax(number_repair, 0))
# Right now just projecting off the last six months in the experience period
repair_train <- subset(repair_train, year_month_repair >= 200907)
# repair_train is at the individual repair level, roll it up to make predictions
repair_agg <- aggregate(number_repair ~ module_category + component_category +
year_month_repair, repair_train, sum)
repair_agg$t <- repair_agg$year_month_repair - 200907
# Create a block_id for each module/component combination
df_id <- unique(mapping[ , c("module_category", "component_category")])
df_id$block_id <- 1:nrow(df_id)
repair_agg <- merge(repair_agg, df_id)
# Function for fitting exponential decay models to repair counts
linmod <- function(df) {
lm
(log(number_repair
) ~ t
, data
= df
)$coef
}
# Compute a model for each module/component combination
models <- ddply(repair_agg, .(block_id), linmod)
avg <- with(repair_agg, tapply(number_repair, block_id, mean))
ind <- models$t > -.001
ind[is.na(ind)] <- FALSE
models$t[ind] <- DEFAULT_DECAY
models$
"(Intercept)"[ind
] <- log(avg
[ind
]) - 5*DEFAULT_DECAY
# Join model coefficients to test data and make predictions
mapping <- merge(mapping, df_id, all.x = TRUE)
mapping <- merge(mapping, models, all.x = TRUE )
mapping
<- rename(mapping
, c
("(Intercept)"="beta0", "t" = "beta1"))mapping$t <- with(mapping, (year - 2009) * 12 +(month - 7))
mapping$pred
<- round
(with
(mapping
, round
(exp(beta1
*t
+ beta0
), 1)), 0)
# NAs for model coefficients means we did not have enough non-zero
# data for a fit, so 0 is the appropriate prediction
mapping$pred[is.na(mapping$pred)] <- 0
# Two out of three zeros filter
zero_check <- ddply(repair_agg, .(block_id), summarize,
nonzero = sum(number_repair > 0 & year_month_repair >= 200910))
mapping <- merge(mapping, zero_check, all.x = T)
mapping$pred <- with(mapping, ifelse(!is.na(nonzero) & nonzero <= 1, 0, pred))
sub <- mapping[, c("id", "pred")]
colnames(sub) <- c("id", "target")
sub <- arrange(sub, id)
write.csv(sub, "submission.csv", row.names=F)
cmVxdWlyZShwbHlyKQpvcHRpb25zKHN0cmluZ3NBc0ZhY3RvcnMgPSBGQUxTRSkKc2V0d2QoIn4vYXN1cyIpCgpERUZBVUxUX0RFQ0FZIDwtIC0uMDUKClNhbGVUcmFpbiA8LSByZWFkLmNzdigiU2FsZVRyYWluLmNzdiIpClJlcGFpclRyYWluIDwtIHJlYWQuY3N2KCJSZXBhaXJUcmFpbi5jc3YiKQptYXBwaW5nIDwtIHJlYWQuY3N2KCJPdXRwdXRfVGFyZ2V0SURfTWFwcGluZy5jc3YiKQoKbWFwcGluZyRpZCA8LSAxOm5yb3cobWFwcGluZykKCiMgRml4IGdvb2Z5IG5hbWVzIGluIHRoZSBzb3VyY2UgZGF0YQpuYW1lcyhTYWxlVHJhaW4pWzNdIDwtICJ5ZWFyX21vbnRoX3NhbGUiCm5hbWVzKFJlcGFpclRyYWluKVszXSA8LSAieWVhcl9tb250aF9zYWxlIgpuYW1lcyhSZXBhaXJUcmFpbilbNF0gPC0gInllYXJfbW9udGhfcmVwYWlyIgoKIyBDcmVhdGUgZGVyaXZlZCB2YXJpYWJsZXMKcmVwYWlyX3RyYWluIDwtIHRyYW5zZm9ybShSZXBhaXJUcmFpbiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgeWVhcl9yZXBhaXIgID0gYXMuaW50ZWdlcihzdWJzdHIoeWVhcl9tb250aF9yZXBhaXIsIDEsIDQpKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgbW9udGhfcmVwYWlyID0gYXMuaW50ZWdlcihzdWJzdHIoeWVhcl9tb250aF9yZXBhaXIsIDYsIDcpKSwKICAgICAgICAgICAgICAgICAgICAgICAgICB5ZWFyX3NhbGUgICAgPSBhcy5pbnRlZ2VyKHN1YnN0cih5ZWFyX21vbnRoX3NhbGUsIDEsIDQpKSwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgbW9udGhfc2FsZSAgID0gYXMuaW50ZWdlcihzdWJzdHIoeWVhcl9tb250aF9zYWxlLCA2LCA3KSkpCgpyZXBhaXJfdHJhaW4gPC0gdHJhbnNmb3JtKHJlcGFpcl90cmFpbiwgCiAgICAgICAgICAgICAgICAgICAgICAgICAgeWVhcl9tb250aF9yZXBhaXIgPSB5ZWFyX3JlcGFpciAqIDEwMCArIG1vbnRoX3JlcGFpciwKICAgICAgICAgICAgICAgICAgICAgICAgICB5ZWFyX21vbnRoX3NhbGUgPSB5ZWFyX3NhbGUgKiAxMDAgKyBtb250aF9zYWxlLAogICAgICAgICAgICAgICAgICAgICAgICAgIG51bWJlcl9yZXBhaXIgPSBwbWF4KG51bWJlcl9yZXBhaXIsIDApKQoKIyBSaWdodCBub3cganVzdCBwcm9qZWN0aW5nIG9mZiB0aGUgbGFzdCBzaXggbW9udGhzIGluIHRoZSBleHBlcmllbmNlIHBlcmlvZApyZXBhaXJfdHJhaW4gPC0gc3Vic2V0KHJlcGFpcl90cmFpbiwgeWVhcl9tb250aF9yZXBhaXIgPj0gMjAwOTA3KQoKIyByZXBhaXJfdHJhaW4gaXMgYXQgdGhlIGluZGl2aWR1YWwgcmVwYWlyIGxldmVsLCByb2xsIGl0IHVwIHRvIG1ha2UgcHJlZGljdGlvbnMKcmVwYWlyX2FnZyA8LSBhZ2dyZWdhdGUobnVtYmVyX3JlcGFpciB+IG1vZHVsZV9jYXRlZ29yeSArIGNvbXBvbmVudF9jYXRlZ29yeSArCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIHllYXJfbW9udGhfcmVwYWlyLCByZXBhaXJfdHJhaW4sIHN1bSkKcmVwYWlyX2FnZyR0IDwtIHJlcGFpcl9hZ2ckeWVhcl9tb250aF9yZXBhaXIgLSAyMDA5MDcKCiMgQ3JlYXRlIGEgYmxvY2tfaWQgZm9yIGVhY2ggbW9kdWxlL2NvbXBvbmVudCBjb21iaW5hdGlvbgpkZl9pZCA8LSB1bmlxdWUobWFwcGluZ1sgLCBjKCJtb2R1bGVfY2F0ZWdvcnkiLCAiY29tcG9uZW50X2NhdGVnb3J5IildKQpkZl9pZCRibG9ja19pZCA8LSAxOm5yb3coZGZfaWQpCnJlcGFpcl9hZ2cgPC0gbWVyZ2UocmVwYWlyX2FnZywgZGZfaWQpCgojIEZ1bmN0aW9uIGZvciBmaXR0aW5nIGV4cG9uZW50aWFsIGRlY2F5IG1vZGVscyB0byByZXBhaXIgY291bnRzCmxpbm1vZCA8LSBmdW5jdGlvbihkZikgewogIGxtKGxvZyhudW1iZXJfcmVwYWlyKSB+IHQsIGRhdGEgPSBkZikkY29lZgp9CgojIENvbXB1dGUgYSBtb2RlbCBmb3IgZWFjaCBtb2R1bGUvY29tcG9uZW50IGNvbWJpbmF0aW9uCm1vZGVscyA8LSBkZHBseShyZXBhaXJfYWdnLCAuKGJsb2NrX2lkKSwgbGlubW9kKQphdmcgPC0gd2l0aChyZXBhaXJfYWdnLCB0YXBwbHkobnVtYmVyX3JlcGFpciwgYmxvY2tfaWQsIG1lYW4pKQppbmQgPC0gbW9kZWxzJHQgPiAtLjAwMQppbmRbaXMubmEoaW5kKV0gPC0gRkFMU0UKbW9kZWxzJHRbaW5kXSA8LSBERUZBVUxUX0RFQ0FZCm1vZGVscyQiKEludGVyY2VwdCkiW2luZF0gPC0gbG9nKGF2Z1tpbmRdKSAtIDUqREVGQVVMVF9ERUNBWQoKIyBKb2luIG1vZGVsIGNvZWZmaWNpZW50cyB0byB0ZXN0IGRhdGEgYW5kIG1ha2UgcHJlZGljdGlvbnMKbWFwcGluZyA8LSBtZXJnZShtYXBwaW5nLCBkZl9pZCwgYWxsLnggPSBUUlVFKQptYXBwaW5nIDwtIG1lcmdlKG1hcHBpbmcsIG1vZGVscywgYWxsLnggPSBUUlVFICkKbWFwcGluZyA8LSByZW5hbWUobWFwcGluZywgYygiKEludGVyY2VwdCkiPSJiZXRhMCIsICJ0IiA9ICJiZXRhMSIpKQptYXBwaW5nJHQgPC0gd2l0aChtYXBwaW5nLCAoeWVhciAtIDIwMDkpICogMTIgKyhtb250aCAtIDcpKQptYXBwaW5nJHByZWQgPC0gcm91bmQod2l0aChtYXBwaW5nLCByb3VuZChleHAoYmV0YTEqdCArIGJldGEwKSwgMSkpLCAwKQoKIyBOQXMgZm9yIG1vZGVsIGNvZWZmaWNpZW50cyBtZWFucyB3ZSBkaWQgbm90IGhhdmUgZW5vdWdoIG5vbi16ZXJvCiMgZGF0YSBmb3IgYSBmaXQsIHNvIDAgaXMgdGhlIGFwcHJvcHJpYXRlIHByZWRpY3Rpb24KbWFwcGluZyRwcmVkW2lzLm5hKG1hcHBpbmckcHJlZCldIDwtIDAKCiMgVHdvIG91dCBvZiB0aHJlZSB6ZXJvcyBmaWx0ZXIKemVyb19jaGVjayA8LSBkZHBseShyZXBhaXJfYWdnLCAuKGJsb2NrX2lkKSwgc3VtbWFyaXplLCAKICAgICAgICAgICAgICAgICAgICBub256ZXJvID0gc3VtKG51bWJlcl9yZXBhaXIgPiAwICYgeWVhcl9tb250aF9yZXBhaXIgPj0gMjAwOTEwKSkKbWFwcGluZyA8LSBtZXJnZShtYXBwaW5nLCB6ZXJvX2NoZWNrLCBhbGwueCA9IFQpCm1hcHBpbmckcHJlZCA8LSB3aXRoKG1hcHBpbmcsIGlmZWxzZSghaXMubmEobm9uemVybykgJiBub256ZXJvIDw9IDEsIDAsIHByZWQpKQoKc3ViIDwtIG1hcHBpbmdbLCBjKCJpZCIsICJwcmVkIildCmNvbG5hbWVzKHN1YikgPC0gYygiaWQiLCAidGFyZ2V0IikKc3ViIDwtIGFycmFuZ2Uoc3ViLCBpZCkKd3JpdGUuY3N2KHN1YiwgInN1Ym1pc3Npb24uY3N2Iiwgcm93Lm5hbWVzPUYpCg==