# Kaggle Walmart recruiting competition 2014-02-20 to 2014-05-05. WORKING_DIRECTORY = "~/walmart" options(stringsAsFactors = FALSE) setwd(WORKING_DIRECTORY) library(Hmisc) # Hmisc is first so its summarize function does not mask plyr's library(plyr) library(testthat) library(lubridate) library(stringr) trend_sales <- function(v_sales, v_id, v_dt, id_num, trend_fctr) { # Apply a trend factor to the historical sales, moving them to the # beginning of the test period. # # Args: # v_sales: Vector of all sales in the test set. # v_id: Vector of all store or department ids in the test set. # v_dt: Vector of all dates in the test set. # id_num: The store or department for which sales are to be trended. # trend_fctr: The historical (not prospective) trend factor. # # Returns: # The revised sales vector with trend applied to the # components corresponding to id_num. ind <- which(v_id == id_num) wks_between <- as.integer(difftime(v_dt[ind], min(v_dt[ind]), units="weeks")) fctr <- trend_fctr^(1/52 * (52 - wks_between)) v_sales[ind] <- round(v_sales[ind] * fctr, 2) return(v_sales) } blend_weeks <- function(next_yr_dt, coef1 = NULL, coef2 = NULL) { # Given a date from the test set, the week ending on the corresponding date # in the training set will usually straddle two training weeks. This function # calculates an appropriate weighted average of the train weeks for # predicting the test week. # # Args: # next_year_dt: An end of week date (must be a Friday) from the test set # coef1, coef2: Specify the weights rather than calculating them. Not used. # # Returns: # A data frame with the test set id and predicted sales for next_yr_dt. # # Note: # Dataframes test and train are used globally and are referenced within the # blend_weeks function, although not passed as arguments. stopifnot(wday(next_yr_dt) == 6) # End of week must be a Friday. dt <- next_yr_dt - years(1) stopifnot(wday(dt) != 6) days_to_friday <- (13 - wday(dt)) %% 7 next_friday <- dt + days(days_to_friday) prev_friday <- next_friday - days(7) stopifnot(wday(next_friday) == 6) stopifnot(wday(prev_friday) == 6) df1 <- subset(train, dt == next_friday) df2 <- subset(train, dt == prev_friday) df_valid <- subset(test, dt == next_yr_dt)[, c("Store", "Dept")] df_both <- merge(df1[, 1:4], df2[, 1:4], by = c("Store", "Dept"), all = TRUE) df_both <- merge(df_valid, df_both, by = c("Store", "Dept"), all.x = T) df_both[, c("sales.x", "sales.y")] <- Hmisc::impute(df_both[, c("sales.x", "sales.y")], 0) if(is.null(coef1)) coef1 <- 1 - days_to_friday/7 if(is.null(coef2)) coef2 <- days_to_friday/7 blended_sales <- round(with(df_both, coef1 * sales.x + coef2 * sales.y), 0) Id <- with(df_both, paste(Store, Dept, next_yr_dt, sep = "_")) df_ans <- data.frame(Id = Id, sales = blended_sales) return(df_ans) } # Read and validate the data -------------------------------------------------- train <- readRDS("train.rds") # Training data covers 2010-02-05 to 2012-11-01 test <- readRDS("test.rds") # Test data covers 2012-11-02 to 2013-07-26 expect_equal(nrow(train), 421570) expect_equal(nrow(test), 115064) expect_equal(with(train, length(unique(paste(Store, Dept, Date)))), nrow(train)) expect_equal(with(test, length(unique(paste(Store, Dept, Date)))), nrow(test)) # Create derived variables ---------------------------------------------------- train <- mutate(train, dt = ymd(Date), yr = year(dt), wk = week(dt)) train <- rename(train, replace = c("Weekly_Sales" = "sales")) test <- mutate(test, dt = ymd(Date), yr = year(dt), wk = week(dt), prior_yr = yr - 1) # Map weeks of test period to corresponding weeks in train period ------------- # Week Mapping Adjustments: # Thanksgiving 2012 is in week 47, Thanksgiving 2011 in week 48, # thus 47 is replaced with 48 and 48 is replaced by 49. # # Easter 2013 is on March 31 (week 13). # Model week after Easter (14) by week after Easter (15). # For Easter week wound up just doing the same blending as for other weeks. test$wk <- plyr::mapvalues(test$wk, from = c(47, 48, 14), to = c(48, 49, 15)) # Make initial predictions ---------------------------------------------------- # Construct the initial test set predictions (just a merge with train, lagging # the test set by one year). ans <- merge(test, train, by.x = c("Store", "Dept", "prior_yr", "wk"), by.y = c("Store", "Dept", "yr", "wk"), all.x = TRUE) ans$sales[is.na(ans$sales)] <- 0 ans <- ans[, c("Store", "Dept", "Date.x", "sales")] ans$Id <- with(ans, paste(Store, Dept, Date.x, sep = "_")) # Week blending adjustments --------------------------------------------------- # Remove records in the test set that will be replaced by records derived # from blending. UNBLENDED_DATES <- c("2012-11-23", "2012-11-30", "2013-04-05") BLEND_DATES <- setdiff(as.character(ymd("2012-11-02") + weeks(0:38)), UNBLENDED_DATES) ans <- subset(ans, !(Date.x %in% BLEND_DATES)) sub <- ans[, c("Id", "sales")] # Calculate the blended weeks and add them back to sub using plyr::rbind.fill. blended_weeks <- plyr::rbind.fill(lapply(ymd(BLEND_DATES), blend_weeks)) sub <- rbind(sub, blended_weeks) # Reconstruct date, store, and department from the submission ----------------- # (awkward - could be cleaned up) dt <- ymd(str_extract(sub$Id, ".{10}$" )) store <- str_extract(sub$Id, "[0-9]+") dept <- substr(str_extract(sub$Id, "_[0-9]+"), 2, 3) # Make the trend adjustments (geometric mean of quarters). -------------------- store_trend_data <- list(c(1, 1.01), c(2, 1.01), c(3, 1.07), c(4, 1.02), c(5, 1.05), c(6, 1.01), c(7, 1.03), c(8, 1.00), c(9, 1.01), c(10, 0.97), c(11, 1.00), c(12, 0.99), c(13, 1.01), c(14, 0.85), c(15, 0.95), c(16, 0.99), c(17, 1.04), c(18, 1.03), c(19, 0.96), c(20, 0.99), c(21, 0.90), c(22, 0.97), c(23, 1), c(24, 0.99), c(25, 1.00), c(26, 1.00), c(27, 0.94), c(28, 0.95), c(29, 0.98), c(30, 1.01), c(31, 0.96), c(32, 0.99), c(33, 1.04), c(34, 1.01), c(35, 1.00), c(36, 0.80), c(37, 0.97), c(38, 1.10), c(39, 1.07), c(40, 0.99), c(41, 1.04), c(42, 1.00), c(43, 0.97), c(44, 1.08), c(45, 0.97)) for(v in store_trend_data) { sub$sales <- trend_sales(sub$sales, store, dt, v[1], v[2]) } dept_trend_data <- list(c(1, 0.96), c(2, 0.98), c(3, 1.01), c(4, 1), c(5, 0.91), c(6, 0.79), c(7, 0.99), c(8, 0.99), c(9, 1.03), c(10, 0.99), c(11, 0.98), c(12, 0.98), c(13, 0.98), c(14, 1.02), c(16, 0.95), c(17, 0.97), c(18, 0.87), c(19, 1.06), c(20, 0.98), c(21, 0.94), c(22, 1.01), c(23, 1.02), c(24, 1), c(25, 0.96), c(26, 0.96), c(27, 1.02), c(28, 0.89), c(29, 1.02), c(30, 0.92), c(31, 0.9), c(32, 0.97), c(33, 0.99), c(34, 1.02), c(35, 0.92), c(36, 0.79), c(37, 0.97), c(38, 0.98), c(40, 1.01), c(41, 0.94), c(42, 1.01), c(44, 1.02), c(45, 0.53), c(46, 0.99), c(48, 1.96), c(49, 0.96), c(50, 0.97), c(52, 0.93), c(54, 0.54), c(55, 0.83), c(56, 0.93), c(58, 1.13), c(59, 0.7), c(60, 1.02), c(65, 1.09), c(67, 1.02), c(71, 0.98), c(72, 0.96), c(74, 0.97), c(79, 0.98), c(80, 0.96), c(81, 0.98), c(82, 1.02), c(83, 1.01), c(85, 0.9), c(87, 1.14), c(90, 0.98), c(91, 0.98), c(92, 1.04), c(93, 1.02), c(94, 0.96), c(95, 0.99), c(96, 1.04), c(97, 0.97), c(98, 0.95), c(99, 1.19)) for(v in dept_trend_data) { sub$sales <- trend_sales(sub$sales, dept, dt, v[1], v[2]) } # Save the submission --------------------------------------------------------- sub <- sub[, c("Id", "sales")] names(sub) <- c("Id", "Weekly_Sales") sub <- arrange(sub, Id) expect_equal(nrow(sub), 115064) z <- gzfile("submission.csv.gz") write.csv(sub, z, row.names = FALSE)