# Best R Packages for Machine Learning (original) (raw)

# Setup: install only the required packages that are not yet available, so
# re-running the script does not reinstall everything (and does not need
# network access once the packages are present).
required_pkgs <- c(
  "data.table", "dplyr", "ggplot2", "caret", "xgboost", "e1071", "cowplot"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

# Attach the packages used by the script.
library(data.table)
library(dplyr)
library(ggplot2)
library(caret)
library(xgboost)
library(e1071)
library(cowplot)

# Add an all-NA Item_Outlet_Sales column to `test` (presumably so that it has
# the same columns as `train`), then stack the rows of `train` on top of the
# rows of `test` into a single table.
# NOTE(review): `train` and `test` are not created in the visible code; they
# are assumed to be data.tables loaded earlier -- confirm.
test[, Item_Outlet_Sales := NA]
combi <- rbind(train, test)

# Fill every missing Item_Weight with the mean of the non-missing weights
# recorded for the same Item_Identifier. If an item has no recorded weight at
# all, mean() of an empty vector yields NaN for that row.
#
# The loop works on plain vectors and writes the column back once, instead of
# re-assigning the whole table on every iteration.
item_ids <- combi$Item_Identifier
item_weight <- combi$Item_Weight
missing_index <- which(is.na(item_weight))
for (i in missing_index) {
  same_item <- item_ids == item_ids[i]
  item_weight[i] <- mean(item_weight[same_item], na.rm = TRUE)
}
combi$Item_Weight <- item_weight

# Replace every zero Item_Visibility with the mean visibility of the same
# Item_Identifier.
#
# The loop works on plain vectors and writes the column back once, instead of
# re-assigning the whole table on every iteration. Rows are still processed
# in order, so a value replaced earlier takes part in the means computed for
# later rows of the same item.
# NOTE(review): only NA is dropped from the mean, so zeros that have not been
# replaced yet still pull it down -- confirm this is intended rather than a
# mean over the non-zero values only.
item_ids <- combi$Item_Identifier
item_visibility <- combi$Item_Visibility
zero_index <- which(item_visibility == 0)
for (i in zero_index) {
  same_item <- item_ids == item_ids[i]
  item_visibility[i] <- mean(item_visibility[same_item], na.rm = TRUE)
}
combi$Item_Visibility <- item_visibility

# Encode the two outlet descriptors as integers-valued numeric codes.

# Outlet_Size: "Small" -> 0, "Medium" -> 1, every other non-NA value -> 2
# (NA stays NA).
combi[, Outlet_Size_num := ifelse(
  Outlet_Size == "Small", 0,
  ifelse(Outlet_Size == "Medium", 1, 2)
)]

# Outlet_Location_Type: "Tier 3" -> 0, "Tier 2" -> 1, every other non-NA
# value -> 2 (NA stays NA).
combi[, Outlet_Location_Type_num := ifelse(
  Outlet_Location_Type == "Tier 3", 0,
  ifelse(Outlet_Location_Type == "Tier 2", 1, 2)
)]

# Drop the original text columns now that the numeric codes exist.
combi[, c("Outlet_Size", "Outlet_Location_Type") := NULL]

# One-hot encode the remaining columns with caret::dummyVars().
# Item_Identifier, Outlet_Establishment_Year and Item_Type are excluded from
# the encoder input; the input subset is built once and reused for both
# fitting the encoder and applying it.
ohe_input <- combi[
  , -c("Item_Identifier", "Outlet_Establishment_Year", "Item_Type")
]
ohe_1 <- dummyVars("~.", data = ohe_input, fullRank = TRUE)
ohe_df <- data.table(predict(ohe_1, ohe_input))

# Rebuild `combi` as Item_Identifier followed by the encoded columns; the two
# other excluded columns are not carried over.
combi <- cbind(combi[, "Item_Identifier"], ohe_df)

# Inspect the skewness of Item_Visibility. skewness() is not defined in this
# script; presumably it comes from the e1071 package attached above -- confirm.
skewness(combi$Item_Visibility)
# NOTE(review): no visible code creates a `price_per_unit_wt` column, so this
# `$` lookup may return NULL -- confirm the column is built elsewhere or that
# a feature-engineering step is missing.
skewness(combi$price_per_unit_wt)

# Replace Item_Visibility in place with log(Item_Visibility + 1).
combi[, Item_Visibility := log(Item_Visibility + 1)]

# Centre and scale every numeric column except the target Item_Outlet_Sales.

# Positions (named by column) of the numeric columns of `combi`.
# vapply() guarantees a logical vector whatever the shape of `combi`.
num_vars <- which(vapply(combi, is.numeric, logical(1)))
num_vars_names <- names(num_vars)

# Columns to normalise: all numeric columns but the target.
scale_cols <- setdiff(num_vars_names, "Item_Outlet_Sales")
combi_numeric <- combi[, scale_cols, with = FALSE]

# Fit the centring/scaling transformation and apply it.
prep_num <- preProcess(combi_numeric, method = c("center", "scale"))
combi_numeric_norm <- predict(prep_num, combi_numeric)

# Swap the raw numeric columns for their normalised versions.
combi[, (scale_cols) := NULL]
combi <- cbind(combi, combi_numeric_norm)

# Split `combi` back into its two parts: the first nrow(train) rows become
# the new `train`, the remaining rows become the new `test`.
# The row count is captured before `train` is overwritten, and seq_len() is
# used so that an empty partition yields zero rows instead of the backwards
# sequence that `1:0` or `(n + 1):n` would produce.
n_train <- nrow(train)
train <- combi[seq_len(n_train)]
test <- combi[n_train + seq_len(nrow(combi) - n_train)]

# Remove the target column from the test part.
test[, Item_Outlet_Sales := NULL]

# Booster parameters passed to xgb.cv() / xgb.train() below.
# "reg:squarederror" is the current name of the squared-error regression
# objective; the former alias "reg:linear" is deprecated in xgboost.
param_list <- list(
  objective = "reg:squarederror",
  eta = 0.01,
  gamma = 1,
  max_depth = 6,
  subsample = 0.8,
  colsample_bytree = 0.5
)

# Build the xgboost input matrices.
# Training matrix: every column of `train` except the identifier and the
# target; the target Item_Outlet_Sales is supplied as the label.
Dtrain <- xgb.DMatrix(
  data = as.matrix(train[, -c("Item_Identifier", "Item_Outlet_Sales")]),
  label = train$Item_Outlet_Sales
)

# Test matrix: every column of `test` except the identifier; no label.
Dtest <- xgb.DMatrix(data = as.matrix(test[, -c("Item_Identifier")]))

# 5-fold cross-validation with `param_list` on the training matrix: up to
# 1000 rounds, progress printed every 10 rounds, stopping early after 30
# rounds without improvement (maximize = FALSE: lower metric is better).
# The seed is fixed before the call so the run is repeatable.
set.seed(112)
xgbcv <- xgb.cv(
  params = param_list,
  data = Dtrain,
  nrounds = 1000,
  nfold = 5,
  print_every_n = 10,
  early_stopping_rounds = 30,
  maximize = FALSE
)

# Fit the final model on the full training matrix.
# NOTE(review): nrounds = 428 is hard-coded -- presumably the best iteration
# reported by the cross-validation run; confirm and re-derive it if the data
# or the parameters change.
xgb_model <- xgb.train(data = Dtrain, params = param_list, nrounds = 428)

# Show the fitted model.
xgb_model