## ----install and load---------------------------------------------- install.packages('tidyverse') install.packages("sparklyr") spark_install() install.packages("titanic") install.packages('SparkR') sc <- spark_connect(master = "local") library(sparklyr) library(dplyr) library(tidyr) library(titanic) library(ggplot2) library(purrr) library(jsonlite) library(readr) all_data_tbl <- spark_read_csv(sc, name="all_data", "final_data.csv") ## ----partition----------------------------------------------------------- # Partition the data partition <- all_data_tbl %>% mutate(Embassador = as.double(Embassador)) %>% sdf_partition(train = 0.75, test = 0.25, seed = 8585) # Create table references train_tbl <- partition$train test_tbl <- partition$test ## ----train--------------------------------------------------------------- # Model survival as a function of several predictors ml_formula <- formula(Embassador ~ Total_interactions + Total_Boxes_per_user + Pieces_consumed_per_week + Pieces_consumed_during_this_order + Price_of_this_order + No_orders_client + Order_duration + Date_difference + Avg_boxes_per_week + Boxes_during_this_order) # Train a logistic regression model (ml_log <- ml_logistic_regression(train_tbl, ml_formula)) ## ----ml------------------------------------------------------------------ ## Decision Tree ml_dt <- ml_decision_tree(train_tbl, ml_formula) ## Random Forest ml_rf <- ml_random_forest(train_tbl, ml_formula) ## Gradient Boosted Tree ml_gbt <- ml_gradient_boosted_trees(train_tbl, ml_formula) ## Naive Bayes ml_nb <- ml_naive_bayes(train_tbl, ml_formula) ## Neural Network ml_nn <- ml_multilayer_perceptron(train_tbl, ml_formula, layers = c(16,15,12)) ## ----score--------------------------------------------------------------- # Bundle the modelss into a single list object ml_models <- list( "Logistic" = ml_log, "Decision Tree" = ml_dt, "Random Forest" = ml_rf, "Gradient Boosted Trees" = ml_gbt, "Naive Bayes" = ml_nb, "Neural Net" = ml_nn ) # Create a function for scoring score_test_data <- function(model, data=test_tbl){ pred <- sdf_predict(model, data) select(pred, Embassador, prediction) } # Score all the models ml_score <- lapply(ml_models, score_test_data) ml_score ## ----auc----------------------------------------------------------------- # Function for calculating accuracy calc_accuracy <- function(data, cutpoint = 0.5){ data %>% mutate(prediction = if_else(prediction > cutpoint, 1.0, 0.0)) %>% ml_classification_eval("prediction", "Embassador", "accuracy") } # Calculate AUC and accuracy perf_metrics <- data.frame( model = names(ml_score), AUC = 100 * sapply(ml_score, ml_binary_classification_eval, "Embassador", "prediction"), Accuracy = 100 * sapply(ml_score, calc_accuracy), row.names = NULL, stringsAsFactors = FALSE) write_csv(perf_metrics, path = "new.csv") # Plot results gather(perf_metrics, metric, value, AUC, Accuracy) %>% ggplot(aes(reorder(model, value), value, fill = metric)) + geom_bar(stat = "identity", position = "dodge") + coord_flip() + xlab("") + ylab("Percent") + ggtitle("Performance Metrics") ## ----time, warning = FALSE----------------------------------------------- # Number of reps per model n <- 10 # Format model formula as character format_as_character <- function(x){ x <- paste(deparse(x), collapse = "") x <- gsub("\\s+", " ", paste(x, collapse = "")) x } # Create model statements with timers format_statements <- function(y){ y <- format_as_character(y[[".call"]]) y <- gsub('ml_formula', ml_formula_char, y) y <- paste0("system.time(", y, ")") y } # Convert model formula to character ml_formula_char <- format_as_character(ml_formula) # Create n replicates of each model statements with timers all_statements <- sapply(ml_models, format_statements) %>% rep(., n) %>% parse(text = .) # Evaluate all model statements res <- map(all_statements, eval) # Compile results result <- data.frame(model = rep(names(ml_models), n), time = sapply(res, function(x){as.numeric(x["elapsed"])})) # Plot result %>% ggplot(aes(time, reorder(model, time))) + geom_boxplot() + geom_jitter(width = 0.4, aes(colour = model)) + scale_colour_discrete(guide = FALSE) + xlab("Seconds") + ylab("") + ggtitle("Model training times")