Machine Learning with H2O

4 minute read

Introduction

Due to the proprietary nature of my work, the examples below are representative of recent work rather than actual production code.

H2O is an open-source platform that makes it easy for companies to leverage Big Data and deploy AI and deep learning to solve complex problems. In my experience, I have used H2O in the final data-preparation stage of the modeling process as well as for training, validation, and model selection. I have similar experience with sparklyr, and the two are comparable, but the robustness of H2O allows training on much larger datasets. Below is an example of a typical use case: taking a dataset, fitting multiple models to it, and determining which performs best.
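
For readers without access to an Enterprise Steam cluster, the same workflow can be run against a local H2O instance; a minimal sketch (the memory size is illustrative):

library(h2o)
h2o.init(max_mem_size = "4G") #start or connect to a single-node local cluster
h2o.ls()                      #empty on a fresh start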

Libraries

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#--| Call Libraries
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
library(h2o)
library(h2osteam)

library(dplyr)
library(sparklyr)
library(svDialogs)
library(RCurl)
library(ggplot2)

# Set the working directory to this script's location
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

Set Up Connection

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#--| Connection Setup
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
steam_url = "enter_your_steam_url_here"
id = 'user123' #enter your userID
cluster_name = "Example_H2O_Cluster" #customize as needed
h2o_version = "3.33.1.5511"
node_memory = "5G"
num_nodes = 2
key = 123456789 #example

password = dlgInput("Enter password", Sys.info()["user"])$res #prompt interactively so the password never lives in the script

source('encryption.R') #R file with the encryption helpers for the connection (defines cipher_text and decrypt())
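
encryption.R is site-specific and not shown; purely as a hypothetical stand-in (not the production routine, and not cryptographically secure), a decrypt() might look like the sketch below, assuming cipher_text is a raw vector produced by a matching encrypt():

# Hypothetical stand-in for the site-specific decrypt() -- XOR keystream,
# for illustration only; do not use for real secrets
decrypt <- function(cipher_text, key) {
  set.seed(key) #derive a repeatable keystream from the numeric key
  keystream <- sample(0:255, length(cipher_text), replace = TRUE)
  rawToChar(as.raw(bitwXor(as.integer(cipher_text), keystream)))
}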

conn = h2osteam.login(url = steam_url,
                      username = id,
                      password = decrypt(cipher_text, key),
                      verify_ssl = FALSE)

conf = h2osteam.start_h2o_cluster(
  conn = conn,
  cluster_name = paste(cluster_name, id, sep = "-"),
  node_memory = node_memory,
  num_nodes = num_nodes,
  h2o_version = h2o_version,
  yarn_queue = 'yourQueue',
  max_idle_time = 8,
  max_uptime = 14,
  profile_name = 'default-h2o'
)

h2o.connect(config = conf$connect_params)
h2o.ls()
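
Once connected, it is worth confirming the cluster came up with the requested resources; both calls below are part of the h2o package:

h2o.clusterInfo()   #version, number of nodes, total memory
h2o.clusterStatus() #per-node health and usage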


Read In Data

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#--| Read In Data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~

des = read.hive('desc example_schema.your_database')$col_name #read.hive is a custom helper that reads data from Hadoop/Hive
hdfsDatabase_loc = 'hdfs://location/for/your/database'

base_table_2023 = h2o.importFile(path = hdfsDatabase_loc, destination_frame="datatable_name", col.names = des)

# Perform Column Data Type Manipulation as needed
base_table_2023$col1 = as.factor(base_table_2023$col1)

# Final Creation of Target Variable
base_table_2023$response_var = ifelse(is.na(base_table_2023$response),0,base_table_2023$response)

# Summary of Response by Key Column for reference
col1_sum = h2o.group_by(data = base_table_2023,
                        by = 'col1',
                        sum("response_var"),
                        nrow("response_var", na="ignore"));col1_sum




Data Preparation Before Modeling

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#--| Data Preparation Before Modeling
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Summary of Class Distribution
minority_class_index_total = base_table_2023[,'response_var'] == 1
minority_class_total = base_table_2023[minority_class_index_total,]
nrow.H2OFrame(minority_class_total); nrow.H2OFrame(base_table_2023); nrow.H2OFrame(minority_class_total)/nrow.H2OFrame(base_table_2023) #minority count, total, minority share
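
An equivalent one-liner for the class distribution, computed on the cluster:

h2o.table(base_table_2023$response_var) #row counts per class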

# Create Test and Training Sets
partitions <- h2o.splitFrame(data = base_table_2023, ratios = 0.8, seed = 4321)

dt_train <- partitions[[1]]
dt_test <- partitions[[2]]

minority_class_index = dt_train[,'response_var'] == 1
minority_class = dt_train[minority_class_index,]

majority_class_index =  dt_train[,'response_var'] == 0
majority_class = dt_train[majority_class_index,]
n_split = nrow.H2OFrame(minority_class)

train_sum = h2o.group_by(data = dt_train, by = "response_var", nrow("response_var"));train_sum
n_majority = nrow.H2OFrame(majority_class)


# Example Balancing of Classes for Imbalanced Data:
# downsample the majority class to roughly the size of the minority class
sample_majority_class = h2o.splitFrame(data = majority_class, ratios = n_split/n_majority, seed = 4321)

sample_majority_class = sample_majority_class[[1]]

h2o.nrow(sample_majority_class)
h2o.describe(sample_majority_class)


combined_balanced = h2o.rbind(minority_class, sample_majority_class)

# Data Checks
balanced_sum = h2o.group_by(data = combined_balanced, by = "response_var", nrow("response_var"));balanced_sum
total_n = h2o.nrow(combined_balanced)

combined_balanced$response_var <- as.factor(combined_balanced$response_var)

df_splits <- h2o.splitFrame(data =combined_balanced, ratios = 0.8, seed = 4321)

train <- df_splits[[1]]
test <- df_splits[[2]]

predictors <- setdiff(colnames(train), "response_var")
response <- "response_var"

Model Construction and Evaluation

Logistic Regression

#~~~~~~~~~~~~~~~~~~~~~~~~
#--| Logistic Regression
#~~~~~~~~~~~~~~~~~~~~~~~~
#https://docs.h2o.ai/h2o/latest-stable/h2o-docs/training-models.html#classification-example

glm_fit1 <- h2o.glm(family = "binomial",
                    x = predictors,
                    y = response,
                    training_frame = train,
                    lambda = 0, #no regularization; required when compute_p_values = TRUE
                    compute_p_values = TRUE,
                    balance_classes = TRUE,
                    remove_collinear_columns = TRUE,
                    nfolds = 5)

# AUC of cross-validated holdout predictions
h2o.auc(glm_fit1, xval = TRUE)

# Prediction
glm_fit1_predict <- h2o.predict(object = glm_fit1, newdata = test)
h2o.head(glm_fit1_predict)

#Check Coefficient & Summary
#https://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html

h2o.coef(glm_fit1)

# Coefficients fitted on the standardized data (requires standardize=TRUE, which is on by default)
h2o.coef_norm(glm_fit1)

# Print the coefficients table
glm_fit1_summary <- glm_fit1@model$coefficients_table

# If Interested in only significant coefficients
#glm_fit1_summary$names[glm_fit1_summary$p_value <= 0.05]


h2o.varimp_plot(glm_fit1)
glm_fit1_varimp <- h2o.varimp(glm_fit1)

glm_fit1_perf <- h2o.performance(glm_fit1, test)
glm_fit1_perf

glm_fit1_perf_original <- h2o.performance(glm_fit1, dt_test)
glm_fit1_perf_original

pred_vs_actual_glm <- as.data.frame(h2o.cbind(test$response_var, glm_fit1_predict$predict, glm_fit1_predict$p1)) #actual, predicted class, and P(1) for the lift/gain step later



write.csv(glm_fit1_perf@metrics$cm$table,'lr_cm.csv')
write.csv(glm_fit1_summary, 'logregressionh2o.csv')

Random Forest

#~~~~~~~~~~~~~~~~~~~~~~~~
#--| Random Forest
#~~~~~~~~~~~~~~~~~~~~~~~~

# http://h2o-release.s3.amazonaws.com/h2o/rel-yu/2/docs-website/h2o-docs/data-science/drf.html

rf_fit1 <- h2o.randomForest(x = predictors,
                            y = response,
                            training_frame = train,
                            nfolds = 5,
                            balance_classes = TRUE,
                            seed = 1234)

# AUC of cross-validated holdout predictions
h2o.auc(rf_fit1, xval = TRUE)

# Prediction
rf_fit1_predict <- h2o.predict(object = rf_fit1, newdata = test)
h2o.head(rf_fit1_predict)

# Model performance
rf_fit1_summary <- rf_fit1@model$summary
h2o.varimp_plot(rf_fit1)
rf_fit1_varimp <- h2o.varimp(rf_fit1)

rf_fit1_perf <- h2o.performance(rf_fit1, test)
rf_fit1_perf

rf_fit1_perf_orig <- h2o.performance(rf_fit1, dt_test)
rf_fit1_perf_orig

pred_vs_actual_rf <- as.data.frame(h2o.cbind(test$response_var,rf_fit1_predict$predict, rf_fit1_predict$p1))


write.csv(rf_fit1_perf@metrics$cm$table, 'rf_cm.csv')
write.csv(rf_fit1_varimp, 'rf_varimph2o.csv')
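
Default settings are rarely the end of model selection; a minimal hyperparameter grid sketch (grid id and values are illustrative):

rf_grid <- h2o.grid("drf", #drf = distributed random forest
                    grid_id = "rf_grid1", #hypothetical id
                    x = predictors,
                    y = response,
                    training_frame = train,
                    hyper_params = list(ntrees = c(50, 100),
                                        max_depth = c(10, 20)),
                    seed = 1234)
h2o.getGrid("rf_grid1", sort_by = "auc", decreasing = TRUE)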

XGBoost

#~~~~~~~~~~~~~
#--| XGBoost
#~~~~~~~~~~~~~
xgb_fit1 <- h2o.xgboost(x = predictors,
                        y = response,
                        training_frame = train,
                        #validation_frame = test,
                        booster = "dart",
                        normalize_type = "tree",
                        seed = 1234)

# Cross-validated AUC would require nfolds in the fit above; with no
# cross-validation here, review training metrics instead
#h2o.auc(xgb_fit1, xval = TRUE) #only available when nfolds > 0
h2o.performance(xgb_fit1)

# Prediction
xgb_fit1_predict <- h2o.predict(object = xgb_fit1, newdata = test)
h2o.head(xgb_fit1_predict)

# Model performance
xgb_fit1_summary <- xgb_fit1@model$summary
h2o.varimp_plot(xgb_fit1)
xgb_fit1_varimp <- h2o.varimp(xgb_fit1)

xgb_fit1_perf <- h2o.performance(xgb_fit1, test)
xgb_fit1_perf

xgb_fit1_perf_orig <- h2o.performance(xgb_fit1, dt_test)
xgb_fit1_perf_orig

pred_vs_actual_xgb <- as.data.frame(h2o.cbind(test$response_var,xgb_fit1_predict$predict, xgb_fit1_predict$p1))

write.csv(xgb_fit1_perf@metrics$cm$table, 'xgb_cm.csv')
write.csv(xgb_fit1_varimp, 'xgb_varimph2o.csv')
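
With all three models scored on the same held-out frame, a quick side-by-side of test-set AUCs helps pick a winner; a minimal sketch using the performance objects above:

auc_comparison <- data.frame(model = c("GLM", "Random Forest", "XGBoost"),
                             test_auc = c(h2o.auc(glm_fit1_perf),
                                          h2o.auc(rf_fit1_perf),
                                          h2o.auc(xgb_fit1_perf)))
auc_comparison[order(-auc_comparison$test_auc), ] #best model first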


Lift and Gain Assessment

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#--| Lift and Gain Calculation
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
lift <- function(depVar, predCol, ratingCol, groups=10) {
  # Calculate Lift and Gain of a predictive model:
  #   depVar    - actual response (0/1)
  #   predCol   - predicted class (0/1)
  #   ratingCol - predicted probability used to rank observations

  if(is.factor(depVar)) depVar <- as.integer(as.character(depVar))
  if(is.factor(predCol)) predCol <- as.integer(as.character(predCol))

  helper = data.frame(depVar, predCol, ratingCol)
  helper = as.data.frame(helper %>% mutate(rank = dense_rank(desc(ratingCol))) %>% arrange(rank))
  # bucket by predicted probability so each decile holds the next-best-rated rows
  helper[,"bucket"] = ntile(-helper[,"ratingCol"], groups)

  gaintable = helper %>% group_by(bucket) %>%
    summarise(total = n(),
              totalresp = sum(depVar, na.rm = TRUE)) %>%
    mutate(Cumresp = cumsum(totalresp),
           Gain = Cumresp/sum(totalresp)*100,
           Cumlift = Gain/(bucket*(100/groups)))

  return(gaintable)

}
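
For reference: Gain at bucket k is the cumulative percentage of all responders captured in the top k buckets (Cumresp divided by total responders, times 100), and Cumlift divides that by the percentage of the population contacted (k * 100/groups). A Cumlift of 2 in the first decile means the model finds twice as many responders there as random targeting would.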

# Logistic Regression
dt_glm = lift(pred_vs_actual_glm$response_var, pred_vs_actual_glm$predict, pred_vs_actual_glm$p1, groups = 10)
dt_glm
# Random Forest
dt_rf = lift(pred_vs_actual_rf$response_var, pred_vs_actual_rf$predict, pred_vs_actual_rf$p1, groups = 10)
dt_rf
# XGBoost
dt_xgb = lift(pred_vs_actual_xgb$response_var, pred_vs_actual_xgb$predict, pred_vs_actual_xgb$p1, groups = 10)
dt_xgb
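
To compare all three models on one chart, the gain tables can be stacked; a short sketch (labels are illustrative):

gain_all = rbind(transform(as.data.frame(dt_glm), model = "GLM"),
                 transform(as.data.frame(dt_rf), model = "Random Forest"),
                 transform(as.data.frame(dt_xgb), model = "XGBoost"))
ggplot(data = gain_all, aes(x = bucket, y = Gain, color = model)) +
  geom_line() +
  ggtitle("Gain by Decile Across Models") +
  xlab("Decile") + ylab("Gain (%)")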

# Example Gain Chart
ggplot(data = dt_xgb, aes(x = bucket))+
  geom_line(aes(y = Gain/100), color = "darkred") +
  geom_abline(slope = 1/10, intercept = 0) + #random-targeting baseline: gain of bucket/groups
  ggtitle("Gain Chart for XGBoost") +
  xlab("Decile") + ylab("Gain (proportion)")

# Example Lift Chart
ggplot(data = dt_xgb, aes(x = bucket))+
  geom_line(aes(y = Cumlift), color = "darkred") +
  ggtitle("Lift Chart for XGBoost") +
  xlab("Decile") + ylab("Lift")