# Compare tip-amount regressors: Spark MLlib vs. H2O ----
#
# Fits a linear model, a random forest and gradient-boosted trees on the
# training split with both Spark MLlib (via sparklyr) and H2O, scores each on
# the test split, and collects the test-set RMSE of every model in `result`.
#
# Relies on objects created outside this section: the Spark connection `sc`
# and `taxi_filtered` (with `$training` and `$test` Spark DataFrames).

# Predictor columns shared by every model.
features <- c(
  "vendor_id", "passenger_count", "trip_time_in_secs",
  "trip_distance", "fare_amount", "surcharge"
)

#' Root-mean-squared error of a residual expression.
#'
#' @param formula One-sided formula whose right-hand side evaluates to the
#'   residual inside `data`, e.g. `~ tip_amount - prediction`.
#' @param data Table holding the columns named in `formula`; called below with
#'   both Spark prediction tables and a local data frame.
#' @return The RMSE as a length-one numeric vector.
rmse <- function(formula, data) {
  data %>%
    mutate_(residual = formula) %>%
    # `sqrt` (the original `sqr` exists neither in R nor in Spark SQL).
    summarize(rmse = sqrt(mean(residual ^ 2))) %>%
    collect() %>%
    .[["rmse"]]
}

# Register the train/test splits as Spark tables ----
trips_train_tbl <- sdf_register(taxi_filtered$training, "trips_train")
trips_test_tbl <- sdf_register(taxi_filtered$test, "trips_test")

# Observed tips of the test split, pulled into local R memory.
# NOTE(review): pairing `actual` with the H2O predictions below assumes both
# keep the same row order as `trips_test_tbl` -- confirm.
actual <- trips_test_tbl %>%
  select(tip_amount) %>%
  collect() %>%
  `[[`("tip_amount")

tbl_cache(sc, "trips_train")
tbl_cache(sc, "trips_test")

# Convert the Spark tables to H2O frames ----
trips_train_h2o_tbl <- as_h2o_frame(sc, trips_train_tbl)
trips_test_h2o_tbl <- as_h2o_frame(sc, trips_test_tbl)

# Treat vendor_id as categorical on the H2O side.
trips_train_h2o_tbl$vendor_id <- as.factor(trips_train_h2o_tbl$vendor_id)
trips_test_h2o_tbl$vendor_id <- as.factor(trips_test_h2o_tbl$vendor_id)

# Spark MLlib models ----
lm_mllib <- ml_linear_regression(
  x = trips_train_tbl, response = "tip_amount", features = features
)
pred_lm_mllib <- sdf_predict(lm_mllib, trips_test_tbl)

rf_mllib <- ml_random_forest(
  x = trips_train_tbl, response = "tip_amount", features = features
)
pred_rf_mllib <- sdf_predict(rf_mllib, trips_test_tbl)

gbm_mllib <- ml_gradient_boosted_trees(
  x = trips_train_tbl, response = "tip_amount", features = features
)
pred_gbm_mllib <- sdf_predict(gbm_mllib, trips_test_tbl)

# H2O models ----
lm_h2o <- h2o.glm(
  x = features, y = "tip_amount", training_frame = trips_train_h2o_tbl
)
pred_lm_h2o <- h2o.predict(lm_h2o, trips_test_h2o_tbl)

# NOTE(review): ntrees/max_depth are presumably set to mirror the MLlib
# random forest configuration -- confirm.
rf_h2o <- h2o.randomForest(
  x = features, y = "tip_amount", training_frame = trips_train_h2o_tbl,
  ntrees = 20, max_depth = 5
)
pred_rf_h2o <- h2o.predict(rf_h2o, trips_test_h2o_tbl)

gbm_h2o <- h2o.gbm(
  x = features, y = "tip_amount", training_frame = trips_train_h2o_tbl
)
pred_gbm_h2o <- h2o.predict(gbm_h2o, trips_test_h2o_tbl)

# Gather actuals and H2O predictions in one local data frame ----
# (The original call ended with a trailing comma, which makes data.frame()
# fail on an empty argument.)
pred.h2o <- data.frame(
  tip.amount = actual,
  as.data.frame(pred_lm_h2o),
  as.data.frame(pred_rf_h2o),
  as.data.frame(pred_gbm_h2o)
)
colnames(pred.h2o) <- c("tip.amount", "lm", "rf", "gbm")

# Test-set RMSE per model ----
# Spark prediction tables expose `tip_amount` / `prediction`; the local
# `pred.h2o` frame exposes `tip.amount` plus one column per H2O model.
result <- data.frame(
  RMSE = c(
    lm.mllib = rmse(~ tip_amount - prediction, pred_lm_mllib),
    lm.h2o = rmse(~ tip.amount - lm, pred.h2o),
    rf.mllib = rmse(~ tip_amount - prediction, pred_rf_mllib),
    rf.h2o = rmse(~ tip.amount - rf, pred.h2o),
    gbm.mllib = rmse(~ tip_amount - prediction, pred_gbm_mllib),
    gbm.h2o = rmse(~ tip.amount - gbm, pred.h2o)
  )
)