# The following code is from Kaggle: https://www.kaggle.com/mrisdal/last-place-laura-benchmark/output

# missing rm(list=ls())
# Load libraries (assumes you have packages already installed)
library(data.table)
# library(tidyverse) had to comment out --> package 2.2.0 is required but 2.1.1 is being loaded
library(dplyr)  # missing
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
library(ggplot2)  # missing

# Column classes, one entry per CSV column, in file order.
# Training file: 11 columns.
train_classes <- c(
  rep("factor", 2),     # id, vendor_id
  rep("character", 2),  # pickup_datetime, dropoff_datetime
  "integer",            # passenger_count
  rep("numeric", 4),    # pickup longitude/latitude, dropoff longitude/latitude
  "factor",             # store_and_fwd_flag
  "numeric"             # trip_duration
)

# Test file: 9 columns -- presumably the training columns without
# dropoff_datetime and trip_duration (TODO confirm against test.csv).
test_classes <- c(
  rep("factor", 2),
  "character",
  "integer",
  rep("numeric", 4),
  "factor"
)

# Read in the data, applying the per-column classes declared above.
# Without colClasses the class vectors are dead code and the column types
# depend on read.csv's guessing and on the stringsAsFactors default of the
# running R version (e.g. vendor_id would be read as a number, not a factor).
# The datetime columns are kept as character; they are parsed later with
# lubridate.
train <- read.csv("train.csv", colClasses = train_classes)
test <- read.csv("test.csv", colClasses = test_classes)


# Quick look at the training data: column (feature) names first,
# then a per-column summary.
colnames(train)
##  [1] "id"                 "vendor_id"          "pickup_datetime"   
##  [4] "dropoff_datetime"   "passenger_count"    "pickup_longitude"  
##  [7] "pickup_latitude"    "dropoff_longitude"  "dropoff_latitude"  
## [10] "store_and_fwd_flag" "trip_duration"
summary(train)
##          id            vendor_id                pickup_datetime   
##  id0000001:      1   Min.   :1.000   2016-01-12 18:48:44:      5  
##  id0000003:      1   1st Qu.:1.000   2016-02-09 21:03:38:      5  
##  id0000005:      1   Median :2.000   2016-03-04 08:07:34:      5  
##  id0000008:      1   Mean   :1.535   2016-04-05 18:55:21:      5  
##  id0000009:      1   3rd Qu.:2.000   2016-05-07 13:18:07:      5  
##  id0000011:      1   Max.   :2.000   2016-06-10 23:17:17:      5  
##  (Other)  :1458638                   (Other)            :1458614  
##             dropoff_datetime   passenger_count pickup_longitude 
##  2016-02-19 19:25:04:      5   Min.   :0.000   Min.   :-121.93  
##  2016-05-16 19:40:28:      5   1st Qu.:1.000   1st Qu.: -73.99  
##  2016-01-07 08:04:32:      4   Median :1.000   Median : -73.98  
##  2016-01-08 12:43:38:      4   Mean   :1.665   Mean   : -73.97  
##  2016-01-08 13:00:41:      4   3rd Qu.:2.000   3rd Qu.: -73.97  
##  2016-01-09 15:59:42:      4   Max.   :9.000   Max.   : -61.34  
##  (Other)            :1458618                                    
##  pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
##  Min.   :34.36   Min.   :-121.93   Min.   :32.18    N:1450599         
##  1st Qu.:40.74   1st Qu.: -73.99   1st Qu.:40.74    Y:   8045         
##  Median :40.75   Median : -73.98   Median :40.75                      
##  Mean   :40.75   Mean   : -73.97   Mean   :40.75                      
##  3rd Qu.:40.77   3rd Qu.: -73.96   3rd Qu.:40.77                      
##  Max.   :51.88   Max.   : -61.34   Max.   :43.92                      
##                                                                       
##  trip_duration    
##  Min.   :      1  
##  1st Qu.:    397  
##  Median :    662  
##  Mean   :    959  
##  3rd Qu.:   1075  
##  Max.   :3526282  
## 
# Draw a random 10,000-row subsample of the training data; it feeds the plot
# and the benchmark model further down.
# The RNG is seeded first so the same rows are drawn on every run.
# Rows are sampled before the timestamp is parsed, so only 10,000 datetimes
# are converted instead of the whole training set; the rows selected are the
# same either way because mutate() does not consume random numbers.
# pickup_hour is the hour of day of the pickup (0-23), not a duration in hours.
set.seed(42)
sample <- train %>%
  sample_n(10000) %>%
  mutate(pickup_hour = hour(ymd_hms(pickup_datetime)))


# Log trip duration against pickup hour for the subsample: jittered,
# semi-transparent points with a smoothed trend line drawn on top.
ggplot(
  data = sample,
  mapping = aes(x = pickup_hour, y = log(trip_duration))
) +
  geom_jitter(alpha = 0.25) +
  geom_smooth() +
  xlab("Hour of Pickup") +
  ylab("Log of Trip Duration") +
  ggtitle("Trip Duration by Pickup Hour")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

library(randomForest) # include randomForest package (assumes you already have installed)
## Warning: package 'randomForest' was built under R version 3.6.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
# Benchmark model: a 100-tree random forest regressing trip_duration on
# vendor_id, passenger_count and the pickup coordinates, fit on the subsample.
# Random forest fitting draws random numbers (bootstrap samples, candidate
# split variables), so the seed is fixed to make the fit and the predictions
# repeatable between runs.
set.seed(42)
rf_benchmark <- randomForest(
  trip_duration ~ vendor_id + passenger_count +
    pickup_longitude + pickup_latitude,
  data = sample,
  ntree = 100
)

# Predicted trip_duration for every row of the test set.
rf_prediction <- predict(rf_benchmark, test, type = "response")



# Prepare the submission file and write it as "submission_file.csv" in the working directory (uncomment to run)
#submission_file <- data.frame(id = test$id,
#                             trip_duration = rf_prediction)
#write.csv(submission_file, "submission_file.csv", row.names=F)