# missing rm(list=ls())
# Load libraries (assumes you have packages already installed)
library(data.table)
# library(tidyverse) had to comment out --> package 2.2.0 is required but 2.1.1 is being loaded
library(dplyr) # missing
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
library(ggplot2) # missing
# Column classes, listed in file order.
# train.csv columns: id, vendor_id, pickup_datetime, dropoff_datetime,
#   passenger_count, pickup_longitude, pickup_latitude, dropoff_longitude,
#   dropoff_latitude, store_and_fwd_flag, trip_duration
train_classes <- c("factor", "factor", "character", "character", "integer",
                   "numeric", "numeric", "numeric", "numeric",
                   "factor", "numeric")
# test.csv: presumably the same layout without dropoff_datetime and
# trip_duration (9 columns) -- TODO confirm against the file header
test_classes <- c("factor", "factor", "character", "integer",
                  "numeric", "numeric", "numeric", "numeric", "factor")

# Read in the data. The class vectors are passed explicitly so that column
# types do not depend on the R version's stringsAsFactors default, and so that
# vendor_id is read as a categorical variable in both files.
train <- read.csv("train.csv", colClasses = train_classes)
test <- read.csv("test.csv", colClasses = test_classes)

# Give test$vendor_id exactly the training levels so that a model fitted on
# train sees the same factor coding when predicting on test.
test$vendor_id <- factor(test$vendor_id, levels = levels(train$vendor_id))
# Quick look at the training set: column names first, then per-column summaries
colnames(train)
## [1] "id" "vendor_id" "pickup_datetime"
## [4] "dropoff_datetime" "passenger_count" "pickup_longitude"
## [7] "pickup_latitude" "dropoff_longitude" "dropoff_latitude"
## [10] "store_and_fwd_flag" "trip_duration"
summary(train)
## id vendor_id pickup_datetime
## id0000001: 1 Min. :1.000 2016-01-12 18:48:44: 5
## id0000003: 1 1st Qu.:1.000 2016-02-09 21:03:38: 5
## id0000005: 1 Median :2.000 2016-03-04 08:07:34: 5
## id0000008: 1 Mean :1.535 2016-04-05 18:55:21: 5
## id0000009: 1 3rd Qu.:2.000 2016-05-07 13:18:07: 5
## id0000011: 1 Max. :2.000 2016-06-10 23:17:17: 5
## (Other) :1458638 (Other) :1458614
## dropoff_datetime passenger_count pickup_longitude
## 2016-02-19 19:25:04: 5 Min. :0.000 Min. :-121.93
## 2016-05-16 19:40:28: 5 1st Qu.:1.000 1st Qu.: -73.99
## 2016-01-07 08:04:32: 4 Median :1.000 Median : -73.98
## 2016-01-08 12:43:38: 4 Mean :1.665 Mean : -73.97
## 2016-01-08 13:00:41: 4 3rd Qu.:2.000 3rd Qu.: -73.97
## 2016-01-09 15:59:42: 4 Max. :9.000 Max. : -61.34
## (Other) :1458618
## pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
## Min. :34.36 Min. :-121.93 Min. :32.18 N:1450599
## 1st Qu.:40.74 1st Qu.: -73.99 1st Qu.:40.74 Y: 8045
## Median :40.75 Median : -73.98 Median :40.75
## Mean :40.75 Mean : -73.97 Mean :40.75
## 3rd Qu.:40.77 3rd Qu.: -73.96 3rd Qu.:40.77
## Max. :51.88 Max. : -61.34 Max. :43.92
##
## trip_duration
## Min. : 1
## 1st Qu.: 397
## Median : 662
## Mean : 959
## 3rd Qu.: 1075
## Max. :3526282
##
# Draw a reproducible random subset of 10,000 training rows and add the hour of
# day of each pickup (0-23, extracted from pickup_datetime) as pickup_hour.
# Sampling comes before the timestamp parsing so that only the sampled rows are
# parsed instead of the whole training set; the added column involves no
# randomness, so the same rows are selected either way.
# NOTE: the variable name `sample` shadows base::sample(); it is kept because
# later code in this script refers to it.
set.seed(42)
sample <- train %>%
  sample_n(10000) %>%
  mutate(pickup_hour = hour(ymd_hms(pickup_datetime)))
# Log trip duration against pickup hour for the sampled trips: jittered,
# semi-transparent points to reduce overplotting, plus a smoothed trend line
ggplot(sample, aes(pickup_hour, log(trip_duration))) +
  geom_jitter(alpha = 0.25) +
  geom_smooth() +
  labs(title = "Trip Duration by Pickup Hour",
       x = "Hour of Pickup",
       y = "Log of Trip Duration")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

library(randomForest) # include randomForest package (assumes you already have installed)
## Warning: package 'randomForest' was built under R version 3.6.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Benchmark model: a 100-tree random forest fitted on the sampled training
# rows, modelling trip_duration from vendor_id, passenger_count and the pickup
# coordinates
rf_benchmark <- randomForest(
  trip_duration ~ vendor_id + passenger_count +
    pickup_longitude + pickup_latitude,
  data = sample,
  ntree = 100
)
# Predicted trip durations for the rows of the test set
rf_prediction <- predict(rf_benchmark, newdata = test, type = "response")
# Prepare the submission file and write it to the working directory (currently disabled)
#submission_file <- data.frame(id = test$id,
# trip_duration = rf_prediction)
#write.csv(submission_file, "submission_file.csv", row.names=F)