Load required packages

needed <- c("data.table", "tidyverse","lubridate","gridExtra","ggplot2","corrplot") 

Read Dataset

train <- read.csv("TaxiTrainData.csv")
test <- read.csv("TaxiTestData.csv")
count(train)
## # A tibble: 1 x 1
##         n
##     <int>
## 1 1048575

1,458,641 data points in train dataset

Improvement 1: Data Cleaning

1.1 Look for missing values

sum(is.na(train))
## [1] 0
sum(is.na(test))
## [1] 0

Examine Train Dataset

summary(train)
##          id            vendor_id            pickup_datetime   
##  id0000001:      1   Min.   :1.000   4/23/2016 21:23:     21  
##  id0000003:      1   1st Qu.:1.000   3/4/2016 20:19 :     19  
##  id0000005:      1   Median :2.000   4/4/2016 20:39 :     19  
##  id0000008:      1   Mean   :1.535   6/5/2016 11:23 :     18  
##  id0000011:      1   3rd Qu.:2.000   1/13/2016 22:37:     17  
##  id0000013:      1   Max.   :2.000   1/14/2016 18:44:     17  
##  (Other)  :1048569                   (Other)        :1048464  
##         dropoff_datetime   passenger_count pickup_longitude
##  1/21/2016 22:52:     19   Min.   :0.000   Min.   :-78.55  
##  1/30/2016 23:30:     18   1st Qu.:1.000   1st Qu.:-73.99  
##  3/4/2016 20:40 :     18   Median :1.000   Median :-73.98  
##  4/28/2016 19:40:     18   Mean   :1.664   Mean   :-73.97  
##  6/7/2016 8:55  :     18   3rd Qu.:2.000   3rd Qu.:-73.97  
##  1/29/2016 20:36:     17   Max.   :9.000   Max.   :-61.34  
##  (Other)        :1048467                                   
##  pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
##  Min.   :34.36   Min.   :-79.82    Min.   :32.18    N:1042766         
##  1st Qu.:40.74   1st Qu.:-73.99    1st Qu.:40.74    Y:   5809         
##  Median :40.75   Median :-73.98    Median :40.75                      
##  Mean   :40.75   Mean   :-73.97    Mean   :40.75                      
##  3rd Qu.:40.77   3rd Qu.:-73.96    3rd Qu.:40.77                      
##  Max.   :51.88   Max.   :-61.34    Max.   :43.91                      
##                                                                       
##  trip_duration    
##  Min.   :      1  
##  1st Qu.:    397  
##  Median :    662  
##  Mean   :    962  
##  3rd Qu.:   1075  
##  Max.   :3526282  
## 

1.2 Look for value that doesn’t make sense

trip_duration is measured in seconds

day_plus_trips <- train %>% filter(trip_duration >= 12*60*60)
count(day_plus_trips)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  1440

1993 observations

less_than_1_min_trips <- train %>%filter(trip_duration <= 60)
count(less_than_1_min_trips)
## # A tibble: 1 x 1
##       n
##   <int>
## 1  6315

8777 observations

get rid of day_plus_trips & less_than_1_min_trips

reduce size by 0.7%

train=train[!row.names(train) %in% row.names(day_plus_trips),]
train=train[!row.names(train) %in% row.names(less_than_1_min_trips),]

1.3 Fix Date Variables

train = as.data.table(train)
train[,pickup_datetime:=as.Date(pickup_datetime)]
train[,dropoff_datetime:=as.Date(dropoff_datetime)]
train[,":="(
  pickup_yday=yday(pickup_datetime)
  ,pickup_mday=mday(pickup_datetime)
)]
train$pickup_week <- week(train$pickup_datetime)
train$pickup_month <- month(train$pickup_datetime)
train$pickup_weekdays <- weekdays(train$pickup_datetime)
train$pickup_weekend <- ifelse(train$pickup_weekdays==1 | train$pickup_weekdays==7,"Weekend","not-Weekend")

Plot

1.3.1 Mean trip duration by month

plot1 = train[, list(mean_trip_duration= mean(trip_duration)), by=pickup_datetime] %>%
  ggplot(aes(x=pickup_datetime, y=mean_trip_duration)) + 
  geom_bar(stat='identity', fill='steelblue') + 
  labs(x='', y='Mean Trip Duration', title='Mean Trip Duration over time')
grid.arrange(plot1)
## Warning: Removed 1 rows containing missing values (position_stack).

1.3.2 Mean trip duration by weekdays

plot2 <-train[, list(mean_trip_duration = mean(trip_duration)), by = pickup_weekdays] %>%
  ggplot(aes(x = pickup_weekdays, y = mean_trip_duration)) +
  geom_bar(stat = 'identity', fill = 'steelblue') +
  labs(x = 'Month', y = 'Mean Trip Duration', title = 'Mean Trip duration by weekdays')
grid.arrange(plot2)

1.4 correlation matrix

corr_features = train[,.(vendor_id, pickup_week, pickup_month,pickup_yday, pickup_mday,
                         passenger_count,pickup_longitude, pickup_latitude, dropoff_longitude,
                         dropoff_latitude, trip_duration)] 
corrplot(cor(corr_features, use='complete.obs'), type='lower')
## Warning in cor(corr_features, use = "complete.obs"): the standard deviation
## is zero

NYC Taxi Trip Duration Code Improvement: Boosting

Load Necessary Packages

library(gbm)
library(ModelMetrics)

Load Data

data = read.csv("TaxiTrainData.csv")

train = data[sample(1:nrow(train),14000, replace=FALSE),]
test = data[sample(1:nrow(train),6000, replace=FALSE),]

Fit boosted regression trees to the trainining data

set.seed(1)
boost.taxi = gbm(trip_duration ~ + passenger_count + pickup_latitude + pickup_longitude + dropoff_latitude + dropoff_longitude, data = train, distribution = "gaussian",n.trees = 5000, interaction.depth = 4)
summary(boost.taxi)

##                                 var  rel.inf
## dropoff_latitude   dropoff_latitude 32.33414
## dropoff_longitude dropoff_longitude 26.34861
## pickup_latitude     pickup_latitude 18.69318
## pickup_longitude   pickup_longitude 17.49594
## passenger_count     passenger_count  5.12813

Use boosted model to predict trip duration

yhat.boost = predict(boost.taxi, newdata = test, n.trees = 5000, type = "response")

Calculate error rates

(mse = mean((yhat.boost - test$trip_duration)^2))
## [1] 8374855
(error = sqrt(mse))
## [1] 2893.934
yhat.boost = abs(yhat.boost)
rmsle(test$trip_duration, yhat.boost)
## [1] 1.094167