Load required packages
needed <- c("data.table", "tidyverse","lubridate","gridExtra","ggplot2","corrplot")
Read Dataset
train <- read.csv("TaxiTrainData.csv")
test <- read.csv("TaxiTestData.csv")
count(train)
## # A tibble: 1 x 1
## n
## <int>
## 1 1048575
1,458,641 data points in train dataset
Improvement 1: Data Cleaning
1.1 Look for missing values
sum(is.na(train))
## [1] 0
sum(is.na(test))
## [1] 0
Examine Train Dataset
summary(train)
## id vendor_id pickup_datetime
## id0000001: 1 Min. :1.000 4/23/2016 21:23: 21
## id0000003: 1 1st Qu.:1.000 3/4/2016 20:19 : 19
## id0000005: 1 Median :2.000 4/4/2016 20:39 : 19
## id0000008: 1 Mean :1.535 6/5/2016 11:23 : 18
## id0000011: 1 3rd Qu.:2.000 1/13/2016 22:37: 17
## id0000013: 1 Max. :2.000 1/14/2016 18:44: 17
## (Other) :1048569 (Other) :1048464
## dropoff_datetime passenger_count pickup_longitude
## 1/21/2016 22:52: 19 Min. :0.000 Min. :-78.55
## 1/30/2016 23:30: 18 1st Qu.:1.000 1st Qu.:-73.99
## 3/4/2016 20:40 : 18 Median :1.000 Median :-73.98
## 4/28/2016 19:40: 18 Mean :1.664 Mean :-73.97
## 6/7/2016 8:55 : 18 3rd Qu.:2.000 3rd Qu.:-73.97
## 1/29/2016 20:36: 17 Max. :9.000 Max. :-61.34
## (Other) :1048467
## pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
## Min. :34.36 Min. :-79.82 Min. :32.18 N:1042766
## 1st Qu.:40.74 1st Qu.:-73.99 1st Qu.:40.74 Y: 5809
## Median :40.75 Median :-73.98 Median :40.75
## Mean :40.75 Mean :-73.97 Mean :40.75
## 3rd Qu.:40.77 3rd Qu.:-73.96 3rd Qu.:40.77
## Max. :51.88 Max. :-61.34 Max. :43.91
##
## trip_duration
## Min. : 1
## 1st Qu.: 397
## Median : 662
## Mean : 962
## 3rd Qu.: 1075
## Max. :3526282
##
1.2 Look for value that doesn’t make sense
trip_duration is measured in seconds
day_plus_trips <- train %>% filter(trip_duration >= 12*60*60)
count(day_plus_trips)
## # A tibble: 1 x 1
## n
## <int>
## 1 1440
1993 observations
less_than_1_min_trips <- train %>%filter(trip_duration <= 60)
count(less_than_1_min_trips)
## # A tibble: 1 x 1
## n
## <int>
## 1 6315
8777 observations
get rid of day_plus_trips & less_than_1_min_trips
reduce size by 0.7%
train=train[!row.names(train) %in% row.names(day_plus_trips),]
train=train[!row.names(train) %in% row.names(less_than_1_min_trips),]
1.3 Fix Date Variables
train = as.data.table(train)
train[,pickup_datetime:=as.Date(pickup_datetime)]
train[,dropoff_datetime:=as.Date(dropoff_datetime)]
train[,":="(
pickup_yday=yday(pickup_datetime)
,pickup_mday=mday(pickup_datetime)
)]
train$pickup_week <- week(train$pickup_datetime)
train$pickup_month <- month(train$pickup_datetime)
train$pickup_weekdays <- weekdays(train$pickup_datetime)
train$pickup_weekend <- ifelse(train$pickup_weekdays==1 | train$pickup_weekdays==7,"Weekend","not-Weekend")
Plot
1.3.1 Mean trip duration by month
plot1 = train[, list(mean_trip_duration= mean(trip_duration)), by=pickup_datetime] %>%
ggplot(aes(x=pickup_datetime, y=mean_trip_duration)) +
geom_bar(stat='identity', fill='steelblue') +
labs(x='', y='Mean Trip Duration', title='Mean Trip Duration over time')
grid.arrange(plot1)
## Warning: Removed 1 rows containing missing values (position_stack).
1.3.2 Mean trip duration by weekdays
plot2 <-train[, list(mean_trip_duration = mean(trip_duration)), by = pickup_weekdays] %>%
ggplot(aes(x = pickup_weekdays, y = mean_trip_duration)) +
geom_bar(stat = 'identity', fill = 'steelblue') +
labs(x = 'Month', y = 'Mean Trip Duration', title = 'Mean Trip duration by weekdays')
grid.arrange(plot2)
1.4 correlation matrix
corr_features = train[,.(vendor_id, pickup_week, pickup_month,pickup_yday, pickup_mday,
passenger_count,pickup_longitude, pickup_latitude, dropoff_longitude,
dropoff_latitude, trip_duration)]
corrplot(cor(corr_features, use='complete.obs'), type='lower')
## Warning in cor(corr_features, use = "complete.obs"): the standard deviation
## is zero
Load Necessary Packages
library(gbm)
library(ModelMetrics)
Load Data
data = read.csv("TaxiTrainData.csv")
train = data[sample(1:nrow(train),14000, replace=FALSE),]
test = data[sample(1:nrow(train),6000, replace=FALSE),]
Fit boosted regression trees to the trainining data
set.seed(1)
boost.taxi = gbm(trip_duration ~ + passenger_count + pickup_latitude + pickup_longitude + dropoff_latitude + dropoff_longitude, data = train, distribution = "gaussian",n.trees = 5000, interaction.depth = 4)
summary(boost.taxi)
## var rel.inf
## dropoff_latitude dropoff_latitude 32.33414
## dropoff_longitude dropoff_longitude 26.34861
## pickup_latitude pickup_latitude 18.69318
## pickup_longitude pickup_longitude 17.49594
## passenger_count passenger_count 5.12813
Use boosted model to predict trip duration
yhat.boost = predict(boost.taxi, newdata = test, n.trees = 5000, type = "response")
Calculate error rates
(mse = mean((yhat.boost - test$trip_duration)^2))
## [1] 8374855
(error = sqrt(mse))
## [1] 2893.934
yhat.boost = abs(yhat.boost)
rmsle(test$trip_duration, yhat.boost)
## [1] 1.094167