# Load necessary packages

library("tidyverse") # Metapackage of all tidyverse packages
library("conflicted") # Manages conflicts
library("readxl") # Reads Excel files

conflict_prefer("filter", "dplyr") # Setting filter as default choice
conflict_prefer("lag", "dplyr") # Setting lag as default choice

# Import the datasets for each month

# Folder that holds the twelve monthly Divvy workbooks.
data_dir <- "/kaggle/input/divvy-tripdata-2023"

# Read the workbook for one month of 2023.
#
# month  Integer month number, 1-12.
#
# Returns the tibble produced by read_excel(). Warnings are suppressed for
# this single read only, not globally with options(warn = -1), so warnings
# raised by later steps stay visible.
read_month <- function(month) {
  workbook <- file.path(
    data_dir,
    sprintf("2023.%02d-divvy-tripdata.xlsx", month)
  )
  suppressWarnings(read_excel(workbook))
}

y2023_01 <- read_month(1L)
y2023_02 <- read_month(2L)
y2023_03 <- read_month(3L)
y2023_04 <- read_month(4L)
y2023_05 <- read_month(5L)
y2023_06 <- read_month(6L)
y2023_07 <- read_month(7L)
y2023_08 <- read_month(8L)
y2023_09 <- read_month(9L)
y2023_10 <- read_month(10L)
y2023_11 <- read_month(11L)
y2023_12 <- read_month(12L)

# Coerce end_lat to numeric in the selected monthly tables so the column has
# the same type in every table.
# NOTE(review): y2023_06 is not converted here - presumably its end_lat is
# already numeric; confirm.

y2023_01 <- y2023_01 %>% mutate(end_lat = as.numeric(end_lat))
y2023_02 <- y2023_02 %>% mutate(end_lat = as.numeric(end_lat))
y2023_03 <- y2023_03 %>% mutate(end_lat = as.numeric(end_lat))
y2023_04 <- y2023_04 %>% mutate(end_lat = as.numeric(end_lat))
y2023_05 <- y2023_05 %>% mutate(end_lat = as.numeric(end_lat))
y2023_07 <- y2023_07 %>% mutate(end_lat = as.numeric(end_lat))
y2023_08 <- y2023_08 %>% mutate(end_lat = as.numeric(end_lat))
y2023_09 <- y2023_09 %>% mutate(end_lat = as.numeric(end_lat))
y2023_10 <- y2023_10 %>% mutate(end_lat = as.numeric(end_lat))
y2023_11 <- y2023_11 %>% mutate(end_lat = as.numeric(end_lat))
y2023_12 <- y2023_12 %>% mutate(end_lat = as.numeric(end_lat))

# Stack the twelve monthly tables, January through December, into one table

all_trips <- bind_rows(list(
  y2023_01, y2023_02, y2023_03, y2023_04, y2023_05, y2023_06,
  y2023_07, y2023_08, y2023_09, y2023_10, y2023_11, y2023_12
))

# Drop the four coordinate columns; no later step visible in this script
# reads them

all_trips <- select(all_trips, -c(start_lat, start_lng, end_lat, end_lng))

# Print per-column summary statistics (most informative for numeric columns)

summary(all_trips)

# Print the structure of the table: column names and their data types

str(all_trips)

# Derive calendar fields from each ride's start time: the date (yyyy-mm-dd),
# then the two-digit month, two-digit day, four-digit year, and the full
# weekday name, each formatted as text from that date

all_trips <- all_trips %>%
  mutate(
    date = as.Date(started_at),
    month = format(date, "%m"),
    day = format(date, "%d"),
    year = format(date, "%Y"),
    day_of_week = format(date, "%A")
  )

# Add "ride_length": the ride duration in seconds, stored as a plain number.
# units = "secs" is given explicitly because difftime() otherwise picks the
# unit automatically from the data, so the result is not guaranteed to be in
# seconds. as.numeric() then strips the difftime class, leaving the value.

all_trips$ride_length <- as.numeric(
  difftime(all_trips$ended_at, all_trips$started_at, units = "secs")
)

# Remove rows with missing values.
# NOTE(review): drop_na() without column arguments drops a row when ANY
# column is NA, not only the columns used in the analysis - confirm intended.

all_trips_no_na <- all_trips %>% drop_na()

# Keep only rides that did not start at the "HQ QR" station (bikes taken out
# of docks for quality checks by Divvy) and whose ride_length is not negative

all_trips_v2 <- all_trips_no_na %>%
  filter(start_station_name != "HQ QR", ride_length >= 0)

# Descriptive statistics of ride_length, in the unit ride_length is stored in

with(all_trips_v2, mean(ride_length)) # Total ride length / number of rides

with(all_trips_v2, median(ride_length)) # Midpoint value

with(all_trips_v2, max(ride_length)) # Longest ride

with(all_trips_v2, min(ride_length)) # Shortest ride

# summary() on the same column reports the four figures above (plus the
# quartiles) in a single call

with(all_trips_v2, summary(ride_length))

# Compare members with casual users: mean, median, longest and shortest
# ride_length per rider type. Passing the table through `data =` keeps the
# formulas short and gives the result plain column names (member_casual,
# ride_length) instead of names prefixed with "all_trips_v2$".

aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = mean)

aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = median)

aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = max)

aggregate(ride_length ~ member_casual, data = all_trips_v2, FUN = min)

# Turn day_of_week into an ordered factor running Sunday through Saturday.
# NOTE(review): day_of_week comes from format(date, "%A"), whose output is
# locale-dependent; values that do not match these English names become NA.

all_trips_v2$day_of_week <- ordered(
  all_trips_v2$day_of_week,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)

# Average ride time by rider type and day; ride_length is divided by 60 to
# convert it to minutes (assuming ride_length is in seconds)

aggregate(
  ride_length / 60 ~ member_casual + day_of_week,
  data = all_trips_v2,
  FUN = mean
)

# Summarise rides per rider type and weekday:
# - weekday is derived from started_at with wday() while grouping
# - number_of_rides counts every observation in the group
# - average_duration is the mean ride_length divided by 60
# The result is sorted by rider type, then weekday

all_trips_v2 %>%
  group_by(member_casual, weekday = wday(started_at, label = TRUE)) %>%
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length / 60)
  ) %>%
  arrange(member_casual, weekday)

# Strongly prefer fixed over scientific notation when numbers are printed
options(scipen = 999)

# Fill palettes: two colours for charts filled by member_casual, three for
# charts filled by rideable_type
rider_palette <- c("#008b8b", "#F0E442")
bike_palette <- c("#008b8b", "#F0E442", "#56B4E9")

# Draw a dodged bar chart of one ride metric.
#
# data      Trip-level data frame containing ride_length and the columns
#           named by x_var and fill_var.
# x_var     Unquoted column shown on the x axis (second grouping key).
# fill_var  Unquoted column used for the bar fill (first grouping key).
# y_var     Unquoted summary column to plot: number_of_rides or
#           average_duration.
# palette   Character vector of fill colours passed to scale_fill_manual().
# title     Plot title.
# x_label   x-axis title.
# y_label   y-axis title.
#
# Returns a ggplot object; it is drawn when printed at top level.
plot_ride_metric <- function(data, x_var, fill_var, y_var, palette,
                             title, x_label, y_label) {
  data %>%
    group_by({{ fill_var }}, {{ x_var }}) %>%
    summarise(
      number_of_rides = n(),
      # Divide by 60 to get minutes, assuming ride_length is in seconds
      average_duration = mean(ride_length / 60)
    ) %>%
    arrange({{ fill_var }}, {{ x_var }}) %>%
    ggplot(aes(x = {{ x_var }}, y = {{ y_var }}, fill = {{ fill_var }})) +
    geom_col(position = "dodge") +
    scale_fill_manual(values = palette) +
    # fill = " " blanks the legend title
    labs(title = title, x = x_label, y = y_label, fill = " ") +
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.title.y = element_text(vjust = 2)
    )
}

# Trips with a labelled weekday column, shared by Figures 1-4
trips_by_weekday <- all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE))

plot_ride_metric(
  trips_by_weekday,
  x_var = weekday, fill_var = member_casual, y_var = number_of_rides,
  palette = rider_palette,
  title = "Figure 1: Number of Rides by Day and Rider Type",
  x_label = "Day of Week", y_label = "Number of Rides"
)

plot_ride_metric(
  trips_by_weekday,
  x_var = weekday, fill_var = rideable_type, y_var = number_of_rides,
  palette = bike_palette,
  title = "Figure 2: Number of Rides by Day and Bike Type",
  x_label = "Day of Week", y_label = "Number of Rides"
)

plot_ride_metric(
  trips_by_weekday,
  x_var = weekday, fill_var = member_casual, y_var = average_duration,
  palette = rider_palette,
  title = "Figure 3: Average Ride Duration by Day and Rider Type",
  x_label = "Day of Week", y_label = "Average Duration (minutes)"
)

plot_ride_metric(
  trips_by_weekday,
  x_var = weekday, fill_var = rideable_type, y_var = average_duration,
  palette = bike_palette,
  title = "Figure 4: Average Ride Duration by Day and Bike Type",
  x_label = "Day of Week", y_label = "Average Duration (minutes)"
)

plot_ride_metric(
  all_trips_v2,
  x_var = month, fill_var = member_casual, y_var = number_of_rides,
  palette = rider_palette,
  title = "Figure 5: Number of Rides by Month and Rider Type",
  x_label = "Month", y_label = "Number of Rides"
)

plot_ride_metric(
  all_trips_v2,
  x_var = month, fill_var = rideable_type, y_var = number_of_rides,
  palette = bike_palette,
  title = "Figure 6: Number of Rides by Month and Bike Type",
  x_label = "Month", y_label = "Number of Rides"
)

plot_ride_metric(
  all_trips_v2,
  x_var = month, fill_var = member_casual, y_var = average_duration,
  palette = rider_palette,
  title = "Figure 7: Average Ride Duration by Month and Rider Type",
  x_label = "Month", y_label = "Average Duration (minutes)"
)

plot_ride_metric(
  all_trips_v2,
  x_var = month, fill_var = rideable_type, y_var = average_duration,
  palette = bike_palette,
  title = "Figure 8: Average Ride Duration by Month and Bike Type",
  x_label = "Month", y_label = "Average Duration (minutes)"
)

plot_ride_metric(
  all_trips_v2,
  x_var = member_casual, fill_var = rideable_type, y_var = number_of_rides,
  palette = bike_palette,
  title = "Figure 9: Number of Rides by Bike Type and Rider Type",
  x_label = "Rider Type", y_label = "Number of Rides"
)

# Aggregate mean ride_length by rider type and day of week, then write the
# result to a .csv file. Despite its name, `counts` holds mean ride lengths,
# not ride counts.

counts <- aggregate(
  all_trips_v2$ride_length ~
    all_trips_v2$member_casual + all_trips_v2$day_of_week,
  FUN = mean
)
write.csv(x = counts, file = 'avg_ride_length.csv')

Table of Contents

1. Overview

2. Scenario

3. Ask

3.1 Business Task

3.2 Key Stakeholders

4. Prepare

5. Process

5.1 Establish Workspace

5.2 Data Import

5.3 Data Cleaning

6. Analyze

6.1 Descriptive Analysis

6.2 Key Takeaways

7. Share

Figure 1: Number of Rides by Day and Rider Type

Findings

Figure 2: Number of Rides by Day and Bike Type

Findings

Figure 3: Average Ride Duration by Day and Rider Type

Findings

Figure 4: Average Ride Duration by Day and Bike Type

Findings

Figure 5: Number of Rides by Month and Rider Type

Findings

Figure 6: Number of Rides by Month and Bike Type

Findings

Figure 7: Average Ride Duration by Month and Rider Type

Findings

Figure 8: Average Ride Duration by Month and Bike Type

Findings

Figure 9: Number of Rides by Bike Type and Rider Type

Findings

8. Act

8.1 Recommendations

8.2 Conclusion

Case study: How does a bike-share navigate speedy success?

Table of Contents

1. Overview

2. Scenario

3. Ask

3.1 Business Task

3.2 Key Stakeholders

4. Prepare

5. Process

5.1 Establish Workspace

5.2 Data Import

5.3 Data Cleaning

6. Analyze

6.1 Descriptive Analysis

6.2 Key Takeaways

7. Share

Figure 1: Number of Rides by Day and Rider Type

Findings

Figure 2: Number of Rides by Day and Bike Type

Findings

Figure 3: Average Ride Duration by Day and Rider Type

Findings

Figure 4: Average Ride Duration by Day and Bike Type

Findings

Figure 5: Number of Rides by Month and Rider Type

Findings

Figure 6: Number of Rides by Month and Bike Type

Findings

Figure 7: Average Ride Duration by Month and Rider Type

Findings

Figure 8: Average Ride Duration by Month and Bike Type

Findings

Figure 9: Number of Rides by Bike Type and Rider Type

Findings

8. Act

8.1 Recommendations

8.2 Conclusion