Section 1: Data preparation (Import, transform, sort and filter)


Import libraries

library(tidyverse)
library(stringr)
library(ggpubr)

Define paths to data sets

If you donā€™t keep your data in the same directory as the code, adapt the path names.

dir1 <- "~"
dir2 <- "Desktop"
dir3 <- "AC"
dir4 <- "Useful"
dir5 <- "Carrer"
dir6 <- "Cursos"
dir7 <- "2. Data analysis-Google"
dir8 <- "Capstone Case study 1"
dir9 <- "How I solved it"
dir10 <- "Data - cleaning - sorting - filtering" 
file_name  <- "Version 3"
PSDS_PATH <- file.path(dir1, dir2, dir3, dir4, dir5, dir6, dir7, dir8,dir9,dir10, file_name)

Import csv files as data frames

BikeTrips_2021_11_04_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2021_11_04_v31.csv'))
BikeTrips_2021_11_04_v32 <-  read_csv(file.path(PSDS_PATH, 'BikeTrips_2021_11_04_v32.csv'))
BikeTrips_2021_12_08_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2021_12_08_v3.csv'))
BikeTrips_2022_01_06_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_01_06_v3.csv'))
BikeTrips_2022_02_02_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_02_02_v3.csv'))
BikeTrips_2022_03_02_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_03_02_v3.csv'))
BikeTrips_2022_04_06_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_04_06_v3.csv'))
BikeTrips_2022_05_03_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_05_03_v3.csv'))
BikeTrips_2022_06_03_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_06_03_v31.csv'))
BikeTrips_2022_06_03_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_06_03_v32.csv'))
BikeTrips_2022_07_15_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_07_15_v31.csv'))
BikeTrips_2022_07_15_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_07_15_v32.csv'))
BikeTrips_2022_08_05_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_08_05_v31.csv'))
BikeTrips_2022_08_05_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_08_05_v32.csv'))
BikeTrips_2022_09_08_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_09_08_v31.csv'))
BikeTrips_2022_09_08_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_09_08_v32.csv'))
BikeTrips_2022_10_11_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_10_11_v31.csv'))
BikeTrips_2022_10_11_v32 <-read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_10_11_v32.csv'))

Change column names

To ensure that I could merge the data frames together later, I checked the column names and corrected any spelling mistakes as needed to make them consistent across all the data frames.

colnames(BikeTrips_2021_12_08_v3)[4] <- "started_time"
colnames(BikeTrips_2022_02_02_v3)[4] <- "started_time"
colnames(BikeTrips_2022_02_02_v3)[5] <- "end_date"
colnames(BikeTrips_2022_02_02_v3)[6] <- "end_time"

Merge all data frames into 1

BikeTrips_todos <- rbind(BikeTrips_2021_11_04_v31, BikeTrips_2021_11_04_v32, BikeTrips_2021_12_08_v3, BikeTrips_2022_01_06_v3, BikeTrips_2022_02_02_v3, BikeTrips_2022_03_02_v3, BikeTrips_2022_04_06_v3, BikeTrips_2022_05_03_v3, BikeTrips_2022_06_03_v31, BikeTrips_2022_06_03_v32, BikeTrips_2022_07_15_v31, BikeTrips_2022_07_15_v32, BikeTrips_2022_08_05_v31, BikeTrips_2022_08_05_v32, BikeTrips_2022_09_08_v31, BikeTrips_2022_09_08_v32, BikeTrips_2022_10_11_v31, BikeTrips_2022_10_11_v32)

Transform dataype from chr to date

class(BikeTrips_todos$started_date)
## [1] "character"
BikeTrips_todos$started_date <- mdy(BikeTrips_todos$started_date)
class(BikeTrips_todos$started_date)
## [1] "Date"
class(BikeTrips_todos$end_date)
## [1] "character"
BikeTrips_todos$end_date <- mdy(BikeTrips_todos$end_date)
class(BikeTrips_todos$end_date)
## [1] "Date"

Sort ascending according to started_date and started_time

BikeTrips_todos<- arrange(BikeTrips_todos,started_date,started_time)

Section 2: Data process (clean)


Duplicates

Iā€™ve used the function duplicated and saves the duplicates in a dataframe called duplicates (The Iā€™ds where duplicated but the data was different)

duplicates <-BikeTrips_todos[duplicated(BikeTrips_todos$ride_id)|duplicated(BikeTrips_todos$ride_id, fromLast=TRUE),]

Therefore, I concluded that there were no duplicates.

Blanks or mispellings in CHR columns

Iā€™ve check max and min length of CHR datatypes to find if there were any blanks (below some of them)

max(nchar(BikeTrips_todos$rideable_type))
## [1] 13
min(nchar(BikeTrips_todos$rideable_type))
## [1] 11

The only 3 options are ā€œclassic_bikeā€, ā€œelectric_bikeā€ and ā€œdocked_bikeā€ which have 12, 13 and 11 characters respectively, therefore, I concluded that there were no blanks.

Ranges of NUM columns

Iā€™ve check max and min values of NUM datatypes to find if there were any blanks (below some of them)

max(BikeTrips_todos$start_lng)
## [1] -73.79648
min(BikeTrips_todos$start_lng)
## [1] -87.84

Longitudes could range from -180 to 0 (West) or 0 to 180 (east). Therefore, the longitudes are inside the range. Altough, Chicago longides are between -87.63 to -87.52 degrees West, which means that there are some values out of that range, we will have to decide what to do with those measure later on.

Count values with the length expected

Iā€™ve used this function which is similar to countIF to know how many observations had more than 8 characters

sum(nchar(BikeTrips_todos$rideable_type) > 8, na.rm=TRUE)
## [1] 5528236

All observations must fall within this range, as the ā€œBikeTrips_todosā€ data frame has 5528236 observations, we conclude that the ā€œcountā€ is correct.

Count values with the latitude expected

Iā€™ve used with this function which is similar to countIF to know how many observations had latitudes above 8Āŗ.

sum(BikeTrips_todos$start_lat > 8, na.rm=TRUE)
## [1] 5528236

All observations must fall within this range, as the Chicagoā€™s latitude range is between 41.6ĀŗN and 42ĀŗN, the data frame has 5528236 observations, we conclude that the ā€œcountā€ is correct.

Erase all blanks if there are

Iā€™ve use this function to erase all blank spaces before and after all characters

BikeTrips_todos <- BikeTrips_todos %>% mutate(across(where(is.character), str_trim))

Section 3: Analysis phase (calculations, visualizations, explore data)

1. What was the percentage distribution of bike preferences?


Calculus of % distribution

Iā€™ve created a function called conteo. This function does the following:

  1. Filters the ā€œBikeTrips_todosā€ data frame by casual or member.

  2. Count the rides per type of ride (classic, electric, docked).

  3. Creates a new column with the % calculations.

  4. Scale the percentages in a new column called ā€œlablesā€ as characters to plot later on.

conteo<- function(casual_or_member) {BikeTrips_todos %>%
  filter(member_casual == casual_or_member) %>%
  group_by(rideable_type) %>% # Variable to be transformed
  count(rideable_type)  %>% 
  ungroup() %>% 
  mutate(perc = n / sum(n)) %>% 
  arrange(perc) %>%
  mutate(labels = scales::percent(perc))  }

Apply function and visualize tables

conteoA <- conteo("casual")
conteoB <- conteo("member")
conteoA
## # A tibble: 3 Ɨ 4
##   rideable_type       n   perc labels
##   <chr>           <int>  <dbl> <chr> 
## 1 docked_bike    180575 0.0801 8%    
## 2 classic_bike   872265 0.387  39%   
## 3 electric_bike 1202385 0.533  53%
conteoB
## # A tibble: 2 Ɨ 4
##   rideable_type       n  perc labels
##   <chr>           <int> <dbl> <chr> 
## 1 electric_bike 1565095 0.478 47.8% 
## 2 classic_bike  1707916 0.522 52.2%

Pie chart visualizations

2. How is the service usage distributed over the week?


Order days of the week

Default is for Sunday to be the first day on R, I used the function factor to order the ā€œday_of_weekā€ column

BikeTrips_todos_ordenado_por_dia<-BikeTrips_todos
BikeTrips_todos_ordenado_por_dia$day_of_week <- factor(BikeTrips_todos_ordenado_por_dia$day_of_week, c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))

Counts the number of trips per day of the week

Iā€™ve created a function called day. This function does the following:

  1. Filters the ā€œBikeTrips_todos_ordenado_por_diaā€ data frame by casual or member.

  2. Sort and count the rides per day of the week (monday, tuesday, etc).

  3. Creates a new column with the % calculations.

  4. Scale the percentages in a new column called ā€œlabelsā€ as characters to plot later on.

day <- function(casual_or_member) {BikeTrips_todos_ordenado_por_dia %>%
  filter(member_casual == casual_or_member) %>%
  group_by(day_of_week) %>% # Variable to be transformed
  count(day_of_week)  %>% 
  ungroup() %>% 
  mutate(perc = n / sum(n)) %>% 
  arrange(perc) %>%
  mutate(labels = scales::percent(perc)) } 

Apply function and visualize tables

day_casual <- day("casual")
day_member <- day("member")
day_casual
## # A tibble: 7 Ɨ 4
##   day_of_week      n  perc labels
##   <fct>        <int> <dbl> <chr> 
## 1 Tuesday     255428 0.113 11.33%
## 2 Wednesday   259168 0.115 11.49%
## 3 Monday      263845 0.117 11.70%
## 4 Thursday    283164 0.126 12.56%
## 5 Friday      326462 0.145 14.48%
## 6 Sunday      387598 0.172 17.19%
## 7 Saturday    479560 0.213 21.26%
day_member
## # A tibble: 7 Ɨ 4
##   day_of_week      n  perc labels
##   <fct>        <int> <dbl> <chr> 
## 1 Sunday      381771 0.117 11.66%
## 2 Saturday    444170 0.136 13.57%
## 3 Monday      456314 0.139 13.94%
## 4 Friday      465679 0.142 14.23%
## 5 Thursday    501276 0.153 15.32%
## 6 Wednesday   509373 0.156 15.56%
## 7 Tuesday     514428 0.157 15.72%

Bar and pie chart visualizations

3. How is the service usage distributed over a day?


Counts the number of trips per time second by second

Iā€™ve created a function called ā€œcalculate_bike_trip_countsā€. This function does the following:

  1. The if statement checks if member_type is provided. If it is, the function filters data to only include rows where the member_casual column matches member_type. If it isnā€™t provided, all rows are included.

  2. The function then groups the rows of the data data frame by started_time column and counts the number of occurrences of each unique value in the started_time column, sorted in descending order. The result is a new data frame trip_counts that shows the count of bike trips for each unique started_time value.

  3. Finally, the function returns the trip_counts data frame.

calculate_bike_trip_counts <- function(data, member_type = NULL) {
  if (!is.null(member_type)) {
    data <- data %>% filter(member_casual == member_type)
  }
  trip_counts <- data %>%
    group_by(started_time) %>%
    count(started_time, sort = TRUE)
  return(trip_counts)
}

Apply function

Tiempo_conteo <- calculate_bike_trip_counts(BikeTrips_todos, NULL)
Tiempo_conteo_casual <- calculate_bike_trip_counts(BikeTrips_todos, "casual")
Tiempo_conteo_member <- calculate_bike_trip_counts(BikeTrips_todos, "member")

Convert time to POSIXct for plotting

Iā€™ve created a function called ā€œconvert_and_adjust_timeā€. This function does the following:

  1. Takes a data frame and converts the started_time column to POSIXct format

  2. Adjusts any values less than the Unix epoch time to be one day ahead.

convert_and_adjust_time <- function(df) {
  df %>%
    mutate(started_time = as.POSIXct(started_time),
           started_time = if_else(started_time < as.POSIXct('1970-01-01 00:00:00', 'UTC'), started_time + 86400, started_time))
}

Apply function

Tiempo_conteo2 <- convert_and_adjust_time(Tiempo_conteo)
Tiempo_conteo_casual2 <- convert_and_adjust_time(Tiempo_conteo_casual)
Tiempo_conteo_member2 <- convert_and_adjust_time(Tiempo_conteo_member)

Dot plot visualization

4. What is the average trip duration?


Convert time to POSIXct for plotting

Iā€™ve created a function called ā€œcalculate_ride_lengthā€. This function does the following:

  1. Takes in a data frame data and an optional argument member_type. The if statement checks if member_type is provided. If it is, the function filters data to only include rows where the member_casual column matches member_type. If it isnā€™t provided, all rows are included.

  2. The function then modifies the ride_length column of the data data frame by converting it to POSIXct format and adding 86400 seconds (one day) if the ride_length is less than 1970-01-01 00:00:00 in UTC time. This is done using the mutate and if_else functions from dplyr.

  3. Finally, the function returns the modified data data frame.

calculate_ride_length <- function(data, member_type = NULL) {
  if (!is.null(member_type)) {
    data <- data %>% filter(member_casual == member_type)
  }
  data <- data %>% mutate(
    ride_length = as.POSIXct(ride_length),
    ride_length = if_else(ride_length < as.POSIXct('1970-01-01 00:00:00', 'UTC'), ride_length + 86400, ride_length)
  )
  return(data)
}
BikeTrips_todos_casual <- calculate_ride_length(BikeTrips_todos, "casual")

BikeTrips_todos_member <- calculate_ride_length(BikeTrips_todos, "member")

Calculate mean and median for casual and member riders

# I calculated the mean for casual riders and convert it into a numeric value
mean1 <- BikeTrips_todos_casual %>%
   summarize(mean(ride_length))

mean1_numeric <- as.numeric(mean1)
mean1
## # A tibble: 1 Ɨ 1
##   `mean(ride_length)`
##   <dttm>             
## 1 1970-01-01 00:22:51
# I calculated the median for casual riders and convert it into a numeric value
median_casual <- BikeTrips_todos_casual %>%
  summarize(median(ride_length))

median1_numeric <- as.numeric(median_casual)
median_casual
## # A tibble: 1 Ɨ 1
##   `median(ride_length)`
##   <dttm>               
## 1 1970-01-01 00:13:23
# I calculated the mean for member riders and convert it into a numeric value
mean2 <- BikeTrips_todos_member %>%
  summarize(mean(ride_length))

mean2_numeric <- as.numeric(mean2)
mean2
## # A tibble: 1 Ɨ 1
##   `mean(ride_length)`
##   <dttm>             
## 1 1970-01-01 00:12:26
# I calculated the median for casual riders and convert it into a numeric value
median_member <- BikeTrips_todos_member %>%
  summarize(median(ride_length))

median2_numeric <- as.numeric(median_member)
median_member
## # A tibble: 1 Ɨ 1
##   `median(ride_length)`
##   <dttm>               
## 1 1970-01-01 00:08:50

Create 2 vertical lines with the mean for plotting

mean1_vertical <- data.frame(x=c(mean1_numeric,mean1_numeric), y=c(-5,5))

mean2_vertical <- data.frame(x=c(mean2_numeric,mean2_numeric), y=c(-5,5))

Histogram visualizations

5. What is the preferred station for starting trips?


Top ten start stations for member and casual riders

# I printed the top ten start stations for member riders
start_station_member <- BikeTrips_todos %>%
  filter(member_casual == "member") %>%
  count(start_station_name, sort = TRUE)

start_station_member %>%
print(n=11 )
## # A tibble: 1,410 Ɨ 2
##    start_station_name                n
##    <chr>                         <int>
##  1 start_station_none_given     503428
##  2 Kingsbury St & Kinzie St      25146
##  3 Clark St & Elm St             22147
##  4 Wells St & Concord Ln         21459
##  5 Ellis Ave & 60th St           19715
##  6 University Ave & 57th St      19551
##  7 Clinton St & Washington Blvd  19022
##  8 Clinton St & Madison St       18898
##  9 Wells St & Elm St             18851
## 10 Loomis St & Lexington St      17848
## 11 Dearborn St & Erie St         17090
## # ā€¦ with 1,399 more rows
# I printed the top ten start stations for casual riders
start_station_casual <- BikeTrips_todos %>%
  filter(member_casual == "casual") %>%
  count(start_station_name, sort = TRUE) 

  start_station_casual %>%
  print(n=11 )
## # A tibble: 1,515 Ɨ 2
##    start_station_name                      n
##    <chr>                               <int>
##  1 start_station_none_given           356308
##  2 Streeter Dr & Grand Ave             54791
##  3 DuSable Lake Shore Dr & Monroe St   30394
##  4 Millennium Park                     25548
##  5 Michigan Ave & Oak St               23580
##  6 DuSable Lake Shore Dr & North Blvd  21750
##  7 Shedd Aquarium                      19405
##  8 Theater on the Lake                 17293
##  9 Wells St & Concord Ln               15832
## 10 Clark St & Armitage Ave             13450
## 11 Dusable Harbor                      13335
## # ā€¦ with 1,504 more rows

Top ten end stations for member and casual riders

 # I printed the top ten end stations for casual riders

end_station_casual <- BikeTrips_todos %>%
  filter(member_casual == "casual") %>%
  count(end_station_name, sort = TRUE) 

end_station_casual %>%
print(n = 11)
## # A tibble: 1,529 Ɨ 2
##    end_station_name                        n
##    <chr>                               <int>
##  1 end_station_none_given             418061
##  2 Streeter Dr & Grand Ave             56605
##  3 DuSable Lake Shore Dr & Monroe St   27989
##  4 Millennium Park                     26597
##  5 Michigan Ave & Oak St               24831
##  6 DuSable Lake Shore Dr & North Blvd  23965
##  7 Theater on the Lake                 18217
##  8 Shedd Aquarium                      17902
##  9 Wells St & Concord Ln               15152
## 10 Clark St & Armitage Ave             13414
## 11 Clark St & Lincoln Ave              13089
## # ā€¦ with 1,518 more rows
# I printed the top ten end stations for member riders
end_station_member <- BikeTrips_todos %>%
  filter(member_casual == "member") %>%
  count(end_station_name, sort = TRUE) 


end_station_member %>%
  print(n = 11)
## # A tibble: 1,397 Ɨ 2
##    end_station_name                  n
##    <chr>                         <int>
##  1 end_station_none_given       501666
##  2 Kingsbury St & Kinzie St      24630
##  3 Clark St & Elm St             22474
##  4 Wells St & Concord Ln         21983
##  5 University Ave & 57th St      20360
##  6 Clinton St & Washington Blvd  19671
##  7 Clinton St & Madison St       19532
##  8 Ellis Ave & 60th St           19230
##  9 Wells St & Elm St             18625
## 10 Broadway & Barry Ave          17627
## 11 Loomis St & Lexington St      17351
## # ā€¦ with 1,386 more rows

6. How are the trip start points distributed geographically?


Range of starting latitude of trips

max(BikeTrips_todos$start_lat)
## [1] 45.63503
min(BikeTrips_todos$start_lat)
## [1] 41.64

Histogram visualization of starting latitude of trips

Range of starting longitude of trips

max(BikeTrips_todos$start_lng)
## [1] -73.79648
min(BikeTrips_todos$start_lng)
## [1] -87.84

Histogram visualization of starting longitude of trips

Range of ending latitude of trips

max(BikeTrips_todos$end_lat)
## [1] 91
min(BikeTrips_todos$end_lat)
## [1] 41.39

Histogram visualization of ending latitude of trips

Range of ending longitude of trips

max(BikeTrips_todos$end_lng)
## [1] 181
min(BikeTrips_todos$end_lng)
## [1] -88.97

Histogram visualization of ending longitude of trips