library(tidyverse)
library(stringr)
library(ggpubr)
If you donāt keep your data in the same directory as the code, adapt the path names.
dir1 <- "~"
dir2 <- "Desktop"
dir3 <- "AC"
dir4 <- "Useful"
dir5 <- "Carrer"
dir6 <- "Cursos"
dir7 <- "2. Data analysis-Google"
dir8 <- "Capstone Case study 1"
dir9 <- "How I solved it"
dir10 <- "Data - cleaning - sorting - filtering"
file_name <- "Version 3"
PSDS_PATH <- file.path(dir1, dir2, dir3, dir4, dir5, dir6, dir7, dir8,dir9,dir10, file_name)
BikeTrips_2021_11_04_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2021_11_04_v31.csv'))
BikeTrips_2021_11_04_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2021_11_04_v32.csv'))
BikeTrips_2021_12_08_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2021_12_08_v3.csv'))
BikeTrips_2022_01_06_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_01_06_v3.csv'))
BikeTrips_2022_02_02_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_02_02_v3.csv'))
BikeTrips_2022_03_02_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_03_02_v3.csv'))
BikeTrips_2022_04_06_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_04_06_v3.csv'))
BikeTrips_2022_05_03_v3 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_05_03_v3.csv'))
BikeTrips_2022_06_03_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_06_03_v31.csv'))
BikeTrips_2022_06_03_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_06_03_v32.csv'))
BikeTrips_2022_07_15_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_07_15_v31.csv'))
BikeTrips_2022_07_15_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_07_15_v32.csv'))
BikeTrips_2022_08_05_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_08_05_v31.csv'))
BikeTrips_2022_08_05_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_08_05_v32.csv'))
BikeTrips_2022_09_08_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_09_08_v31.csv'))
BikeTrips_2022_09_08_v32 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_09_08_v32.csv'))
BikeTrips_2022_10_11_v31 <- read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_10_11_v31.csv'))
BikeTrips_2022_10_11_v32 <-read_csv(file.path(PSDS_PATH, 'BikeTrips_2022_10_11_v32.csv'))
To ensure that I could merge the data frames together later, I checked the column names and corrected any spelling mistakes as needed to make them consistent across all the data frames.
colnames(BikeTrips_2021_12_08_v3)[4] <- "started_time"
colnames(BikeTrips_2022_02_02_v3)[4] <- "started_time"
colnames(BikeTrips_2022_02_02_v3)[5] <- "end_date"
colnames(BikeTrips_2022_02_02_v3)[6] <- "end_time"
BikeTrips_todos <- rbind(BikeTrips_2021_11_04_v31, BikeTrips_2021_11_04_v32, BikeTrips_2021_12_08_v3, BikeTrips_2022_01_06_v3, BikeTrips_2022_02_02_v3, BikeTrips_2022_03_02_v3, BikeTrips_2022_04_06_v3, BikeTrips_2022_05_03_v3, BikeTrips_2022_06_03_v31, BikeTrips_2022_06_03_v32, BikeTrips_2022_07_15_v31, BikeTrips_2022_07_15_v32, BikeTrips_2022_08_05_v31, BikeTrips_2022_08_05_v32, BikeTrips_2022_09_08_v31, BikeTrips_2022_09_08_v32, BikeTrips_2022_10_11_v31, BikeTrips_2022_10_11_v32)
class(BikeTrips_todos$started_date)
## [1] "character"
BikeTrips_todos$started_date <- mdy(BikeTrips_todos$started_date)
class(BikeTrips_todos$started_date)
## [1] "Date"
class(BikeTrips_todos$end_date)
## [1] "character"
BikeTrips_todos$end_date <- mdy(BikeTrips_todos$end_date)
class(BikeTrips_todos$end_date)
## [1] "Date"
BikeTrips_todos<- arrange(BikeTrips_todos,started_date,started_time)
Iāve used the function duplicated and saves the duplicates in a dataframe called duplicates (The Iāds where duplicated but the data was different)
duplicates <-BikeTrips_todos[duplicated(BikeTrips_todos$ride_id)|duplicated(BikeTrips_todos$ride_id, fromLast=TRUE),]
Therefore, I concluded that there were no duplicates.
Iāve check max and min length of CHR datatypes to find if there were any blanks (below some of them)
max(nchar(BikeTrips_todos$rideable_type))
## [1] 13
min(nchar(BikeTrips_todos$rideable_type))
## [1] 11
The only 3 options are āclassic_bikeā, āelectric_bikeā and ādocked_bikeā which have 12, 13 and 11 characters respectively, therefore, I concluded that there were no blanks.
Iāve check max and min values of NUM datatypes to find if there were any blanks (below some of them)
max(BikeTrips_todos$start_lng)
## [1] -73.79648
min(BikeTrips_todos$start_lng)
## [1] -87.84
Longitudes could range from -180 to 0 (West) or 0 to 180 (east). Therefore, the longitudes are inside the range. Altough, Chicago longides are between -87.63 to -87.52 degrees West, which means that there are some values out of that range, we will have to decide what to do with those measure later on.
Iāve used this function which is similar to countIF to know how many observations had more than 8 characters
sum(nchar(BikeTrips_todos$rideable_type) > 8, na.rm=TRUE)
## [1] 5528236
All observations must fall within this range, as the āBikeTrips_todosā data frame has 5528236 observations, we conclude that the ācountā is correct.
Iāve used with this function which is similar to countIF to know how many observations had latitudes above 8Āŗ.
sum(BikeTrips_todos$start_lat > 8, na.rm=TRUE)
## [1] 5528236
All observations must fall within this range, as the Chicagoās latitude range is between 41.6ĀŗN and 42ĀŗN, the data frame has 5528236 observations, we conclude that the ācountā is correct.
Iāve use this function to erase all blank spaces before and after all characters
BikeTrips_todos <- BikeTrips_todos %>% mutate(across(where(is.character), str_trim))
Iāve created a function called conteo. This function does the following:
Filters the āBikeTrips_todosā data frame by casual or member.
Count the rides per type of ride (classic, electric, docked).
Creates a new column with the % calculations.
Scale the percentages in a new column called ālablesā as characters to plot later on.
conteo<- function(casual_or_member) {BikeTrips_todos %>%
filter(member_casual == casual_or_member) %>%
group_by(rideable_type) %>% # Variable to be transformed
count(rideable_type) %>%
ungroup() %>%
mutate(perc = n / sum(n)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc)) }
conteoA <- conteo("casual")
conteoB <- conteo("member")
conteoA
## # A tibble: 3 Ć 4
## rideable_type n perc labels
## <chr> <int> <dbl> <chr>
## 1 docked_bike 180575 0.0801 8%
## 2 classic_bike 872265 0.387 39%
## 3 electric_bike 1202385 0.533 53%
conteoB
## # A tibble: 2 Ć 4
## rideable_type n perc labels
## <chr> <int> <dbl> <chr>
## 1 electric_bike 1565095 0.478 47.8%
## 2 classic_bike 1707916 0.522 52.2%
Default is for Sunday to be the first day on R, I used the function factor to order the āday_of_weekā column
BikeTrips_todos_ordenado_por_dia<-BikeTrips_todos
BikeTrips_todos_ordenado_por_dia$day_of_week <- factor(BikeTrips_todos_ordenado_por_dia$day_of_week, c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"))
Iāve created a function called day. This function does the following:
Filters the āBikeTrips_todos_ordenado_por_diaā data frame by casual or member.
Sort and count the rides per day of the week (monday, tuesday, etc).
Creates a new column with the % calculations.
Scale the percentages in a new column called ālabelsā as characters to plot later on.
day <- function(casual_or_member) {BikeTrips_todos_ordenado_por_dia %>%
filter(member_casual == casual_or_member) %>%
group_by(day_of_week) %>% # Variable to be transformed
count(day_of_week) %>%
ungroup() %>%
mutate(perc = n / sum(n)) %>%
arrange(perc) %>%
mutate(labels = scales::percent(perc)) }
day_casual <- day("casual")
day_member <- day("member")
day_casual
## # A tibble: 7 Ć 4
## day_of_week n perc labels
## <fct> <int> <dbl> <chr>
## 1 Tuesday 255428 0.113 11.33%
## 2 Wednesday 259168 0.115 11.49%
## 3 Monday 263845 0.117 11.70%
## 4 Thursday 283164 0.126 12.56%
## 5 Friday 326462 0.145 14.48%
## 6 Sunday 387598 0.172 17.19%
## 7 Saturday 479560 0.213 21.26%
day_member
## # A tibble: 7 Ć 4
## day_of_week n perc labels
## <fct> <int> <dbl> <chr>
## 1 Sunday 381771 0.117 11.66%
## 2 Saturday 444170 0.136 13.57%
## 3 Monday 456314 0.139 13.94%
## 4 Friday 465679 0.142 14.23%
## 5 Thursday 501276 0.153 15.32%
## 6 Wednesday 509373 0.156 15.56%
## 7 Tuesday 514428 0.157 15.72%
Iāve created a function called ācalculate_bike_trip_countsā. This function does the following:
The if statement checks if member_type is provided. If it is, the function filters data to only include rows where the member_casual column matches member_type. If it isnāt provided, all rows are included.
The function then groups the rows of the data data frame by started_time column and counts the number of occurrences of each unique value in the started_time column, sorted in descending order. The result is a new data frame trip_counts that shows the count of bike trips for each unique started_time value.
Finally, the function returns the trip_counts data frame.
calculate_bike_trip_counts <- function(data, member_type = NULL) {
if (!is.null(member_type)) {
data <- data %>% filter(member_casual == member_type)
}
trip_counts <- data %>%
group_by(started_time) %>%
count(started_time, sort = TRUE)
return(trip_counts)
}
Tiempo_conteo <- calculate_bike_trip_counts(BikeTrips_todos, NULL)
Tiempo_conteo_casual <- calculate_bike_trip_counts(BikeTrips_todos, "casual")
Tiempo_conteo_member <- calculate_bike_trip_counts(BikeTrips_todos, "member")
Iāve created a function called āconvert_and_adjust_timeā. This function does the following:
Takes a data frame and converts the started_time column to POSIXct format
Adjusts any values less than the Unix epoch time to be one day ahead.
convert_and_adjust_time <- function(df) {
df %>%
mutate(started_time = as.POSIXct(started_time),
started_time = if_else(started_time < as.POSIXct('1970-01-01 00:00:00', 'UTC'), started_time + 86400, started_time))
}
Tiempo_conteo2 <- convert_and_adjust_time(Tiempo_conteo)
Tiempo_conteo_casual2 <- convert_and_adjust_time(Tiempo_conteo_casual)
Tiempo_conteo_member2 <- convert_and_adjust_time(Tiempo_conteo_member)
Iāve created a function called ācalculate_ride_lengthā. This function does the following:
Takes in a data frame data and an optional argument member_type. The if statement checks if member_type is provided. If it is, the function filters data to only include rows where the member_casual column matches member_type. If it isnāt provided, all rows are included.
The function then modifies the ride_length column of the data data frame by converting it to POSIXct format and adding 86400 seconds (one day) if the ride_length is less than 1970-01-01 00:00:00 in UTC time. This is done using the mutate and if_else functions from dplyr.
Finally, the function returns the modified data data frame.
calculate_ride_length <- function(data, member_type = NULL) {
if (!is.null(member_type)) {
data <- data %>% filter(member_casual == member_type)
}
data <- data %>% mutate(
ride_length = as.POSIXct(ride_length),
ride_length = if_else(ride_length < as.POSIXct('1970-01-01 00:00:00', 'UTC'), ride_length + 86400, ride_length)
)
return(data)
}
BikeTrips_todos_casual <- calculate_ride_length(BikeTrips_todos, "casual")
BikeTrips_todos_member <- calculate_ride_length(BikeTrips_todos, "member")
# I calculated the mean for casual riders and convert it into a numeric value
mean1 <- BikeTrips_todos_casual %>%
summarize(mean(ride_length))
mean1_numeric <- as.numeric(mean1)
mean1
## # A tibble: 1 Ć 1
## `mean(ride_length)`
## <dttm>
## 1 1970-01-01 00:22:51
# I calculated the median for casual riders and convert it into a numeric value
median_casual <- BikeTrips_todos_casual %>%
summarize(median(ride_length))
median1_numeric <- as.numeric(median_casual)
median_casual
## # A tibble: 1 Ć 1
## `median(ride_length)`
## <dttm>
## 1 1970-01-01 00:13:23
# I calculated the mean for member riders and convert it into a numeric value
mean2 <- BikeTrips_todos_member %>%
summarize(mean(ride_length))
mean2_numeric <- as.numeric(mean2)
mean2
## # A tibble: 1 Ć 1
## `mean(ride_length)`
## <dttm>
## 1 1970-01-01 00:12:26
# I calculated the median for casual riders and convert it into a numeric value
median_member <- BikeTrips_todos_member %>%
summarize(median(ride_length))
median2_numeric <- as.numeric(median_member)
median_member
## # A tibble: 1 Ć 1
## `median(ride_length)`
## <dttm>
## 1 1970-01-01 00:08:50
mean1_vertical <- data.frame(x=c(mean1_numeric,mean1_numeric), y=c(-5,5))
mean2_vertical <- data.frame(x=c(mean2_numeric,mean2_numeric), y=c(-5,5))
# I printed the top ten start stations for member riders
start_station_member <- BikeTrips_todos %>%
filter(member_casual == "member") %>%
count(start_station_name, sort = TRUE)
start_station_member %>%
print(n=11 )
## # A tibble: 1,410 Ć 2
## start_station_name n
## <chr> <int>
## 1 start_station_none_given 503428
## 2 Kingsbury St & Kinzie St 25146
## 3 Clark St & Elm St 22147
## 4 Wells St & Concord Ln 21459
## 5 Ellis Ave & 60th St 19715
## 6 University Ave & 57th St 19551
## 7 Clinton St & Washington Blvd 19022
## 8 Clinton St & Madison St 18898
## 9 Wells St & Elm St 18851
## 10 Loomis St & Lexington St 17848
## 11 Dearborn St & Erie St 17090
## # ā¦ with 1,399 more rows
# I printed the top ten start stations for casual riders
start_station_casual <- BikeTrips_todos %>%
filter(member_casual == "casual") %>%
count(start_station_name, sort = TRUE)
start_station_casual %>%
print(n=11 )
## # A tibble: 1,515 Ć 2
## start_station_name n
## <chr> <int>
## 1 start_station_none_given 356308
## 2 Streeter Dr & Grand Ave 54791
## 3 DuSable Lake Shore Dr & Monroe St 30394
## 4 Millennium Park 25548
## 5 Michigan Ave & Oak St 23580
## 6 DuSable Lake Shore Dr & North Blvd 21750
## 7 Shedd Aquarium 19405
## 8 Theater on the Lake 17293
## 9 Wells St & Concord Ln 15832
## 10 Clark St & Armitage Ave 13450
## 11 Dusable Harbor 13335
## # ā¦ with 1,504 more rows
# I printed the top ten end stations for casual riders
end_station_casual <- BikeTrips_todos %>%
filter(member_casual == "casual") %>%
count(end_station_name, sort = TRUE)
end_station_casual %>%
print(n = 11)
## # A tibble: 1,529 Ć 2
## end_station_name n
## <chr> <int>
## 1 end_station_none_given 418061
## 2 Streeter Dr & Grand Ave 56605
## 3 DuSable Lake Shore Dr & Monroe St 27989
## 4 Millennium Park 26597
## 5 Michigan Ave & Oak St 24831
## 6 DuSable Lake Shore Dr & North Blvd 23965
## 7 Theater on the Lake 18217
## 8 Shedd Aquarium 17902
## 9 Wells St & Concord Ln 15152
## 10 Clark St & Armitage Ave 13414
## 11 Clark St & Lincoln Ave 13089
## # ā¦ with 1,518 more rows
# I printed the top ten end stations for member riders
end_station_member <- BikeTrips_todos %>%
filter(member_casual == "member") %>%
count(end_station_name, sort = TRUE)
end_station_member %>%
print(n = 11)
## # A tibble: 1,397 Ć 2
## end_station_name n
## <chr> <int>
## 1 end_station_none_given 501666
## 2 Kingsbury St & Kinzie St 24630
## 3 Clark St & Elm St 22474
## 4 Wells St & Concord Ln 21983
## 5 University Ave & 57th St 20360
## 6 Clinton St & Washington Blvd 19671
## 7 Clinton St & Madison St 19532
## 8 Ellis Ave & 60th St 19230
## 9 Wells St & Elm St 18625
## 10 Broadway & Barry Ave 17627
## 11 Loomis St & Lexington St 17351
## # ā¦ with 1,386 more rows
max(BikeTrips_todos$start_lat)
## [1] 45.63503
min(BikeTrips_todos$start_lat)
## [1] 41.64
max(BikeTrips_todos$start_lng)
## [1] -73.79648
min(BikeTrips_todos$start_lng)
## [1] -87.84
max(BikeTrips_todos$end_lat)
## [1] 91
min(BikeTrips_todos$end_lat)
## [1] 41.39
max(BikeTrips_todos$end_lng)
## [1] 181
min(BikeTrips_todos$end_lng)
## [1] -88.97