library(tidyverse)
library(stringr)
library(ggpubr)
library(knitr)
If you don't keep your data in the same directory as the code, adapt the path names.
dir1 <- "~"
dir2 <- "Desktop"
dir3 <- "AC"
dir4 <- "Useful"
dir5 <- "Carrer"
dir6 <- "Skills "
dir7 <- "3. Skills para trabajo"
dir8 <- "10. R data science, statistics, machine learning"
dir9 <- "Portfolio analysis"
dir10 <- "2. A:B test"
file_name <- "Data"
PSDS_PATH <- file.path(dir1, dir2, dir3, dir4, dir5, dir6, dir7, dir8, dir9, dir10, file_name)
AB_Test_Results <- read_csv(file.path(PSDS_PATH, 'AB_Test_Results.csv'))
str(AB_Test_Results)
## spc_tbl_ [10,000 × 3] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ USER_ID : num [1:10000] 737 2423 9411 7311 6174 ...
## $ VARIANT_NAME: chr [1:10000] "variant" "control" "control" "control" ...
## $ REVENUE : num [1:10000] 0 0 0 0 0 0 0 0 0 0 ...
## - attr(*, "spec")=
## .. cols(
## .. USER_ID = col_double(),
## .. VARIANT_NAME = col_character(),
## .. REVENUE = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
AB_Test_Results <- arrange(AB_Test_Results, REVENUE, USER_ID, VARIANT_NAME)
We start by plotting a histogram to check whether there are any duplicates.
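The original plotting code is not shown here; a minimal sketch of how such a histogram could be produced (the binwidth is our assumption):
# count how many rows each USER_ID has and plot the distribution of counts;
# any count above 1 indicates duplicates
AB_Test_Results %>%
  count(USER_ID) %>%
  ggplot(aes(x = n)) +
  geom_histogram(binwidth = 1) +
  labs(x = 'Rows per USER_ID', y = 'Number of users')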
We then use the unique() function to eliminate rows that are completely identical.
Unique_AB_Test_Results <- unique(AB_Test_Results[, c('USER_ID', 'VARIANT_NAME', 'REVENUE')])
nrow(Unique_AB_Test_Results)
## [1] 7933
We are left with 7933 observations (which means that more than 20% of the rows were duplicates!).
Exploring the data, we found users that appeared in both groups (for example, user ID 3 shows up under both variants). This could mean that the user took the test twice, once with each variant.
Therefore, to avoid any kind of bias, we decided to keep only the users that were exposed to exactly one of our variants.
We used a few vectorized calculations to achieve this.
# flag duplicated control-group rows whose USER_ID also appears in the variant group
dup_rows1 <- duplicated(Unique_AB_Test_Results$USER_ID) & Unique_AB_Test_Results$VARIANT_NAME == "control" &
Unique_AB_Test_Results$USER_ID %in% Unique_AB_Test_Results$USER_ID[Unique_AB_Test_Results$VARIANT_NAME == "variant"]
# also flag the first occurrence of each affected USER_ID
dup_rows1[match(unique(Unique_AB_Test_Results$USER_ID[dup_rows1]), Unique_AB_Test_Results$USER_ID)] <- TRUE
# print the result
print(dup_rows1)
num_true1 <- sum(dup_rows1)
print(num_true1)
# flag duplicated variant-group rows whose USER_ID also appears in the control group
dup_rows2 <- duplicated(Unique_AB_Test_Results$USER_ID) & Unique_AB_Test_Results$VARIANT_NAME == "variant" &
Unique_AB_Test_Results$USER_ID %in% Unique_AB_Test_Results$USER_ID[Unique_AB_Test_Results$VARIANT_NAME == "control"]
# also flag the first occurrence of each affected USER_ID
dup_rows2[match(unique(Unique_AB_Test_Results$USER_ID[dup_rows2]), Unique_AB_Test_Results$USER_ID)] <- TRUE
# print the result
print(dup_rows2)
num_true2 <- sum(dup_rows2)
print(num_true2)
# combine both flags with a logical OR
dup_rows3 <- dup_rows1 | dup_rows2
# drop every row that belongs to a user who saw both variants
One_time_AB_Test_Results <- Unique_AB_Test_Results[!dup_rows3, ]
We ended up with 4821 observations.
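For comparison, the same filtering can be expressed more compactly with dplyr; a sketch (the name One_time_alt is ours, not from the original):
# keep only the users that appear under a single variant
One_time_alt <- Unique_AB_Test_Results %>%
  group_by(USER_ID) %>%
  filter(n_distinct(VARIANT_NAME) == 1) %>%
  ungroup()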
There are still duplicated user IDs, but those users saw the same variant each time, so we decided it makes sense to keep only the observation with the highest revenue for each user.
Max_Revenue_One_time_AB_Test_Results <- One_time_AB_Test_Results %>%
group_by(USER_ID) %>%
slice(which.max(REVENUE)) %>%
ungroup()
We ended up with 4783 observations.
Then we explore with a histogram again:
Our dataframe finally has no duplicates.
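A quick programmatic check (our addition, not part of the original code) confirms it:
# should return FALSE: no USER_ID appears more than once
any(duplicated(Max_Revenue_One_time_AB_Test_Results$USER_ID))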
max(nchar(Max_Revenue_One_time_AB_Test_Results$VARIANT_NAME))
## [1] 7
min(nchar(Max_Revenue_One_time_AB_Test_Results$VARIANT_NAME))
## [1] 7
The only two possible values are "control" and "variant", and both have 7 characters; therefore, we can conclude that there are no stray blank spaces.
max(Max_Revenue_One_time_AB_Test_Results$REVENUE)
## [1] 196.01
min(Max_Revenue_One_time_AB_Test_Results$REVENUE)
## [1] 0
There are no unexpected values (such as negative revenues).
We decided to filter by variant name so that we could compare both groups in our analysis.
Control_group_cleaned <- Max_Revenue_One_time_AB_Test_Results %>%
  filter(VARIANT_NAME == 'control')
Variant_group_cleaned <- Max_Revenue_One_time_AB_Test_Results %>%
  filter(VARIANT_NAME == 'variant')
We start with a boxplot comparing both variants.
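The plotting code is not shown; a plausible sketch using ggpubr, which is already loaded (the exact aesthetics are our assumption):
# boxplot of revenue by variant; one point per user
ggboxplot(Max_Revenue_One_time_AB_Test_Results,
          x = 'VARIANT_NAME', y = 'REVENUE',
          color = 'VARIANT_NAME',
          xlab = 'Variant', ylab = 'Revenue (USD)')
The same call can be reused for the later boxplots by swapping in the filtered data frames.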
An outlier can be seen in the boxplot; we checked it with the following code:
Max_Revenue_One_time_AB_Test_Results %>%
group_by(USER_ID) %>%
arrange(desc(REVENUE)) %>%
head()
## # A tibble: 6 × 3
## # Groups: USER_ID [6]
## USER_ID VARIANT_NAME REVENUE
## <dbl> <chr> <dbl>
## 1 3342 control 196.
## 2 2166 control 29.3
## 3 7356 variant 23.0
## 4 1053 control 20.1
## 5 3684 variant 19.5
## 6 282 control 18.6
As it is just one observation out of 4783, we decided to exclude it.
Filtered_Max_Revenue_One_time_AB_Test_Results <- Max_Revenue_One_time_AB_Test_Results %>%
filter(REVENUE != max(REVENUE))
Now we plot the boxplot again, this time without the outlier.
It clearly shows how the data is distributed, but we thought it would also be beneficial to explore only the observations that bring revenue (REVENUE > 0). So we start by applying a filter.
Non_Empty_Filtered_AB_Test_Results <- Filtered_Max_Revenue_One_time_AB_Test_Results %>%
filter(REVENUE > 0)
No_max_Control_group_cleaned <- Control_group_cleaned %>%
filter(REVENUE != max(REVENUE))
Non_Empty_Control_group_cleaned <- Control_group_cleaned %>%
filter(REVENUE > 0) %>%
filter(REVENUE != max(REVENUE))
Non_Empty_Variant_group_cleaned <- Variant_group_cleaned %>%
filter(REVENUE > 0)
Then, we plot it.
In this boxplot, more information about the two groups can be observed:

- The control group's statistics (interquartile range, median, range) are higher than those of the variant group.
- The control group has a higher count of values above the interquartile range.
We also plotted histograms to explore further how the revenue is distributed among the two groups. An important observation is that most of the users brought in less than $5.
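A sketch of how these histograms could be produced (assumed form; the original plotting code is not shown):
# one histogram panel per group, zero-revenue users excluded
ggplot(Non_Empty_Filtered_AB_Test_Results, aes(x = REVENUE, fill = VARIANT_NAME)) +
  geom_histogram(binwidth = 1, alpha = 0.6) +
  facet_wrap(~ VARIANT_NAME) +
  labs(x = 'Revenue (USD)', y = 'Count', fill = 'Group')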
The histogram suggests that the revenue is not normally distributed, but we decided to confirm it with a Q-Q plot.
# We first standardize the REVENUE of each group and store the result
Standarized_Revenue_Control_group <- scale(Non_Empty_Control_group_cleaned$REVENUE)
Standarized_Revenue_Variant_group <- scale(Non_Empty_Variant_group_cleaned$REVENUE)
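The Q-Q plot code itself is not shown; a minimal sketch of what was likely plotted:
# Q-Q plots of the standardized revenues against a theoretical normal distribution
qqnorm(Standarized_Revenue_Control_group, main = 'Control group')
qqline(Standarized_Revenue_Control_group)
qqnorm(Standarized_Revenue_Variant_group, main = 'Variant group')
qqline(Standarized_Revenue_Variant_group)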
If the values were close to the line, we could conclude that the revenue is normally distributed, but this is not the case.
control_mean <- mean(No_max_Control_group_cleaned$REVENUE)
control_mean
mean(No_max_Control_group_cleaned$REVENUE, trim=0.1)
median(No_max_Control_group_cleaned$REVENUE)
variant_mean <- mean(Variant_group_cleaned$REVENUE)
variant_mean
mean(Variant_group_cleaned$REVENUE, trim=0.1)
median(Variant_group_cleaned$REVENUE)
mean(Non_Empty_Control_group_cleaned$REVENUE)
mean(Non_Empty_Control_group_cleaned$REVENUE, trim=0.1)
median(Non_Empty_Control_group_cleaned$REVENUE)
mean(Non_Empty_Variant_group_cleaned$REVENUE)
mean(Non_Empty_Variant_group_cleaned$REVENUE, trim=0.1)
median(Non_Empty_Variant_group_cleaned$REVENUE)
Group | Mean | Trimmed_Mean | Median |
---|---|---|---|
Control | 0.1149226 | 0.000000 | 0.000 |
Variant | 0.0744129 | 0.000000 | 0.000 |
Control, Revenue > 0 | 5.1801887 | 4.058140 | 2.990 |
Variant, Revenue > 0 | 4.2397619 | 3.204706 | 2.835 |
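This table can also be built programmatically; a sketch (the helper summarise_revenue() is ours, not from the original):
summarise_revenue <- function(df, label) {
  tibble(Group = label,
         Mean = mean(df$REVENUE),
         Trimmed_Mean = mean(df$REVENUE, trim = 0.1),
         Median = median(df$REVENUE))
}
bind_rows(summarise_revenue(No_max_Control_group_cleaned, 'Control'),
          summarise_revenue(Variant_group_cleaned, 'Variant'),
          summarise_revenue(Non_Empty_Control_group_cleaned, 'Control, Revenue > 0'),
          summarise_revenue(Non_Empty_Variant_group_cleaned, 'Variant, Revenue > 0')) %>%
  kable()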
range(No_max_Control_group_cleaned$REVENUE)
sd(No_max_Control_group_cleaned$REVENUE)
IQR(No_max_Control_group_cleaned$REVENUE)
mad(No_max_Control_group_cleaned$REVENUE)
range(Variant_group_cleaned$REVENUE)
sd(Variant_group_cleaned$REVENUE)
IQR(Variant_group_cleaned$REVENUE)
mad(Variant_group_cleaned$REVENUE)
range(Non_Empty_Control_group_cleaned$REVENUE)
sd(Non_Empty_Control_group_cleaned$REVENUE)
IQR(Non_Empty_Control_group_cleaned$REVENUE)
mad(Non_Empty_Control_group_cleaned$REVENUE)
range(Non_Empty_Variant_group_cleaned$REVENUE)
sd(Non_Empty_Variant_group_cleaned$REVENUE)
IQR(Non_Empty_Variant_group_cleaned$REVENUE)
mad(Non_Empty_Variant_group_cleaned$REVENUE)
Group | Range | Standard_Deviation | Interquartile_range | Median_absolute_deviation |
---|---|---|---|---|
Control | 0-29.32 | 1.1624946 | 0.0000 | 0.000000 |
Variant | 0-23.04 | 0.8565554 | 0.0000 | 0.000000 |
Control, Revenue > 0 | 0.02-29.32 | 5.9427708 | 3.4200 | 2.579724 |
Variant, Revenue > 0 | 0.02-23.04 | 4.9712738 | 3.0075 | 2.312856 |
quantile(No_max_Control_group_cleaned$REVENUE, p=c(.05, .25, .5, .75, .95, .99))
## 5% 25% 50% 75% 95% 99%
## 0.0000 0.0000 0.0000 0.0000 0.0000 3.2896
quantile(Variant_group_cleaned$REVENUE, p=c(.05, .25, .5, .75, .95, .99))
## 5% 25% 50% 75% 95% 99%
## 0.000 0.000 0.000 0.000 0.000 2.178
quantile(Non_Empty_Control_group_cleaned$REVENUE, p=c(.05, .25, .5, .75, .95, .99))
## 5% 25% 50% 75% 95% 99%
## 0.154 1.560 2.990 4.980 17.654 24.510
quantile(Non_Empty_Variant_group_cleaned$REVENUE, p=c(.05, .25, .5, .75, .95, .99))
## 5% 25% 50% 75% 95% 99%
## 0.0820 1.2625 2.8350 4.2700 12.9900 21.5763
In order to estimate the standard error, we decided to use the bootstrap method.
library(boot)
stat_fun_control <- function(x, idx) mean(x[idx])
boot_obj_control <- boot(No_max_Control_group_cleaned$REVENUE, R=1000, statistic=stat_fun_control)
boot_obj_control
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = No_max_Control_group_cleaned$REVENUE, statistic = stat_fun_control,
## R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.1149226 0.0005581247 0.02263854
It can be observed that the original mean is 0.1149226 and the bootstrap estimate of the standard error is 0.02263854. There is also a very low bias, which gives us confidence in the estimate of the standard error.
stat_fun_variant <- function(x, idx) mean(x[idx])
boot_obj_variant <- boot(Variant_group_cleaned$REVENUE, R=1000, statistic=stat_fun_variant)
boot_obj_variant
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Variant_group_cleaned$REVENUE, statistic = stat_fun_variant,
## R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 0.07441287 -0.0002333389 0.01713641
It can be observed that the original mean is 0.07441287 and the bootstrap estimate of the standard error is 0.01713641. There is also a very low bias, which gives us confidence in the estimate of the standard error.
boot_ci_control <- boot.ci(boot_obj_control, conf=0.95, type='basic')
X_control <- data.frame(mean=boot_obj_control$t)
ci95_control <- boot_ci_control$basic[4:5]
ci_control <- data.frame(ci_control=ci95_control, y=c(8, 12))
ci_control
## ci_control y
## 1 0.06728833 8
## 2 0.15559826 12
boot_ci_variant <- boot.ci(boot_obj_variant, conf=0.95, type='basic')
X_variant <- data.frame(mean=boot_obj_variant$t)
ci95_variant <- boot_ci_variant$basic[4:5]
ci_variant <- data.frame(ci_variant=ci95_variant, y=c(8, 12))
ci_variant
## ci_variant y
## 1 0.03723699 8
## 2 0.10486313 12
Then we plot the density curves of both groups to see if they overlap:
ggplot() +
stat_density(data = X_control, aes(x = mean, group = 1, fill = "Control"), alpha = 0.5) +
stat_density(data = X_variant, aes(x = mean, group = 1, fill = "Variant"), alpha = 0.5) +
geom_vline(data = X_control, aes(xintercept = control_mean), linetype = 2, color = "#F8766D") +
geom_vline(data = X_variant, aes(xintercept = variant_mean), linetype = 2, color = "#00BFC4") +
theme_bw() +
labs(title = "Density curve of both groups", x = "Mean samples", y = "Density", fill = "") +
scale_fill_manual(values = c("#F8766D", "#00BFC4"), labels = c("Control", "Variant"))
As can be noticed, their confidence intervals do overlap; therefore, no conclusion can be drawn about whether the control group brings more revenue.
What we can do to determine whether the control group brings more revenue than the variant group is to test whether the difference between the means is statistically significant.
mean_diff <- control_mean - variant_mean
mean_diff
## [1] 0.04050969
The question is whether this difference is within the range of what random chance might produce.
One way to answer this is to apply a permutation test: combine all the revenues, then repeatedly shuffle them and split them into a group of 2389 (recall that n = 2389 for the control group) and a group of 2393 (recall that n = 2393 for the variant group).
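The helper perm_fun() is not defined in this section; below is a minimal sketch consistent with how it is called (an assumed implementation of the shuffle-and-split step described above):
perm_fun <- function(x, nA, nB) {
  n <- nA + nB
  idx_b <- sample(1:n, nB)         # random draw of nB indices
  idx_a <- setdiff(1:n, idx_b)     # the remaining nA indices
  mean(x[idx_a]) - mean(x[idx_b])  # permuted difference in means
}
Since the permutation distribution is symmetric around zero, the sign convention does not change the conclusion.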
set.seed(1)
perm_diffs <- rep(0, 1000)
for (i in 1:1000) {
  perm_diffs[i] <- perm_fun(Filtered_Max_Revenue_One_time_AB_Test_Results$REVENUE, 2389, 2393)
}
par(mar=c(4,4,1,0)+.1)
hist(perm_diffs, xlab='Revenue brought differences (USD)', main='')
abline(v=control_mean - variant_mean, lty=2, lwd=1.5)
text(' Observed\n difference', x=control_mean - variant_mean, y=par()$usr[4]-20, adj=0)
mean(perm_diffs > (control_mean - variant_mean))
## [1] 0.082
The histogram shows that the mean differences from the random permutations sometimes exceed the observed difference in revenue (the vertical line); for our results, this happens in 8.2% of the cases.
As the chosen significance level was 5%, the results suggest that the observed difference in revenue between the control group and the variant group is well within the range of chance variation, and thus is not statistically significant.