#Library
knitr::opts_chunk$set(echo = TRUE)
library(dplyr)
library(readr)
library(effectsize)
library(pwr)
library(ggplot2)

Section one : Exploratory data analysis (EDA)

Data Quality Check

setwd("/Users/manasa/Documents/Warwick/Term 2/ADA/Group Assignment")
getwd()
## [1] "/Users/manasa/Documents/Warwick/Term 2/ADA/Group Assignment"
# Read the data
LoanData <- read_csv("project_data.csv")
## Rows: 470 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Variant, loanofficer_id
## dbl (20): day, typeI_init, typeI_fin, typeII_init, typeII_fin, agree_init, a...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Check the class of each column

#print a summary of the structure of LoanData
str(LoanData)
## spc_tbl_ [470 × 22] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Variant              : chr [1:470] "Treatment" "Treatment" "Treatment" "Treatment" ...
##  $ loanofficer_id       : chr [1:470] "qamcqdoe" "qamcqdoe" "qamcqdoe" "qamcqdoe" ...
##  $ day                  : num [1:470] 1 2 3 4 5 6 7 8 9 10 ...
##  $ typeI_init           : num [1:470] 0 2 3 1 0 0 0 0 0 0 ...
##  $ typeI_fin            : num [1:470] 0 2 3 2 2 1 1 3 1 0 ...
##  $ typeII_init          : num [1:470] 2 3 0 1 0 4 1 4 4 2 ...
##  $ typeII_fin           : num [1:470] 2 3 0 1 0 0 0 1 1 1 ...
##  $ agree_init           : num [1:470] 7 8 9 8 8 5 8 4 6 9 ...
##  $ agree_fin            : num [1:470] 10 8 9 9 10 10 10 10 10 10 ...
##  $ conflict_init        : num [1:470] 2 2 1 2 2 5 2 6 4 1 ...
##  $ conflict_fin         : num [1:470] 0 2 1 1 0 0 0 0 0 0 ...
##  $ revised_per_ai       : num [1:470] 2 0 0 1 2 5 2 6 4 1 ...
##  $ revised_agst_ai      : num [1:470] 0 0 0 0 0 0 0 0 0 0 ...
##  $ fully_complt         : num [1:470] 9 10 10 10 10 10 10 10 10 10 ...
##  $ confidence_init_total: num [1:470] 706 911 710 694 683 743 993 1000 1000 1000 ...
##  $ confidence_fin_total : num [1:470] 913 974 970 961 1000 1000 1000 1000 1000 1000 ...
##  $ complt_init          : num [1:470] 9 10 10 10 10 10 10 10 10 10 ...
##  $ complt_fin           : num [1:470] 10 10 10 10 10 10 10 10 10 10 ...
##  $ ai_typeI             : num [1:470] 0 1 2 1 2 1 1 3 1 0 ...
##  $ ai_typeII            : num [1:470] 2 2 0 1 0 0 0 1 1 1 ...
##  $ badloans_num         : num [1:470] 4 5 2 3 0 4 1 4 4 3 ...
##  $ goodloans_num        : num [1:470] 6 5 8 7 10 6 9 6 6 7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Variant = col_character(),
##   ..   loanofficer_id = col_character(),
##   ..   day = col_double(),
##   ..   typeI_init = col_double(),
##   ..   typeI_fin = col_double(),
##   ..   typeII_init = col_double(),
##   ..   typeII_fin = col_double(),
##   ..   agree_init = col_double(),
##   ..   agree_fin = col_double(),
##   ..   conflict_init = col_double(),
##   ..   conflict_fin = col_double(),
##   ..   revised_per_ai = col_double(),
##   ..   revised_agst_ai = col_double(),
##   ..   fully_complt = col_double(),
##   ..   confidence_init_total = col_double(),
##   ..   confidence_fin_total = col_double(),
##   ..   complt_init = col_double(),
##   ..   complt_fin = col_double(),
##   ..   ai_typeI = col_double(),
##   ..   ai_typeII = col_double(),
##   ..   badloans_num = col_double(),
##   ..   goodloans_num = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Result: All columns have the appropriate data type/class.

Outliers

Use summary function to see the potential Outliers

summary(LoanData)
##    Variant          loanofficer_id          day         typeI_init    
##  Length:470         Length:470         Min.   : 1.0   Min.   : 0.000  
##  Class :character   Class :character   1st Qu.: 3.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 5.5   Median : 2.000  
##                                        Mean   : 5.5   Mean   : 2.619  
##                                        3rd Qu.: 8.0   3rd Qu.: 4.000  
##                                        Max.   :10.0   Max.   :10.000  
##    typeI_fin      typeII_init      typeII_fin       agree_init   
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.: 2.25  
##  Median :2.000   Median :1.000   Median :0.0000   Median : 7.00  
##  Mean   :1.904   Mean   :1.136   Mean   :0.7298   Mean   : 5.64  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 8.00  
##  Max.   :8.000   Max.   :5.000   Max.   :3.0000   Max.   :10.00  
##    agree_fin      conflict_init    conflict_fin   revised_per_ai  
##  Min.   : 0.000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 5.000   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median : 8.000   Median :2.000   Median :1.000   Median :0.0000  
##  Mean   : 6.602   Mean   :1.938   Mean   :1.253   Mean   :0.8149  
##  3rd Qu.: 9.000   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :10.000   Max.   :8.000   Max.   :8.000   Max.   :8.0000  
##  revised_agst_ai    fully_complt    confidence_init_total confidence_fin_total
##  Min.   :0.00000   Min.   : 0.000   Min.   :  50.0        Min.   :   0.0      
##  1st Qu.:0.00000   1st Qu.: 6.000   1st Qu.: 485.5        1st Qu.: 297.2      
##  Median :0.00000   Median :10.000   Median : 654.0        Median : 649.5      
##  Mean   :0.08511   Mean   : 7.579   Mean   : 624.7        Mean   : 561.9      
##  3rd Qu.:0.00000   3rd Qu.:10.000   3rd Qu.: 770.5        3rd Qu.: 810.8      
##  Max.   :4.00000   Max.   :10.000   Max.   :1000.0        Max.   :1000.0      
##   complt_init      complt_fin        ai_typeI       ai_typeII      badloans_num
##  Min.   : 1.00   Min.   : 0.000   Min.   :0.000   Min.   :0.000   Min.   :0    
##  1st Qu.:10.00   1st Qu.: 9.000   1st Qu.:1.000   1st Qu.:0.000   1st Qu.:2    
##  Median :10.00   Median :10.000   Median :2.000   Median :1.000   Median :3    
##  Mean   : 9.47   Mean   : 7.855   Mean   :1.685   Mean   :1.123   Mean   :3    
##  3rd Qu.:10.00   3rd Qu.:10.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:4    
##  Max.   :10.00   Max.   :10.000   Max.   :5.000   Max.   :3.000   Max.   :5    
##  goodloans_num
##  Min.   : 5   
##  1st Qu.: 6   
##  Median : 7   
##  Mean   : 7   
##  3rd Qu.: 8   
##  Max.   :10

Result : After checking the minimum, maximum, and mean values, no potential outliers were identified. The minimum and maximum values of each column fall within the expected range.

Plot histogram to see potential outliers

library(ggplot2)
library(dplyr)
library(tidyr)

# List of columns to plot
columns_to_plot <- c("day", "typeI_init", "typeI_fin", "typeII_init", "typeII_fin", 
                     "agree_init", "agree_fin", "conflict_init", "conflict_fin", 
                     "revised_per_ai", "revised_agst_ai", "fully_complt", 
                     "confidence_init_total", "confidence_fin_total", 
                     "complt_init", "complt_fin", "ai_typeI", "ai_typeII", 
                     "badloans_num", "goodloans_num")

# Convert data to long format
LoanData_long <- LoanData %>%
  select(all_of(columns_to_plot)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value")

# Plot histograms using facet_wrap
ggplot(LoanData_long, aes(x = Value)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "black", alpha = 0.7) +
  facet_wrap(~ Variable, scales = "free") +  # Free scales for better visualization
  labs(title = "Histograms of Selected Columns", x = "Value", y = "Frequency") +
  theme_minimal()

Result : No potential outliers were identified, even though some values contain a high frequency of 0 and 10. This is expected due to the nature of the data

N/A values

Count total number of N/A values

na_counts <- colSums(is.na(LoanData))
print(na_counts)
##               Variant        loanofficer_id                   day 
##                     0                     0                     0 
##            typeI_init             typeI_fin           typeII_init 
##                     0                     0                     0 
##            typeII_fin            agree_init             agree_fin 
##                     0                     0                     0 
##         conflict_init          conflict_fin        revised_per_ai 
##                     0                     0                     0 
##       revised_agst_ai          fully_complt confidence_init_total 
##                     0                     0                     0 
##  confidence_fin_total           complt_init            complt_fin 
##                     0                     0                     0 
##              ai_typeI             ai_typeII          badloans_num 
##                     0                     0                     0 
##         goodloans_num 
##                     0

Result: found 0 N/A

Check duplicate values

duplicates <- LoanData[duplicated(LoanData), ]
print(duplicates)
## # A tibble: 0 × 22
## # … with 22 variables: Variant <chr>, loanofficer_id <chr>, day <dbl>,
## #   typeI_init <dbl>, typeI_fin <dbl>, typeII_init <dbl>, typeII_fin <dbl>,
## #   agree_init <dbl>, agree_fin <dbl>, conflict_init <dbl>, conflict_fin <dbl>,
## #   revised_per_ai <dbl>, revised_agst_ai <dbl>, fully_complt <dbl>,
## #   confidence_init_total <dbl>, confidence_fin_total <dbl>, complt_init <dbl>,
## #   complt_fin <dbl>, ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>,
## #   goodloans_num <dbl>

Result: No duplicate values

Check unique value of Variant

unique_values_Variant <- sort(unique(LoanData$Variant))  
print(unique_values_Variant)
## [1] "Control"   "Treatment"

Result: There are only 2 variants, which are Control and Treatment, in the data.

Data Exploration

Count Control and Treatment

# Group by Variant and count rows
grouped_counts <- LoanData %>%
  group_by(Variant) %>%
  summarise(Count = n())

# Print the result
print(grouped_counts)
## # A tibble: 2 × 2
##   Variant   Count
##   <chr>     <int>
## 1 Control     190
## 2 Treatment   280
# Plot the grouped counts with labels
ggplot(grouped_counts, aes(x = Variant, y = Count, fill = Variant)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Count), vjust = -0.5, size = 5) +  # Add labels above bars
  labs(title = "Number of Control and Treatment Experiments in Total",
       x = "Variant",
       y = "Count") +
  theme_minimal()

Result : There are 190 controls and 280 treatments in total

# Count the number of each variant (control/treatment) per day
variant_per_day <- LoanData %>%
  group_by(day, Variant) %>%
  summarise(count = n(), .groups = "drop")

# Print the result
print(variant_per_day)
## # A tibble: 20 × 3
##      day Variant   count
##    <dbl> <chr>     <int>
##  1     1 Control      19
##  2     1 Treatment    28
##  3     2 Control      19
##  4     2 Treatment    28
##  5     3 Control      19
##  6     3 Treatment    28
##  7     4 Control      19
##  8     4 Treatment    28
##  9     5 Control      19
## 10     5 Treatment    28
## 11     6 Control      19
## 12     6 Treatment    28
## 13     7 Control      19
## 14     7 Treatment    28
## 15     8 Control      19
## 16     8 Treatment    28
## 17     9 Control      19
## 18     9 Treatment    28
## 19    10 Control      19
## 20    10 Treatment    28
# Create a bar plot
ggplot(variant_per_day, aes(x = factor(day), y = count, fill = Variant)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Count of Control and Treatment per Day", 
       x = "Day", 
       y = "Count", 
       fill = "Variant") +
  theme_minimal()

Result : On each day from day 1 to day 10, 19 control and 28 treatment experiments were conducted.

Check if any loan officers have conducted both control and treatment experiments

# Check if each loan officer has conducted both control and treatment experiments
loan_officer_experiment <- LoanData %>%
  group_by(loanofficer_id) %>%
  summarise(experiment_types = n_distinct(Variant)) %>%
  mutate(Control_and_Treatment = ifelse(experiment_types > 1, "Both Control and Treatment", "Only One experiment"))

# Print the result
print(loan_officer_experiment)
## # A tibble: 47 × 3
##    loanofficer_id experiment_types Control_and_Treatment
##    <chr>                     <int> <chr>                
##  1 0899qxvc                      1 Only One experiment  
##  2 09pij0e2                      1 Only One experiment  
##  3 0g7pi6g8                      1 Only One experiment  
##  4 0gh7r2hr                      1 Only One experiment  
##  5 1ckkyukp                      1 Only One experiment  
##  6 1ha5khxo                      1 Only One experiment  
##  7 2twvlktb                      1 Only One experiment  
##  8 2udootyt                      1 Only One experiment  
##  9 4cdwcblq                      1 Only One experiment  
## 10 530lfgx0                      1 Only One experiment  
## # … with 37 more rows

Result: All loan officers have conducted either control or treatment experiments.

Check total of control or treatment experiments per 1 officer

# Count the number of Control and Treatment experiments per loan officer
experiment_count_per_officer <- LoanData %>%
  group_by(loanofficer_id, Variant) %>%
  summarise(experiment_count = n(), .groups = "drop")

# Print the result
print(experiment_count_per_officer)
## # A tibble: 47 × 3
##    loanofficer_id Variant   experiment_count
##    <chr>          <chr>                <int>
##  1 0899qxvc       Treatment               10
##  2 09pij0e2       Treatment               10
##  3 0g7pi6g8       Control                 10
##  4 0gh7r2hr       Control                 10
##  5 1ckkyukp       Treatment               10
##  6 1ha5khxo       Treatment               10
##  7 2twvlktb       Treatment               10
##  8 2udootyt       Control                 10
##  9 4cdwcblq       Treatment               10
## 10 530lfgx0       Treatment               10
## # … with 37 more rows

Result: All loan officers have conducted either 10 control or 10 treatment experiments.

Check unique value of loanofficer_id

unique_count_loanofficer_id <- length(unique(LoanData$loanofficer_id))
print(unique_count_loanofficer_id)
## [1] 47

Result : Found 47 unique loanofficer id in total

Summary for loan officer id

There are 47 loan officers in total. Each loan officer conducted either Control or Treatment experiments. 19 conducted control experiments, and 28 conducted treatment experiments. Each officer participated in 10 experiments.

Check the total value of badloan and goodloan

LoanData_grouped <- LoanData %>%
  mutate(total_loans = badloans_num + goodloans_num) %>%
  group_by(total_loans) %>%
  summarise(count = n())

# Print the result
print(LoanData_grouped)
## # A tibble: 1 × 2
##   total_loans count
##         <dbl> <int>
## 1          10   470

Result : All rows have a total of 10 bad and good loans combined.

LoanData %>%
  group_by(Variant) %>%
  summarise(avg_good = mean(goodloans_num), avg_bad = mean(badloans_num))
## # A tibble: 2 × 3
##   Variant   avg_good avg_bad
##   <chr>        <dbl>   <dbl>
## 1 Control          7       3
## 2 Treatment        7       3

Result : On average, 7 good loans and 3 bad loans were given to loan officers in both the control and treatment groups during the experiment

Check the total loans that loan officer has made the decision per day in the final stage ( complt_fin )

filtered_complt_fin10 <- LoanData %>%
  filter(complt_fin != 10) %>%
  group_by(Variant) %>%
  summarise(count = n())

# Print the result
print(filtered_complt_fin10)
## # A tibble: 2 × 2
##   Variant   count
##   <chr>     <int>
## 1 Control      97
## 2 Treatment    23

Result : There are 97 control and 23 treatment experiments where fewer than 10 loan decisions were completed per day in the final stage

Check the total value of “agree_fin”, “conflict_fin”

# Sum of agree_fin and conflict_fin
LoanData <- LoanData %>%
  mutate(total_AgreeConflict = agree_fin+conflict_fin)


# Check if any rows have total_loans not equal to 10
rows_AgreeConflict <- LoanData %>%
  filter(total_AgreeConflict ==0)

# Print the rows where the sum of "agree_fin", "conflict_fin" is 0
print(rows_AgreeConflict)
## # A tibble: 90 × 23
##    Variant loano…¹   day typeI…² typeI…³ typeI…⁴ typeI…⁵ agree…⁶ agree…⁷ confl…⁸
##    <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Control 2udoot…     1       2       0       1       0       0       0       0
##  2 Control 2udoot…     2       4       0       1       0       0       0       0
##  3 Control 2udoot…     3       3       0       2       0       0       0       0
##  4 Control 2udoot…     4       3       0       2       0       0       0       0
##  5 Control 2udoot…     5       3       0       0       0       0       0       0
##  6 Control 2udoot…     6       5       0       0       0       0       0       0
##  7 Control 2udoot…     7       5       0       0       0       0       0       0
##  8 Control 2udoot…     8       4       0       1       0       0       0       0
##  9 Control 2udoot…     9       4       0       0       0       0       0       0
## 10 Control 2udoot…    10       2       0       0       0       0       0       0
## # … with 80 more rows, 13 more variables: conflict_fin <dbl>,
## #   revised_per_ai <dbl>, revised_agst_ai <dbl>, fully_complt <dbl>,
## #   confidence_init_total <dbl>, confidence_fin_total <dbl>, complt_init <dbl>,
## #   complt_fin <dbl>, ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>,
## #   goodloans_num <dbl>, total_AgreeConflict <dbl>, and abbreviated variable
## #   names ¹​loanofficer_id, ²​typeI_init, ³​typeI_fin, ⁴​typeII_init, ⁵​typeII_fin,
## #   ⁶​agree_init, ⁷​agree_fin, ⁸​conflict_init

Result : 90 rows with the total value of “agree_fin” and “conflict_fin” equal to 0 which means that there are potential that officer conducted the final decision without seeing any computer model decisions.

Filter rows that complete the decision without using AI at all

# Filter rows based on conditions
filtered_rows <- LoanData %>%
  filter(complt_init > 0 & complt_fin == 0)

# View the filtered rows
print(filtered_rows)
## # A tibble: 90 × 23
##    Variant loano…¹   day typeI…² typeI…³ typeI…⁴ typeI…⁵ agree…⁶ agree…⁷ confl…⁸
##    <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Control 2udoot…     1       2       0       1       0       0       0       0
##  2 Control 2udoot…     2       4       0       1       0       0       0       0
##  3 Control 2udoot…     3       3       0       2       0       0       0       0
##  4 Control 2udoot…     4       3       0       2       0       0       0       0
##  5 Control 2udoot…     5       3       0       0       0       0       0       0
##  6 Control 2udoot…     6       5       0       0       0       0       0       0
##  7 Control 2udoot…     7       5       0       0       0       0       0       0
##  8 Control 2udoot…     8       4       0       1       0       0       0       0
##  9 Control 2udoot…     9       4       0       0       0       0       0       0
## 10 Control 2udoot…    10       2       0       0       0       0       0       0
## # … with 80 more rows, 13 more variables: conflict_fin <dbl>,
## #   revised_per_ai <dbl>, revised_agst_ai <dbl>, fully_complt <dbl>,
## #   confidence_init_total <dbl>, confidence_fin_total <dbl>, complt_init <dbl>,
## #   complt_fin <dbl>, ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>,
## #   goodloans_num <dbl>, total_AgreeConflict <dbl>, and abbreviated variable
## #   names ¹​loanofficer_id, ²​typeI_init, ³​typeI_fin, ⁴​typeII_init, ⁵​typeII_fin,
## #   ⁶​agree_init, ⁷​agree_fin, ⁸​conflict_init

Result : 90 rows were found which all from Control experiments.

Confidence score

Based on our previous found, we also noted that the confidence score in the final stage from rows where sum of “agree_fin”, “conflict_fin” is 0 OR rows where complt_init > 0 & complt_fin == 0 , all have confidence score as 0

### Confidence score 

LoanData %>%
  select(complt_fin,confidence_fin_total)
## # A tibble: 470 × 2
##    complt_fin confidence_fin_total
##         <dbl>                <dbl>
##  1         10                  913
##  2         10                  974
##  3         10                  970
##  4         10                  961
##  5         10                 1000
##  6         10                 1000
##  7         10                 1000
##  8         10                 1000
##  9         10                 1000
## 10         10                 1000
## # … with 460 more rows

####Sum total value of confidence score in the final stage from all rows where sum of “agree_fin”, “conflict_fin” is 0

total_confidence_0 <- rows_AgreeConflict %>%
  summarise(total = sum(confidence_fin_total, na.rm = TRUE))

print(total_confidence_0)
## # A tibble: 1 × 1
##   total
##   <dbl>
## 1     0

####Sum total value of confidence score in the final stage from all rows where complt_init > 0 & complt_fin == 0

total_confidence_withoutAI <- filtered_rows %>%
  summarise(total = sum(confidence_fin_total, na.rm = TRUE))

print(total_confidence_withoutAI)
## # A tibble: 1 × 1
##   total
##   <dbl>
## 1     0

With the total confidence score in the final stage being 0, we assume that for experiments where the sum of agree_fin and conflict_fin is 0, or where complt_init > 0 and complt_fin == 0, loan officers are likely not considering the computer model in their final decision. This may be due to their low confidence in the model or because they did not make any decision in the final stage, resulting in a confidence score of 0

Filter rows that the sum of agree and conflict do not add up to complt_fin AND involves AI in the decision

filtered_agr_conf <- LoanData %>%
  filter((agree_init + conflict_init != complt_init) | (agree_fin + conflict_fin != complt_fin))


filtered_agr_conf_AI <- filtered_agr_conf %>%
  filter(!(complt_init > 0 & complt_fin == 0))

# View the filtered rows
#print(filtered_agr_conf_AI )

filtered_agr_conf_AI %>%
  select(Variant, loanofficer_id, agree_init, conflict_init, complt_init, agree_fin,conflict_fin,complt_fin) %>%
  print()
## # A tibble: 6 × 8
##   Variant   loanofficer_id agree_init conflict…¹ compl…² agree…³ confl…⁴ compl…⁵
##   <chr>     <chr>               <dbl>      <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
## 1 Treatment 92vdohom                9          0      10       7       2       9
## 2 Control   qwun9ha5                4          4      10       6       2       8
## 3 Treatment envu2p1p                6          1       8       6       1       7
## 4 Treatment envu2p1p                3          3      10       6       0       6
## 5 Treatment envu2p1p                5          3      10       6       2       8
## 6 Treatment 9lejzokf                1          7      10       6       2       8
## # … with abbreviated variable names ¹​conflict_init, ²​complt_init, ³​agree_fin,
## #   ⁴​conflict_fin, ⁵​complt_fin

Result : Although the numbers from agree_init and conflict_init do not add up to complt_init, the numbers from agree_fin and conflict_fin do add up to complt_fin. Since our main focus is on the final stage, where loan officers see the computer prediction results, and the mismatches in the initial stage are minor, we have decided to keep these rows unchanged. The inconsistencies in the initial stage may be due to staff forgetting to record the agreement or disagreement results. Additionally, given our already small sample size, removing or modifying these six rows could further reduce the data quality. Therefore, we have chosen to retain them as they are.

Filter out rows that complete the decision without using AI at all, as the A/B testing is focusing on whether the new computer model actually improves the loan officers’ decision quality or not. Thus, the decision without AI will be excluded.

# Remove rows where complt_init > 0 and complt_fin == 0
df <- LoanData %>%
  filter(!(complt_init > 0 & complt_fin == 0))

# View the remaining rows
print(df)
## # A tibble: 380 × 23
##    Variant loano…¹   day typeI…² typeI…³ typeI…⁴ typeI…⁵ agree…⁶ agree…⁷ confl…⁸
##    <chr>   <chr>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Treatm… qamcqd…     1       0       0       2       2       7      10       2
##  2 Treatm… qamcqd…     2       2       2       3       3       8       8       2
##  3 Treatm… qamcqd…     3       3       3       0       0       9       9       1
##  4 Treatm… qamcqd…     4       1       2       1       1       8       9       2
##  5 Treatm… qamcqd…     5       0       2       0       0       8      10       2
##  6 Treatm… qamcqd…     6       0       1       4       0       5      10       5
##  7 Treatm… qamcqd…     7       0       1       1       0       8      10       2
##  8 Treatm… qamcqd…     8       0       3       4       1       4      10       6
##  9 Treatm… qamcqd…     9       0       1       4       1       6      10       4
## 10 Treatm… qamcqd…    10       0       0       2       1       9      10       1
## # … with 370 more rows, 13 more variables: conflict_fin <dbl>,
## #   revised_per_ai <dbl>, revised_agst_ai <dbl>, fully_complt <dbl>,
## #   confidence_init_total <dbl>, confidence_fin_total <dbl>, complt_init <dbl>,
## #   complt_fin <dbl>, ai_typeI <dbl>, ai_typeII <dbl>, badloans_num <dbl>,
## #   goodloans_num <dbl>, total_AgreeConflict <dbl>, and abbreviated variable
## #   names ¹​loanofficer_id, ²​typeI_init, ³​typeI_fin, ⁴​typeII_init, ⁵​typeII_fin,
## #   ⁶​agree_init, ⁷​agree_fin, ⁸​conflict_init
control_count <- sum(df$Variant == "Control")
treatment_count <- sum(df$Variant == "Treatment")

print(control_count)
## [1] 100
print(treatment_count)
## [1] 280

Section two : Data Analysis: Hypothesis Testing

## Set categorical variable
df$Variant <- factor(df$Variant)

OEC no.1 : Type I and Type II Error rate

First, we can evaluate the performance of the new model using the following metrics as OEC (Overall Evaluation Criteria):

  1. Type I Error Rate = False Positives / Total Actual Negatives = typeI_fin / goodloans_num
  • Hypothesis: The new model will reduce the Type I Error Rate – single tailed t-test
  1. Type II Error Rate = False Negatives / Total Actual Positives = typeII_fin / badloans_num
  • Hypothesis: The new model will reduce the Type II Error Rate – single tailed t-test

Both errors are considered after the officer reviews the computer prediction (as it is in the current process).

df1<-df

# Calculate Type I and Type II error rates aggregating the information by loan officer ID 

df1 <- df1 %>% group_by(Variant,loanofficer_id) %>%
  summarise(
    Type_I_Error_Rate = sum(typeI_fin) / sum(goodloans_num),  # False Positives / Total Actual Negatives                                                                                                                                  
    Type_II_Error_Rate = sum(typeII_fin) / sum(badloans_num)  # False Negatives / Total Actual Positives
  )
## `summarise()` has grouped output by 'Variant'. You can override using the
## `.groups` argument.
df1 <- df1 %>%
  mutate(
    Type_I_Error_Rate = ifelse(is.na(Type_I_Error_Rate),0,Type_I_Error_Rate),  # NA with 0
    Type_II_Error_Rate = ifelse(is.na(Type_II_Error_Rate),0,Type_II_Error_Rate)  # NA with 0
  )

Run Welch’s two-sample t-tests to examine if there’s sig. difference between pairs of Variants

Type_I_Error_Rate

t.test(
  Type_I_Error_Rate ~ Variant, 
  data = df1,
  var.equal = FALSE) # assuming samples have unequal variances (using Welch t-test)
## 
##  Welch Two Sample t-test
## 
## data:  Type_I_Error_Rate by Variant
## t = 4.0648, df = 11.019, p-value = 0.001861
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  0.1035696 0.3480630
## sample estimates:
##   mean in group Control mean in group Treatment 
##               0.5028571               0.2770408

The type I error in control is greater than in treatment.

Type_II_Error_Rate

t.test(
  Type_II_Error_Rate ~ Variant, 
  data = df1,
  var.equal = FALSE) # assuming samples have unequal variances (using Welch t-test)
## 
##  Welch Two Sample t-test
## 
## data:  Type_II_Error_Rate by Variant
## t = 3.4409, df = 10.923, p-value = 0.005571
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  0.04514467 0.20580771
## sample estimates:
##   mean in group Control mean in group Treatment 
##               0.3933333               0.2678571

The type II error in control is greater than in treatment.

The analysis of the two metrics (Ratio of Error Type I, and Error Type II) indicates that the new model is performing better than the existing model.

Compute the difference (in percentage) with each OEC between Variants

# Compute mean OEC for each Variant
mean_OEC_each_Variant <- df1 %>%
  group_by(Variant) %>%
  summarise(mean_Type_I_Error_Rate = mean(Type_I_Error_Rate),
            mean_Type_II_Error_Rate = mean(Type_II_Error_Rate))

# View mean OEC
print(mean_OEC_each_Variant)
## # A tibble: 2 × 3
##   Variant   mean_Type_I_Error_Rate mean_Type_II_Error_Rate
##   <fct>                      <dbl>                   <dbl>
## 1 Control                    0.503                   0.393
## 2 Treatment                  0.277                   0.268
# Compute pairwise % differences in OEC between pairs of variants
pairwise_diff <- mean_OEC_each_Variant %>%
  summarise(
    Dif_Type_I_Error_Rate = mean_Type_I_Error_Rate[Variant == "Treatment"] - mean_Type_I_Error_Rate[Variant == "Control"],
    Dif_Type_II_Error_Rate = mean_Type_II_Error_Rate[Variant == "Treatment"] - mean_Type_II_Error_Rate[Variant == "Control"],
    Perc_Type_I_Error_Rate = (Dif_Type_I_Error_Rate / mean_Type_I_Error_Rate[Variant == "Control"]) * 100,
    Perc_Type_II_Error_Rate = (Dif_Type_II_Error_Rate / mean_Type_II_Error_Rate[Variant == "Control"]) * 100
  )

perc_dif <- pairwise_diff %>% select(Perc_Type_I_Error_Rate,Perc_Type_II_Error_Rate)

# View pairwise differences
print(perc_dif)
## # A tibble: 1 × 2
##   Perc_Type_I_Error_Rate Perc_Type_II_Error_Rate
##                    <dbl>                   <dbl>
## 1                  -44.9                   -31.9

Treatment (new model) reduce the type I error compared to Control (existing model) by 44.9%.

Treatment (new model) reduce the type II error compared to Control (existing model) by 31.9%.

Compute & Interpret Effect Size (Cohen’s d)

Effect size: Control vs Treatment

Type_I_Error_Rate

Control = df1$Type_I_Error_Rate[df1$Variant == "Control"]
Treatment = df1$Type_I_Error_Rate[df1$Variant == "Treatment"]

cohens_d(Treatment, Control) # compute effect size of difference between Treatment & Control
## Cohen's d |         95% CI
## --------------------------
## -1.96     | [-2.80, -1.10]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(-1.96)
## [1] "large"
## (Rules: cohen1988)

Treatment (new model) significantly reduced (p < 5.76e-06, d = -1.96) the error type I ratio compared to Control (existing model) by 44.9%.

Type_II_Error_Rate

Control = df1$Type_II_Error_Rate[df1$Variant == "Control"]
Treatment = df1$Type_II_Error_Rate[df1$Variant == "Treatment"]

cohens_d(Treatment, Control) # compute effect size of difference between Treatment & Control
## Cohen's d |         95% CI
## --------------------------
## -1.67     | [-2.48, -0.84]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(-1.67)
## [1] "large"
## (Rules: cohen1988)

Treatment (new model) significantly reduced (p < 6.12e-05, d = -1.67) the error type II ratio compared to Control (existing model) by 31.9%.

Conclusions for Error Rate Type I and II

  • Treatment (new model) significantly reduced (p < 5.76e-06, d = -1.96) the error type I ratio compared to Control (existing model) by 44.9%. The reduction in Type I error with the new model will retain more good loans (not default), increasing the company’s revenue.

  • Treatment (new model) significantly reduced (p < 6.12e-05, d = -1.67) the error type II ratio compared to Control (existing model) by 31.9%. The reduction in Type II error with the new model will help the company minimize financial losses.

  • The positive impact of the new model in reducing Error Types I and II has both statistical and practical significance, taking into account the p-value, Cohen’s d measure, and the percentage change.Therefore, the company should adopt the new model.

Metric no.2 : Ratio of Loan officer’s agreements with computer predictions after seeing computer predictions

# Aggregate data by loanofficer_id and Variant
df2 <- df %>%
  group_by(loanofficer_id, Variant) %>%
  summarise(
    agree_fin = sum(agree_fin, na.rm = TRUE),
    complt_fin = sum(complt_fin, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(agree_ratio = agree_fin / complt_fin)

Run Welch’s two-sample t-tests to examine if there’s sig. difference between pairs of Variants

# Independent t-test
t_test_agreement_ratio <- t.test(agree_ratio ~ Variant, data = df2, var.equal = FALSE)
print(t_test_agreement_ratio)
## 
##  Welch Two Sample t-test
## 
## data:  agree_ratio by Variant
## t = -3.4329, df = 11.383, p-value = 0.005333
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.20213704 -0.04459309
## sample estimates:
##   mean in group Control mean in group Treatment 
##               0.7491300               0.8724951

A t-test showed a statistically significant difference between the control and treatment groups, t(11.38) = -3.43, p = 0.0053. The 95% confidence interval (-0.2021, -0.0446) confirms the difference between the groups. The mean proportion of decisions agreeing with AI predictions was higher in the treatment group (M = 0.8725) than in the control group (M = 0.7491), suggesting that staff using the new AI model were significantly more likely to align their decisions with AI recommendations compared to those using the old AI model.

Compute & Interpret Effect Size (Cohen’s d)

# Effect size (Cohen’s d)
Control_Agreement_Ratio <- df2$agree_ratio[df2$Variant == "Control"]
Treatment_Agreement_Ratio <- df2$agree_ratio[df2$Variant == "Treatment"]
cohens_d(Control_Agreement_Ratio , Treatment_Agreement_Ratio)
## Cohen's d |         95% CI
## --------------------------
## -1.60     | [-2.40, -0.78]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(-1.60)
## [1] "large"
## (Rules: cohen1988)

The effect size, as measured by Cohen’s d = -1.60 (95% CI: [-2.40, -0.78]), suggests a large difference between the control and treatment groups. This indicates that the new AI model had a substantial impact on the proportion of decisions agreeing with AI predictions.

Conclusion A Welch two-sample t-test showed a significant difference between the control and treatment groups, t(11.38) = -3.43, p = 0.0053. The mean proportion of decisions agreeing with AI predictions was higher in the treatment group (M = 0.8725) than in the control group (M = 0.7491). Cohen’s d = -1.60 (95% CI: [-2.40, -0.78]) suggests a large effect size, indicating a substantial impact of the new AI model on decision-making.

Metric no.3 : Loan Officer Confidence

Loan officer confidence

# Hypothesis: Higher confidence_fin_total in Treatment suggests improved model effectiveness

Run Welch’s two-sample t-tests to examine if there’s sig. difference between pairs of Variants

# Independent t-test
t_test_confidence <- t.test(confidence_fin_total ~ Variant, data = df, var.equal = FALSE)
print(t_test_confidence)
## 
##  Welch Two Sample t-test
## 
## data:  confidence_fin_total by Variant
## t = -6.0989, df = 222.19, p-value = 4.687e-09
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -179.29481  -91.72233
## sample estimates:
##   mean in group Control mean in group Treatment 
##                595.1200                730.6286

The p-value is extremely small (p < 0.001), meaning we reject the null hypothesis. There is a statistically significant difference in final confidence between the Control and Treatment groups. Loan officers in the Treatment group have significantly higher confidence in their final decisions compared to the Control group.

Compute & Interpret Effect Size (Cohen’s d)

# Effect size (Cohen’s d)
Control_Confidence <- df$confidence_fin_total[df$Variant == "Control"]
Treatment_Confidence <- df$confidence_fin_total[df$Variant == "Treatment"]
cohens_d(Control_Confidence ,Treatment_Confidence)
## Cohen's d |         95% CI
## --------------------------
## -0.63     | [-0.86, -0.40]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(-0.63)
## [1] "medium"
## (Rules: cohen1988)

Cohen’s d = -0.63 95% Confidence Interval: [-0.86, -0.40] Interpretation: Moderate-to-large effect size This suggests a meaningful practical difference—the AI model enhances confidence levels among loan officers.

Conclusion The p-value is extremely small (p < 0.001), meaning we reject the null hypothesis. There is a statistically significant difference in final confidence between the Control and Treatment groups. Loan officers in the Treatment group have significantly higher confidence in their final decisions compared to the Control group.

Metric no.4 : Revised per AI

# Aggregate data at the loan officer level

df4 <- df %>%
  group_by(loanofficer_id, Variant) %>%
  summarise(revised_ratio = sum(revised_per_ai) / sum(complt_fin), .groups = "drop")

Run Welch’s two-sample t-tests to examine if there’s sig. difference between pairs of Variants

# Perform t-test on the aggregated data
t_test_revised_ratio <- t.test(
  revised_ratio ~ Variant, 
  data = df4,
  var.equal = FALSE 
)

print(t_test_revised_ratio)
## 
##  Welch Two Sample t-test
## 
## data:  revised_ratio by Variant
## t = -2.9706, df = 34.614, p-value = 0.005371
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.10901678 -0.02048277
## sample estimates:
##   mean in group Control mean in group Treatment 
##              0.05545891              0.12020869

A t-test showed a statistically significant difference between the control and treatment groups, t(34.61) = -2.97, p = 0.0054. The 95% confidence interval (-0.1090, -0.0205) confirms the difference between the groups. The mean revised_ratio was higher in the treatment group (M = 0.1202) than in the control group (M = 0.0555), suggesting that staff using the new AI model were significantly more likely to revise decisions following AI predictions compared to those using the old AI model.

Compute & Interpret Effect Size (Cohen’s d)

# Compute Cohen’s d for revised_ratio
Control_Revised_Ratio <- df4$revised_ratio[df4$Variant == "Control"]
Treatment_Revised_Ratio <- df4$revised_ratio[df4$Variant == "Treatment"]
cohens_d(Control_Revised_Ratio,Treatment_Revised_Ratio)
## Cohen's d |         95% CI
## --------------------------
## -0.77     | [-1.51, -0.02]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(-0.77)
## [1] "medium"
## (Rules: cohen1988)

The effect size, as measured by Cohen’s d = -0.77 (95% CI: [-1.51, -0.02]), suggests a moderate difference between the control and treatment groups. This indicates that the new AI model had a meaningful impact on the proportion of decisions revised following AI predictions.

Conclusion : A Welch t-test revealed a significant difference between the control and treatment groups, t(34.61) = -2.97, p = 0.00537. The treatment group had a higher mean proportion of decisions revised (M = 0.1202) compared to the control group (M = 0.0555). Cohen’s d = -0.77 (95% CI: [-1.51, -0.02]) suggests a moderate effect of the new AI model.

UNFILTERED DATA

OEC no.1 : Type II Error rate

Type II Error Rate = False Negatives / Total Actual Positives = typeII_fin / badloans_num - Hypothesis: The new model will reduce the Type II Error Rate – single tailed t-test

df5<-LoanData

# Calculate Type I and Type II error rates aggregating the information by loan officer ID 

df5 <- df5 %>% group_by(Variant,loanofficer_id) %>%
  summarise(
    Type_II_Error_Rate = sum(typeII_fin) / sum(badloans_num)  # False Negatives / Total Actual Positives
  )
## `summarise()` has grouped output by 'Variant'. You can override using the
## `.groups` argument.
df5 <- df5 %>%
  mutate(
    Type_II_Error_Rate = ifelse(is.na(Type_II_Error_Rate),0,Type_II_Error_Rate)  # NA with 0
  )

Run Welch’s two-sample t-tests to examine if there’s sig. difference between pairs of Variants

Type_II_Error_Rate

t.test(
  Type_II_Error_Rate ~ Variant, 
  data = df5,
  var.equal = FALSE) # assuming samples have unequal variances (using Welch t-test)
## 
##  Welch Two Sample t-test
## 
## data:  Type_II_Error_Rate by Variant
## t = -1.1967, df = 19.84, p-value = 0.2455
## alternative hypothesis: true difference in means between group Control and group Treatment is not equal to 0
## 95 percent confidence interval:
##  -0.16694635  0.04526716
## sample estimates:
##   mean in group Control mean in group Treatment 
##               0.2070175               0.2678571

The type II error in control is less than in treatment.

The analysis of the Ratio of Error Type II indicates that the new model is not performing better than the existing model.

Compute & Interpret Effect Size (Cohen’s d)

Effect size: Control vs Treatment

Type_II_Error_Rate

Control = df5$Type_II_Error_Rate[df5$Variant == "Control"]
Treatment = df5$Type_II_Error_Rate[df5$Variant == "Treatment"]

cohens_d(Treatment, Control) # compute effect size of difference between Treatment & Control
## Cohen's d |        95% CI
## -------------------------
## 0.42      | [-0.17, 1.01]
## 
## - Estimated using pooled SD.
effectsize::interpret_cohens_d(0.42)
## [1] "small"
## (Rules: cohen1988)

Treatment (new model) does not significantly reduced (p < 0.163, d = 0.42) the error type II ratio compared to Control (existing model)

Estimate the appropriate sample size

With 2 Variants (2 Samples/Groups)

#Estimate sample size
pwr.t.test(power = .8, # 80% power
           d = 0.2, # Cohen's d 
           sig.level = 0.05, # threshold for p-val
           type = "two.sample") # eg., this is for treatment vs control
## 
##      Two-sample t test power calculation 
## 
##               n = 393.4057
##               d = 0.2
##       sig.level = 0.05
##           power = 0.8
##     alternative = two.sided
## 
## NOTE: n is number in *each* group