Part I: Trial Dataset

The Trial.csv file is a dataset with three nominal variables. It describes patients with a particular condition who were followed over two years. Each patient either successfully treated their condition or did not. The description of the columns of the dataset follows:

Column 1: Success: Whether the patient successfully treated their condition - Yes, No

Column 2: Group: What group the patient was in - A, B

Column 3: Year: What year of treatment that patient was in - One, Two

For this data set, our goal was to test if there was a dependence between (X) Group and (Y) Success, with and without the confounding variable (Z) Year.

Summary

# Load the trial data and build 2x2 contingency tables: Group x Success,
# Year x Success, and Group x Success within each year (for the
# stratified analysis later on).
Trial = read.csv("Trial.csv", header = TRUE)

Trial_table = table(Trial$Group, Trial$Success)
Trial_table
##    
##      No Yes
##   A  53 131
##   B  55 128
Trial_table_by_yr = table(Trial$Year, Trial$Success)
Trial_table_by_yr
##      
##        No Yes
##   One  53 169
##   Two  55  90
# NOTE(review): "spilt_data" is a typo for "split_data"; it is used
# consistently, so the code still runs as written.
spilt_data = split(Trial, Trial$Year)
Trial_sub_year1 = table(spilt_data$One$Group, spilt_data$One$Success)
Trial_sub_year2 = table(spilt_data$Two$Group, spilt_data$Two$Success)

Analysis

We used the Wald Confidence Interval to calculate the success overall because it gives a range of plausible values for the proportion of success. We chose our alpha to be 0.05. We could have also done a hypothesis test and calculated out a test-statistic and a p-value, but we would have drawn the same conclusion either way.

# Estimate the probability of a successful treatment overall.
# The 2x2 table is indexed column-major: [1] = A/No, [2] = B/No,
# [3] = A/Yes, [4] = B/Yes, so [3] + [4] is the total number of successes.
success_overall = (Trial_table[3] + Trial_table[4])/sum(Trial_table)
z_95 = 1.96  # z critical value for a 95% Wald interval
# Estimated standard error of the sample proportion: sqrt(p(1-p)/n).
est_stdev = sqrt((success_overall * (1 - success_overall))/sum(Trial_table))
success_overall - (z_95 * est_stdev) # 0.6590971
## [1] 0.6590971
success_overall + (z_95 * est_stdev) # 0.7523471
## [1] 0.7523471

We are 95% confident the probability of success overall is between 0.659 and 0.752 or between 65.9% and 75.2%.

We used the Wald Confidence Interval for difference in proportions to calculate the probability of a successful treatment, comparing only Groups because it gives a range of plausible values for the difference of two proportions.

# Estimate the probability of a successful treatment, comparing only groups.
# Column-major indexing: [1] = A/No, [2] = B/No, [3] = A/Yes, [4] = B/Yes.
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536
n_A = Trial_table[1] + Trial_table[3]  # total patients in group A
n_B = Trial_table[2] + Trial_table[4]  # total patients in group B
# Standard error of the difference of two independent proportions.
est_stdev_group = sqrt( ((group_A_success * (1 - group_A_success))/n_A) + ((group_B_success * (1 - group_B_success))/n_B) )

# 95% Wald interval for the difference in success proportions (A - B).
(group_A_success - group_B_success) - (z_95 * est_stdev_group) # -0.08074173
## [1] -0.08074173
(group_A_success - group_B_success) + (z_95 * est_stdev_group) # 0.1057477
## [1] 0.1057477

We are 95% confident the difference in proportions between Groups is between -0.0807 and 0.106.

We used the Wald Confidence Interval for difference in proportions to calculate the probability of a successful treatment, comparing only years because it gives a range of plausible values for the difference of two proportions.

# Estimate the probability of a successful treatment, comparing only years.
# Re-split the data by Year (duplicates the earlier "spilt_data" split).
trial_split = split(Trial, Trial$Year)
sub_table1 = table(trial_split$One$Group, trial_split$One$Success)
sub_table2 = table(trial_split$Two$Group, trial_split$Two$Success)

# Overall success proportion within each year ([3] + [4] are the Yes cells).
by_year1 = (sub_table1[3]+sub_table1[4])/sum(sub_table1) #.76126
by_year2 = (sub_table2[3]+sub_table2[4])/sum(sub_table2) #.6206897

n_y1 = sum(sub_table1)  # number of patients observed in year One
n_y2 = sum(sub_table2)  # number of patients observed in year Two

# Standard error of the difference of two independent proportions.
est_stdev_year = sqrt( ((by_year1 * (1 - by_year1))/n_y1) + ((by_year2 * (1 - by_year2))/n_y2) )

# 95% Wald interval for the difference in success proportions (One - Two).
(by_year1 - by_year2) - (z_95 * est_stdev_year) # 0.04370828
## [1] 0.04370828
(by_year1 - by_year2) + (z_95 * est_stdev_year) # 0.2374349
## [1] 0.2374349

We are 95% confident the difference in proportions between Years is between 0.0437 and 0.237.

To assess the relationship between Group and Success, with and without information from Years, we could use odds ratio confidence interval, relative ratio confidence interval or difference in proportion confidence interval. Any of the three options would work because, although they would not give us the same confidence interval, we will still draw the same conclusion.

# Assess the relationship between Group and Success, with and without information from year.
# without year
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536

# Odds of success within each group: p / (1 - p).
odd_groupA = group_A_success/(1 - group_A_success)
odd_groupB = group_B_success/(1 - group_B_success)

# Sample odds ratio (A relative to B); the standard error of its log is
# sqrt(1/n11 + 1/n21 + 1/n12 + 1/n22).
theta_group = odd_groupA/odd_groupB
# NOTE(review): the four cell counts are hard-coded here; using
# Trial_table[1:4] would keep the interval tied to the data.
est_stdev_theta1 = sqrt( (1/53) + (1/55) + (1/131) + (1/128) )

# Build the 95% CI on the log-odds-ratio scale, then back-transform.
lower_theta = log(theta_group) - (z_95 * est_stdev_theta1)
upper_theta = log(theta_group) + (z_95 * est_stdev_theta1)

exp(lower_theta) # 0.6778249
## [1] 0.6778249
exp(upper_theta) # 1.664097
## [1] 1.664097

We are 95% confident the odds ratio interval without years is between 0.677 and 1.66.

# with year: odds ratio for Group vs Success within Year One only.
sub_table1 = table(trial_split$One$Group, trial_split$One$Success)
sub_table2 = table(trial_split$Two$Group, trial_split$Two$Success)

# Success proportions for groups A and B in year One
# (column-major: [1] = A/No, [2] = B/No, [3] = A/Yes, [4] = B/Yes).
piA_year1 = sub_table1[3]/(sub_table1[1] + sub_table1[3])
piB_year1 = sub_table1[4]/(sub_table1[2] + sub_table1[4])

# Odds of success for each group in year One.
omegaA_year1 = (piA_year1)/(1 - piA_year1)
omegaB_year1 = piB_year1/(1 - piB_year1)

odd_year1 = omegaA_year1/omegaB_year1  # sample odds ratio, year One

# SE of the log odds ratio: sqrt of the summed reciprocal cell counts.
est_stdev_year1 = sqrt((1/sub_table1[1]) + (1/sub_table1[2]) + (1/sub_table1[3]) + (1/sub_table1[4]))

log_theta_lower_year1 = log(odd_year1) - (z_95 * est_stdev_year1) # -0.5439964
log_theta_upper_year1 = log(odd_year1) + (z_95 * est_stdev_year1) # 0.6904904

# Back-transform the CI endpoints to the odds-ratio scale.
exp(log_theta_lower_year1) # 0.580424
## [1] 0.580424
exp(log_theta_upper_year1) # 1.994694
## [1] 1.994694

We are 95% confident the odds ratio interval with Years for Year One is between 0.580 and 1.99.

# with year: odds ratio for Group vs Success within Year Two only.
# Same construction as the Year One interval, using sub_table2.
piA_year2 = sub_table2[3]/(sub_table2[1] + sub_table2[3])
piB_year2 = sub_table2[4]/(sub_table2[2] + sub_table2[4])

# Odds of success for each group in year Two.
omegaA_year2 = (piA_year2)/(1 - piA_year2)
omegaB_year2 = piB_year2/(1 - piB_year2)

odd_year2 = omegaA_year2/omegaB_year2  # sample odds ratio, year Two

# SE of the log odds ratio: sqrt of the summed reciprocal cell counts.
est_stdev_year2 = sqrt((1/sub_table2[1]) + (1/sub_table2[2]) + (1/sub_table2[3]) + (1/sub_table2[4]))

log_theta_lower_year2 = log(odd_year2) - (z_95 * est_stdev_year2) # -0.6346162
log_theta_upper_year2 = log(odd_year2) + (z_95 * est_stdev_year2) # 0.7073514

# Back-transform the CI endpoints to the odds-ratio scale.
exp(log_theta_lower_year2) # 0.5301389
## [1] 0.5301389
exp(log_theta_upper_year2) # 2.028611
## [1] 2.028611

We are 95% confident the odds ratio interval with Years for Year Two is between 0.530 and 2.03.

Interpretation

The probability of successful treatment of condition overall is between 65.9% and 75.2%.

The confidence interval for the difference in proportions of Success by Group does include 0, so Success appears to be independent of Group. This means the two groups likely have similar probabilities of successful treatment.

The confidence interval for the difference in proportions of Success by Year does not include 0, so Success appears to be dependent on Year. This means the two years likely have different probabilities of successful treatment.

The confidence interval for the odds ratio without Years does include 1, which means that Success is independent of Group. This means the two groups likely have similar risks of successful treatment. The confidence intervals for the odds ratio within Year One and within Year Two also include 1, which means that Success is independent of Group within each year as well.

Conclusion

Both the confidence intervals for the Trial data, with and without the confounding variable Year, give us the same conclusion that Group and Success are independent; therefore, the Trial data is not an example of Simpson’s Paradox, because when we remove the confounding variable from the data, we still draw the same conclusion.

Part II: Students Dataset

Introduction

Our data is based on a poll that asks undergraduate students whether they believe they will get a job once they graduate college. The goal of this analysis is to determine college students’ optimism about entering the work force after graduation. Specifically, we want to see if the answer to the question is dependent on what year in school the student is in. The description of the columns of the dataset follows:

Column 1: year: What year in school the student was in - Freshman, Sophomore, Junior, Senior.

Column 2: job: If they believed they would have a job on graduation - Yes, No

Summary

The data is summarized in the bar plot and table below:

# Read the student poll data and tabulate answers (rows) by year (columns).
students = read.csv("students.csv")

students_table = table(students$job, students$year)
students_table = students_table[c("Yes", "No"), ] #switching rows so Yes is first 

# Grouped bar plot of the counts. With beside = TRUE, bars within each
# year are drawn in row order, so violet = "Yes" (row 1) and
# turquoise2 = "No" (row 2).
barplot(students_table, beside = T, col = c("violet", "turquoise2"), ylim = c(0,100),
        main = "Number Yes or No Answers by Year in School", xlab = "Year in School")
# Fix: the legend labels were reversed relative to the bar colors —
# violet paints the "Yes" row and turquoise2 the "No" row.
legend("topleft", c("Yes", "No"), pch = 15, col = c("violet", "turquoise2"), bty = "n")

# Overall proportions of "Yes" (row 1) and "No" (row 2) answers.
yes_prop = sum(students_table[1,])/sum(students_table)
no_prop = sum(students_table[2,])/sum(students_table)
yes_prop
## [1] 0.5147059
no_prop
## [1] 0.4852941

The total proportion of students who answered “yes” is 0.5147 and the proportion of students who answered “no” is 0.4853. This means that just over half of the students polled believe they will get a job after graduating, regardless of what grade they are in.

Analysis

To test for dependence, a hypothesis test was administered with:

\(H_{0}\): all probabilities are equal (independent) vs. \(H_{1}\): at least one probability is not equal to the others (dependent)

Our data followed a multinomial distribution, so we decided to use the Pearson’s Chi Squared Test to calculate our test statistic and p-value. Our results are as follows:

# Pearson's chi-squared test of independence between answer and year;
# correct = FALSE disables Yates' continuity correction.
chisq.test(students_table,correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  students_table
## X-squared = 48.781, df = 3, p-value = 1.452e-10
yes_sum = sum(students_table[1,])  # total "Yes" answers across all years
no_sum = sum(students_table[2,])   # total "No" answers (not used below)

Next, we calculated six pairwise 95% difference of proportion confidence intervals comparing each year in school. We chose this tool to test for dependence because it allowed us to see the direction of the dependency. To correct for the fact that we ran six separate confidence intervals, we changed our alpha from 0.05 to 0.05/6. The results are reported below:

Freshman vs Sophomore:

# Load PropCIs for diffscoreci(x1, n1, x2, n2): a score confidence
# interval for the difference of two proportions x1/n1 - x2/n2.
library(PropCIs)
## Warning: package 'PropCIs' was built under R version 3.2.5
# conf.level = 0.99167 is the Bonferroni adjustment 1 - 0.05/6 for the
# six pairwise comparisons.
# NOTE(review): both sample sizes passed here are yes_sum (the total
# number of "Yes" answers), not the per-class respondent counts, so this
# compares each class's share of all "Yes" answers — verify intended.
# NOTE(review): table() orders columns by factor level, alphabetical by
# default (Freshman, Junior, Senior, Sophomore), so students_table[3]
# would be the Junior "Yes" cell rather than Sophomore — confirm.
diffscoreci(students_table[1], yes_sum, students_table[3], yes_sum, conf.level = 0.99167)
## 
## 
## 
## data:  
## 
## 99.167 percent confidence interval:
##  -0.03773551  0.16531412

Freshmen vs Juniors

# Score CI for the comparison labeled Freshmen vs Juniors, at the
# Bonferroni-adjusted level 1 - 0.05/6.
# NOTE(review): denominators are yes_sum rather than class sizes, and the
# indices assume a particular column order of students_table — verify.
diffscoreci(students_table[1], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
## 
## 
## 
## data:  
## 
## 99.167 percent confidence interval:
##  0.1241998 0.3038225

Freshmen vs Seniors

# Score CI for the comparison labeled Freshmen vs Seniors, at the
# Bonferroni-adjusted level 1 - 0.05/6.
# NOTE(review): denominators are yes_sum rather than class sizes, and the
# indices assume a particular column order of students_table — verify.
diffscoreci(students_table[1], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
## 
## 
## 
## data:  
## 
## 99.167 percent confidence interval:
##  -0.06757184  0.13839655

Sophomore vs Juniors

# Score CI for the comparison labeled Sophomores vs Juniors, at the
# Bonferroni-adjusted level 1 - 0.05/6.
# NOTE(review): denominators are yes_sum rather than class sizes, and the
# indices assume a particular column order of students_table — verify.
diffscoreci(students_table[3], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
## 
## 
## 
## data:  
## 
## 99.167 percent confidence interval:
##  0.06391829 0.23682063

Sophomore vs Seniors

# Score CI for the comparison labeled Sophomores vs Seniors, at the
# Bonferroni-adjusted level 1 - 0.05/6.
# NOTE(review): denominators are yes_sum rather than class sizes, and the
# indices assume a particular column order of students_table — verify.
diffscoreci(students_table[3], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
## 
## 
## 
## data:  
## 
## 99.167 percent confidence interval:
##  -0.12834461  0.07158544

Juniors vs Seniors

# Score CI for the comparison labeled Juniors vs Seniors, at the
# Bonferroni-adjusted level 1 - 0.05/6. A strictly negative interval
# means the first proportion is smaller than the second.
# NOTE(review): denominators are yes_sum rather than class sizes, and the
# indices assume a particular column order of students_table — verify.
diffscoreci(students_table[5], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
## 
## 
## 
## data:  
## 
## 99.167 percent confidence interval:
##  -0.26678311 -0.09059263

Interpretation

As we can see from above, the p-value is very small, so at any level of alpha that we could choose (0.1, 0.05, or 0.01) we would reject \(H_{0}\) and conclude that at least one probability is not equal to the others or, in other words, they are dependent.

After rejecting \(H_{0}\), our next step was to answer the question “how is the data dependent?” To answer that, we calculated pairwise difference of proportion confidence intervals for each year in school. Based on the intervals above, we are 95% confident that Freshmen are as likely to answer “yes” as Sophomores and Seniors, because those confidence intervals cover zero. On the other hand, we are 95% confident that Freshmen are more likely to say “yes” than Juniors, because that confidence interval is strictly positive. Additionally, we are 95% confident that Sophomores are more likely than Juniors to say “yes” (that interval is strictly positive) and that Seniors are more likely than Juniors (the Junior-vs-Senior interval is strictly negative). Lastly, we are 95% confident that Sophomores and Seniors are equally likely to say “yes” because their interval covers 0.

Conclusion

The answer to the question “are you going to get a job after college” is dependent on what year in school the student is in. Every other class is more likely to say “yes” than Juniors. Excluding Juniors, all classes are equally likely to say “yes”.

Code Appendix

# Code Appendix (Part I): data import, contingency tables, and Wald
# confidence intervals for the trial data (mirrors the chunks above).
Trial = read.csv("Trial.csv", header = TRUE)

Trial_table = table(Trial$Group, Trial$Success)
Trial_table

Trial_table_by_yr = table(Trial$Year, Trial$Success)
Trial_table_by_yr

# "spilt_data" is a typo for "split_data" but is used consistently.
spilt_data = split(Trial, Trial$Year)
Trial_sub_year1 = table(spilt_data$One$Group, spilt_data$One$Success)
Trial_sub_year2 = table(spilt_data$Two$Group, spilt_data$Two$Success)
# Estimate the probability of a successful treatment overall.
# Column-major indexing: [1] = A/No, [2] = B/No, [3] = A/Yes, [4] = B/Yes.
success_overall = (Trial_table[3] + Trial_table[4])/sum(Trial_table)
z_95 = 1.96  # z critical value for a 95% Wald interval
est_stdev = sqrt((success_overall * (1 - success_overall))/sum(Trial_table))
success_overall - (z_95 * est_stdev) # 0.6590971
success_overall + (z_95 * est_stdev) # 0.7523471
# Estimate the probability of a successful treatment, comparing only groups.
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536
n_A = Trial_table[1] + Trial_table[3]
n_B = Trial_table[2] + Trial_table[4]
est_stdev_group = sqrt( ((group_A_success * (1 - group_A_success))/n_A) + ((group_B_success * (1 - group_B_success))/n_B) )

# 95% Wald interval for the difference in success proportions (A - B).
(group_A_success - group_B_success) - (z_95 * est_stdev_group) # -0.08074173
(group_A_success - group_B_success) + (z_95 * est_stdev_group) # 0.1057477
# Estimate the probability of a successful treatment, comparing only years.
trial_split = split(Trial, Trial$Year)
sub_table1 = table(trial_split$One$Group, trial_split$One$Success)
sub_table2 = table(trial_split$Two$Group, trial_split$Two$Success)

by_year1 = (sub_table1[3]+sub_table1[4])/sum(sub_table1) #.76126
by_year2 = (sub_table2[3]+sub_table2[4])/sum(sub_table2) #.6206897

n_y1 = sum(sub_table1)
n_y2 = sum(sub_table2)

est_stdev_year = sqrt( ((by_year1 * (1 - by_year1))/n_y1) + ((by_year2 * (1 - by_year2))/n_y2) )

# 95% Wald interval for the difference in success proportions (One - Two).
(by_year1 - by_year2) - (z_95 * est_stdev_year) # 0.04370828
(by_year1 - by_year2) + (z_95 * est_stdev_year) # 0.2374349
# Assess the relationship between Group and Success, with and without information from year.
# without year: marginal odds ratio from the pooled 2x2 table.
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536

# Odds of success within each group: p / (1 - p).
odd_groupA = group_A_success/(1 - group_A_success)
odd_groupB = group_B_success/(1 - group_B_success)

theta_group = odd_groupA/odd_groupB
# NOTE(review): cell counts are hard-coded; Trial_table[1:4] would be safer.
est_stdev_theta1 = sqrt( (1/53) + (1/55) + (1/131) + (1/128) )

# 95% CI on the log-odds-ratio scale, then back-transform with exp().
lower_theta = log(theta_group) - (z_95 * est_stdev_theta1)
upper_theta = log(theta_group) + (z_95 * est_stdev_theta1)

exp(lower_theta) # 0.6778249
exp(upper_theta) # 1.664097
# with year: odds ratio within Year One.
sub_table1 = table(trial_split$One$Group, trial_split$One$Success)
sub_table2 = table(trial_split$Two$Group, trial_split$Two$Success)

piA_year1 = sub_table1[3]/(sub_table1[1] + sub_table1[3])
piB_year1 = sub_table1[4]/(sub_table1[2] + sub_table1[4])

omegaA_year1 = (piA_year1)/(1 - piA_year1)
omegaB_year1 = piB_year1/(1 - piB_year1)

odd_year1 = omegaA_year1/omegaB_year1

# SE of the log odds ratio: sqrt of summed reciprocal cell counts.
est_stdev_year1 = sqrt((1/sub_table1[1]) + (1/sub_table1[2]) + (1/sub_table1[3]) + (1/sub_table1[4]))

log_theta_lower_year1 = log(odd_year1) - (z_95 * est_stdev_year1) # -0.5439964
log_theta_upper_year1 = log(odd_year1) + (z_95 * est_stdev_year1) # 0.6904904

exp(log_theta_lower_year1) # 0.580424
exp(log_theta_upper_year1) # 1.994694
# with year: odds ratio within Year Two.
piA_year2 = sub_table2[3]/(sub_table2[1] + sub_table2[3])
piB_year2 = sub_table2[4]/(sub_table2[2] + sub_table2[4])

omegaA_year2 = (piA_year2)/(1 - piA_year2)
omegaB_year2 = piB_year2/(1 - piB_year2)

odd_year2 = omegaA_year2/omegaB_year2

est_stdev_year2 = sqrt((1/sub_table2[1]) + (1/sub_table2[2]) + (1/sub_table2[3]) + (1/sub_table2[4]))

log_theta_lower_year2 = log(odd_year2) - (z_95 * est_stdev_year2) # -0.6346162
log_theta_upper_year2 = log(odd_year2) + (z_95 * est_stdev_year2) # 0.7073514

exp(log_theta_lower_year2) # 0.5301389
exp(log_theta_upper_year2) # 2.028611
# Code Appendix (Part II): student poll summary, chi-squared test, and
# pairwise score confidence intervals.
students = read.csv("students.csv")

students_table = table(students$job, students$year)
students_table = students_table[c("Yes", "No"), ] #switching rows so Yes is first 

barplot(students_table, beside = T, col = c("violet", "turquoise2"), ylim = c(0,100),
        main = "Number Yes or No Answers by Year in School", xlab = "Year in School")
# Fix: legend labels were reversed relative to the bar colors — violet
# paints the "Yes" row (drawn first), turquoise2 the "No" row.
legend("topleft", c("Yes", "No"), pch = 15, col = c("violet", "turquoise2"), bty = "n")

# Overall proportions of "Yes" and "No" answers.
yes_prop = sum(students_table[1,])/sum(students_table)
no_prop = sum(students_table[2,])/sum(students_table)
yes_prop
no_prop
# Chi-squared test of independence (no continuity correction).
chisq.test(students_table,correct = FALSE)

yes_sum = sum(students_table[1,])
no_sum = sum(students_table[2,])
library(PropCIs)

# Pairwise score CIs at the Bonferroni-adjusted level 1 - 0.05/6.
# NOTE(review): denominators are yes_sum rather than per-class sample
# sizes, and the indices assume a particular column order — verify.
diffscoreci(students_table[1], yes_sum, students_table[3], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[1], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[1], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[3], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[3], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[5], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)