The Trial.csv is a dataset with three nominal variables. It describes patients with a particular condition who were followed over two years. Each patient either successfully treated their condition or did not. The description of the columns of the dataset follows:
Column 1: Success
: Whether the patient successfully treated their condition - Yes, No
Column 2: Group
: What group the patient was in - A, B
Column 3: Year
: What year of treatment that patient was in - One, Two
For this data set, our goal was to test if there was a dependence between (X
) Group and (Y
) Success, with and without the confounding variable (Z
) Year.
# Load the trial data and build 2x2 contingency tables for the analysis.
Trial = read.csv("Trial.csv", header = TRUE)
# Group (rows: A, B) vs. Success (columns: No, Yes).
Trial_table = table(Trial$Group, Trial$Success)
Trial_table
##
## No Yes
## A 53 131
## B 55 128
# Year (rows: One, Two) vs. Success (columns: No, Yes).
Trial_table_by_yr = table(Trial$Year, Trial$Success)
Trial_table_by_yr
##
## No Yes
## One 53 169
## Two 55 90
# NOTE(review): "spilt_data" is presumably a typo for "split_data".
# Partition the rows by Year so Group-vs-Success can be examined per year.
spilt_data = split(Trial, Trial$Year)
Trial_sub_year1 = table(spilt_data$One$Group, spilt_data$One$Success)
Trial_sub_year2 = table(spilt_data$Two$Group, spilt_data$Two$Success)
We used the Wald Confidence Interval to calculate the success overall because it gives a range of plausible values for the proportion of success. We chose our alpha to be 0.05. We could have also done a hypothesis test and calculated out a test-statistic and a p-value, but we would have drawn the same conclusion either way.
# Estimate the probability of a successful treatment overall.
# Trial_table is indexed column-major: [1]=A/No, [2]=B/No, [3]=A/Yes, [4]=B/Yes,
# so [3] + [4] is the total number of successes.
success_overall = (Trial_table[3] + Trial_table[4])/sum(Trial_table)
# Standard-normal quantile for a 95% Wald interval.
z_95 = 1.96
# Wald standard error of a single proportion: sqrt(p(1-p)/n).
est_stdev = sqrt((success_overall * (1 - success_overall))/sum(Trial_table))
success_overall - (z_95 * est_stdev) # 0.6590971
## [1] 0.6590971
success_overall + (z_95 * est_stdev) # 0.7523471
## [1] 0.7523471
We are 95% confident the probability of success overall is between 0.659 and 0.752 or between 65.9% and 75.2%.
We used the Wald Confidence Interval for difference in proportions to calculate the probability of a successful treatment, comparing only Groups because it gives a range of plausible values for the difference of two proportions.
# Estimate the probability of a successful treatment, comparing only groups.
# Column-major 2x2 indexing: [1]=A/No, [2]=B/No, [3]=A/Yes, [4]=B/Yes.
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536
# Group sample sizes (row totals of the table).
n_A = Trial_table[1] + Trial_table[3]
n_B = Trial_table[2] + Trial_table[4]
# Wald SE of a difference of two proportions: sqrt(pA(1-pA)/nA + pB(1-pB)/nB).
est_stdev_group = sqrt( ((group_A_success * (1 - group_A_success))/n_A) + ((group_B_success * (1 - group_B_success))/n_B) )
(group_A_success - group_B_success) - (z_95 * est_stdev_group) # -0.08074173
## [1] -0.08074173
(group_A_success - group_B_success) + (z_95 * est_stdev_group) # 0.1057477
## [1] 0.1057477
We are 95% confident the difference in proportions between Groups is between -0.0807 and 0.106.
We used the Wald Confidence Interval for difference in proportions to calculate the probability of a successful treatment, comparing only years because it gives a range of plausible values for the difference of two proportions.
# Estimate the probability of a successful treatment, comparing only years.
# Split by Year and build a Group-vs-Success table for each year.
trial_split = split(Trial, Trial$Year)
sub_table1 = table(trial_split$One$Group, trial_split$One$Success)
sub_table2 = table(trial_split$Two$Group, trial_split$Two$Success)
# Success proportion within each year (cells 3 and 4 hold the "Yes" counts).
by_year1 = (sub_table1[3]+sub_table1[4])/sum(sub_table1) #.76126
by_year2 = (sub_table2[3]+sub_table2[4])/sum(sub_table2) #.6206897
# Per-year sample sizes.
n_y1 = sum(sub_table1)
n_y2 = sum(sub_table2)
# Wald SE of the difference of the two yearly proportions.
est_stdev_year = sqrt( ((by_year1 * (1 - by_year1))/n_y1) + ((by_year2 * (1 - by_year2))/n_y2) )
(by_year1 - by_year2) - (z_95 * est_stdev_year) # 0.04370828
## [1] 0.04370828
(by_year1 - by_year2) + (z_95 * est_stdev_year) # 0.2374349
## [1] 0.2374349
We are 95% confident the difference in proportions between Years is between 0.0437 and 0.237.
To assess the relationship between Group and Success, with and without information from Years, we could use odds ratio confidence interval, relative ratio confidence interval or difference in proportion confidence interval. Any of the three options would work because, although they would not give us the same confidence interval, we will still draw the same conclusion.
# Assess the relationship between Group and Success, with and without information from year.
# without year: 95% Wald CI for the Group-vs-Success odds ratio.
# Column-major 2x2 indexing: [1]=A/No, [2]=B/No, [3]=A/Yes, [4]=B/Yes.
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536
# Odds of success in each group, and the odds ratio.
odd_groupA = group_A_success/(1 - group_A_success)
odd_groupB = group_B_success/(1 - group_B_success)
theta_group = odd_groupA/odd_groupB
# SE of the log odds ratio: sqrt of the summed reciprocal cell counts.
# (Was hard-coded as 1/53 + 1/55 + 1/131 + 1/128; read the counts from the
# table instead so the code tracks the data.)
est_stdev_theta1 = sqrt( (1/Trial_table[1]) + (1/Trial_table[2]) + (1/Trial_table[3]) + (1/Trial_table[4]) )
lower_theta = log(theta_group) - (z_95 * est_stdev_theta1)
upper_theta = log(theta_group) + (z_95 * est_stdev_theta1)
# Back-transform the interval to the odds-ratio scale.
exp(lower_theta) # 0.6778249
## [1] 0.6778249
exp(upper_theta) # 1.664097
## [1] 1.664097
We are 95% confident the odds ratio interval without years is between 0.677 and 1.66.
# with year
# Odds-ratio CI for Group vs. Success within Year One.
sub_table1 = table(trial_split$One$Group, trial_split$One$Success)
sub_table2 = table(trial_split$Two$Group, trial_split$Two$Success)
# P(success | group) within Year One; cells: [1]=A/No, [2]=B/No, [3]=A/Yes, [4]=B/Yes.
piA_year1 = sub_table1[3]/(sub_table1[1] + sub_table1[3])
piB_year1 = sub_table1[4]/(sub_table1[2] + sub_table1[4])
# Odds of success for each group, and the odds ratio.
omegaA_year1 = (piA_year1)/(1 - piA_year1)
omegaB_year1 = piB_year1/(1 - piB_year1)
odd_year1 = omegaA_year1/omegaB_year1
# SE of the log odds ratio: sqrt of summed reciprocal cell counts.
est_stdev_year1 = sqrt((1/sub_table1[1]) + (1/sub_table1[2]) + (1/sub_table1[3]) + (1/sub_table1[4]))
log_theta_lower_year1 = log(odd_year1) - (z_95 * est_stdev_year1) # -0.5439964
log_theta_upper_year1 = log(odd_year1) + (z_95 * est_stdev_year1) # 0.6904904
# Back-transform to the odds-ratio scale.
exp(log_theta_lower_year1) # 0.580424
## [1] 0.580424
exp(log_theta_upper_year1) # 1.994694
## [1] 1.994694
We are 95% confident the odds ratio interval with Years for Year One is between 0.580 and 1.99.
# with year
# Odds-ratio CI for Group vs. Success within Year Two.
piA_year2 = sub_table2[3]/(sub_table2[1] + sub_table2[3])
piB_year2 = sub_table2[4]/(sub_table2[2] + sub_table2[4])
# Odds of success for each group, and the odds ratio.
omegaA_year2 = (piA_year2)/(1 - piA_year2)
omegaB_year2 = piB_year2/(1 - piB_year2)
odd_year2 = omegaA_year2/omegaB_year2
# SE of the log odds ratio: sqrt of summed reciprocal cell counts.
est_stdev_year2 = sqrt((1/sub_table2[1]) + (1/sub_table2[2]) + (1/sub_table2[3]) + (1/sub_table2[4]))
log_theta_lower_year2 = log(odd_year2) - (z_95 * est_stdev_year2) # -0.6346162
log_theta_upper_year2 = log(odd_year2) + (z_95 * est_stdev_year2) # 0.7073514
# Back-transform to the odds-ratio scale.
exp(log_theta_lower_year2) # 0.5301389
## [1] 0.5301389
exp(log_theta_upper_year2) # 2.028611
## [1] 2.028611
We are 95% confident the odds ratio interval with Years for Year Two is between 0.530 and 2.03.
The probability of successful treatment of condition overall is between 65.9% and 75.2%.
The confidence interval for the difference in proportions of Success by Groups does include 0, so Success is independent of Group. This means the two groups likely have similar probabilities of successful treatment.
The confidence interval for the difference in proportions of Success by Years does not include 0, so Success is dependent on Year. This means the probability of successful treatment likely differs between the two years.
The confidence interval for the odds ratio without Years does include 1, which means Success and Group are independent. The confidence intervals for the odds ratio within Year One and within Year Two also include 1, so Group and Success are independent within each year as well; in every case the two groups likely have similar probabilities of successful treatment.
Both the confidence intervals for the Trial data with or without the confounding variable Years gives us the same conclusions that the Groups are independent; therefore, the Trial data is not an example of the Simpson’s Paradox because when we remove the confounding variable from the data, we still draw the same conclusion.
Our data is based on a poll that asks undergraduate students whether they believe they will get a job once they graduate college. The goal of this analysis is to determine college students’ optimism about entering the work force after graduation. Specifically, we want to see if the answer to the question is dependent on what year in school the student is in. The description of the columns of the dataset follows:
Column 1: year
: What year in school the student was in - Freshman, Sophomore, Junior, Senior.
Column 2: job
: If they believed they would have a job on graduation - Yes, No
The data is summarized in the bar plot and table below:
# Load the student poll data and tabulate answers (rows) by year (columns).
students = read.csv("students.csv")
students_table = table(students$job, students$year)
students_table = students_table[c("Yes", "No"), ] #switching rows so Yes is first
# Row 1 ("Yes") is drawn in violet, row 2 ("No") in turquoise2.
barplot(students_table, beside = TRUE, col = c("violet", "turquoise2"), ylim = c(0,100),
main = "Number Yes or No Answers by Year in School", xlab = "Year in School")
# Fix: legend labels were reversed relative to the bar colors; violet is "Yes".
legend("topleft", c("Yes", "No"), pch = 15, col = c("violet", "turquoise2"), bty = "n")
# Overall proportions of "Yes" and "No" answers across all years.
yes_prop = sum(students_table[1,])/sum(students_table)
no_prop = sum(students_table[2,])/sum(students_table)
yes_prop
## [1] 0.5147059
no_prop
## [1] 0.4852941
The total proportion of students who answered “yes” is 0.5147 and the proportion of students who answered “no” is 0.4853. This means that just over half of the students polled believe they will get a job after graduating, regardless of what grade they are in.
To test for dependence, a hypothesis test was administered with:
\(H_{0}\): all probabilities are equal (independent) vs. \(H_{1}\): at least one probability is not equal to the others (dependent)
Our data followed a multinomial distribution, so we decided to use the Pearson’s Chi Squared Test to calculate our test statistic and p-value. Our results are as follows:
# Pearson chi-squared test of independence between answer and year in school
# (correct = FALSE disables the Yates continuity correction).
chisq.test(students_table,correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: students_table
## X-squared = 48.781, df = 3, p-value = 1.452e-10
# Total "Yes" and "No" answers across all years.
yes_sum = sum(students_table[1,])
no_sum = sum(students_table[2,])
Next, we calculated six pairwise 95% difference of proportion confidence intervals comparing each year in school. We chose this tool to test for dependence because it allowed us to see the direction of the dependency. To correct for the fact that we ran six separate confidence intervals, we changed our alpha from 0.05 to 0.05/6. The results are reported below:
Freshman vs Sophomore:
# PropCIs supplies diffscoreci(), a score CI for a difference of proportions.
library(PropCIs)
## Warning: package 'PropCIs' was built under R version 3.2.5
# NOTE(review): both denominators passed to diffscoreci() are yes_sum, so each
# interval compares two classes' shares of the "Yes" answers rather than
# P(yes | class); per-class totals would be needed for the latter -- confirm
# this is intended.
# NOTE(review): table() orders columns by factor level (alphabetical by
# default: Freshman, Junior, Senior, Sophomore), so students_table[3] is
# presumably the Junior "Yes" count and students_table[7] the Sophomore
# count -- verify the pairwise labels below against the actual level order.
diffscoreci(students_table[1], yes_sum, students_table[3], yes_sum, conf.level = 0.99167)
##
##
##
## data:
##
## 99.167 percent confidence interval:
## -0.03773551 0.16531412
Freshmen vs Juniors
diffscoreci(students_table[1], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
##
##
##
## data:
##
## 99.167 percent confidence interval:
## 0.1241998 0.3038225
Freshmen vs Seniors
diffscoreci(students_table[1], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
##
##
##
## data:
##
## 99.167 percent confidence interval:
## -0.06757184 0.13839655
Sophomore vs Juniors
diffscoreci(students_table[3], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
##
##
##
## data:
##
## 99.167 percent confidence interval:
## 0.06391829 0.23682063
Sophomore vs Seniors
diffscoreci(students_table[3], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
##
##
##
## data:
##
## 99.167 percent confidence interval:
## -0.12834461 0.07158544
Juniors vs Seniors
diffscoreci(students_table[5], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
##
##
##
## data:
##
## 99.167 percent confidence interval:
## -0.26678311 -0.09059263
As we can see from above, the p-value is very small, so at any level of alpha that we could choose (0.1, 0.05, or 0.01) we would reject \(H_{0}\) and conclude that at least one probability is not equal to the others or, in other words, they are dependent.
After rejecting \(H_{0}\), our next step was to answer the question “how was the data dependent”? To answer that, we calculated a pairwise difference of proportion confidence interval for each pair of years in school. Based off of table 3, we are 95% confident that Freshmen are as likely to answer “yes” as Sophomores and Seniors because their confidence intervals covered zero. On the other hand, we are 95% confident that Freshmen are more likely to say “yes” than Juniors because their confidence interval is strictly positive. Additionally, we are 95% confident that Sophomores and Seniors are more likely than Juniors to say “yes” because those confidence intervals exclude zero (strictly positive for Sophomores vs. Juniors and strictly negative for Juniors vs. Seniors). Lastly, we are 95% confident that Sophomores and Seniors are equally likely to say “yes” because their interval covers 0.
The answer to question “are you going to get a job after college” is dependent on what year in school the student is in. Every class is more likely to say “yes” than Juniors. Excluding Juniors, all classes are equally likely to say “yes” or “no”.
# Read the trial data and tabulate Success against Group and against Year.
Trial <- read.csv("Trial.csv", header = TRUE)

# Group (A/B) vs. Success (No/Yes).
Trial_table <- table(Trial$Group, Trial$Success)
Trial_table

# Year (One/Two) vs. Success (No/Yes).
Trial_table_by_yr <- table(Trial$Year, Trial$Success)
Trial_table_by_yr

# Per-year Group-vs-Success tables, used for the stratified analysis.
split_by_year <- split(Trial, Trial$Year)
Trial_sub_year1 <- table(split_by_year$One$Group, split_by_year$One$Success)
Trial_sub_year2 <- table(split_by_year$Two$Group, split_by_year$Two$Success)
# Estimate the probability of a successful treatment overall, with a
# 95% Wald confidence interval for a single proportion.
n_total <- sum(Trial_table)
# Cells 3 and 4 of the column-major 2x2 table are the "Yes" counts.
success_overall <- (Trial_table[3] + Trial_table[4]) / n_total
z_95 <- 1.96  # standard-normal quantile for 95% confidence
est_stdev <- sqrt(success_overall * (1 - success_overall) / n_total)
# Lower and upper Wald bounds (auto-printed when the script is run).
success_overall - z_95 * est_stdev # 0.6590971
success_overall + z_95 * est_stdev # 0.7523471
# Estimate the probability of a successful treatment, comparing only groups:
# 95% Wald interval for the difference of two proportions.
# Column-major cells: [1]=A/No, [2]=B/No, [3]=A/Yes, [4]=B/Yes.
n_A <- Trial_table[1] + Trial_table[3]
n_B <- Trial_table[2] + Trial_table[4]
group_A_success <- Trial_table[3] / n_A # 0.7119565
group_B_success <- Trial_table[4] / n_B # 0.6994536
# SE of (pA - pB): sqrt(pA(1-pA)/nA + pB(1-pB)/nB).
var_A <- group_A_success * (1 - group_A_success) / n_A
var_B <- group_B_success * (1 - group_B_success) / n_B
est_stdev_group <- sqrt(var_A + var_B)
diff_groups <- group_A_success - group_B_success
diff_groups - z_95 * est_stdev_group # -0.08074173
diff_groups + z_95 * est_stdev_group # 0.1057477
# Estimate the probability of a successful treatment, comparing only years:
# 95% Wald interval for the difference of the two yearly proportions.
trial_split <- split(Trial, Trial$Year)
sub_table1 <- table(trial_split$One$Group, trial_split$One$Success)
sub_table2 <- table(trial_split$Two$Group, trial_split$Two$Success)
n_y1 <- sum(sub_table1)
n_y2 <- sum(sub_table2)
# Success proportion within each year (cells 3 and 4 are the "Yes" counts).
by_year1 <- (sub_table1[3] + sub_table1[4]) / n_y1 # .76126
by_year2 <- (sub_table2[3] + sub_table2[4]) / n_y2 # .6206897
est_stdev_year <- sqrt(by_year1 * (1 - by_year1) / n_y1 +
                       by_year2 * (1 - by_year2) / n_y2)
diff_years <- by_year1 - by_year2
diff_years - z_95 * est_stdev_year # 0.04370828
diff_years + z_95 * est_stdev_year # 0.2374349
# Assess the relationship between Group and Success, with and without
# information from year.
# without year: 95% Wald CI for the Group-vs-Success odds ratio.
group_A_success = Trial_table[3] / (Trial_table[1] + Trial_table[3]) # 0.7119565
group_B_success = Trial_table[4] / (Trial_table[2] + Trial_table[4]) # 0.6994536
# Odds of success in each group, and the odds ratio.
odd_groupA = group_A_success/(1 - group_A_success)
odd_groupB = group_B_success/(1 - group_B_success)
theta_group = odd_groupA/odd_groupB
# SE of the log odds ratio: sqrt of the summed reciprocal cell counts.
# (Was hard-coded as 1/53 + 1/55 + 1/131 + 1/128; read the counts from the
# table instead so the code tracks the data.)
est_stdev_theta1 = sqrt( (1/Trial_table[1]) + (1/Trial_table[2]) + (1/Trial_table[3]) + (1/Trial_table[4]) )
lower_theta = log(theta_group) - (z_95 * est_stdev_theta1)
upper_theta = log(theta_group) + (z_95 * est_stdev_theta1)
# Back-transform the interval to the odds-ratio scale.
exp(lower_theta) # 0.6778249
exp(upper_theta) # 1.664097
# with year: 95% Wald odds-ratio CI for Group vs. Success within Year One.
sub_table1 <- table(trial_split$One$Group, trial_split$One$Success)
sub_table2 <- table(trial_split$Two$Group, trial_split$Two$Success)
# P(success | group) within Year One; cells: [1]=A/No, [2]=B/No, [3]=A/Yes, [4]=B/Yes.
piA_year1 <- sub_table1[3] / (sub_table1[1] + sub_table1[3])
piB_year1 <- sub_table1[4] / (sub_table1[2] + sub_table1[4])
# Odds of success for each group, and their ratio.
omegaA_year1 <- piA_year1 / (1 - piA_year1)
omegaB_year1 <- piB_year1 / (1 - piB_year1)
odd_year1 <- omegaA_year1 / omegaB_year1
# SE of the log odds ratio: sqrt of the summed reciprocal cell counts.
est_stdev_year1 <- sqrt(sum(1 / sub_table1))
log_theta_lower_year1 <- log(odd_year1) - z_95 * est_stdev_year1 # -0.5439964
log_theta_upper_year1 <- log(odd_year1) + z_95 * est_stdev_year1 # 0.6904904
# Back-transform to the odds-ratio scale.
exp(log_theta_lower_year1) # 0.580424
exp(log_theta_upper_year1) # 1.994694
# with year: 95% Wald odds-ratio CI for Group vs. Success within Year Two.
piA_year2 <- sub_table2[3] / (sub_table2[1] + sub_table2[3])
piB_year2 <- sub_table2[4] / (sub_table2[2] + sub_table2[4])
# Odds of success for each group, and their ratio.
omegaA_year2 <- piA_year2 / (1 - piA_year2)
omegaB_year2 <- piB_year2 / (1 - piB_year2)
odd_year2 <- omegaA_year2 / omegaB_year2
# SE of the log odds ratio: sqrt of the summed reciprocal cell counts.
est_stdev_year2 <- sqrt(sum(1 / sub_table2))
log_theta_lower_year2 <- log(odd_year2) - z_95 * est_stdev_year2 # -0.6346162
log_theta_upper_year2 <- log(odd_year2) + z_95 * est_stdev_year2 # 0.7073514
# Back-transform to the odds-ratio scale.
exp(log_theta_lower_year2) # 0.5301389
exp(log_theta_upper_year2) # 2.028611
# Load the student poll data and tabulate answers (rows) by year (columns).
students = read.csv("students.csv")
students_table = table(students$job, students$year)
students_table = students_table[c("Yes", "No"), ] #switching rows so Yes is first
# Row 1 ("Yes") is drawn in violet, row 2 ("No") in turquoise2.
barplot(students_table, beside = TRUE, col = c("violet", "turquoise2"), ylim = c(0,100),
main = "Number Yes or No Answers by Year in School", xlab = "Year in School")
# Fix: legend labels were reversed relative to the bar colors; violet is "Yes".
legend("topleft", c("Yes", "No"), pch = 15, col = c("violet", "turquoise2"), bty = "n")
# Overall proportions of "Yes" and "No" answers across all years.
yes_prop = sum(students_table[1,])/sum(students_table)
no_prop = sum(students_table[2,])/sum(students_table)
yes_prop
no_prop
# Pearson chi-squared test of independence between answer and year in school
# (correct = FALSE disables the Yates continuity correction).
chisq.test(students_table,correct = FALSE)
# Total "Yes" and "No" answers across all years.
yes_sum = sum(students_table[1,])
no_sum = sum(students_table[2,])
# PropCIs supplies diffscoreci(), a score CI for a difference of proportions.
library(PropCIs)
# NOTE(review): both denominators passed to diffscoreci() are yes_sum, so each
# interval compares two classes' shares of the "Yes" answers rather than
# P(yes | class); per-class totals would be needed for the latter -- confirm
# this is intended.
# NOTE(review): table() orders columns by factor level (alphabetical by
# default: Freshman, Junior, Senior, Sophomore), so students_table[3] is
# presumably the Junior "Yes" count and students_table[7] the Sophomore
# count -- verify against the pairwise labels used in the report.
# Six pairwise CIs at conf.level 1 - 0.05/6 (Bonferroni-style correction).
diffscoreci(students_table[1], yes_sum, students_table[3], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[1], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[1], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[3], yes_sum, students_table[5], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[3], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)
diffscoreci(students_table[5], yes_sum, students_table[7], yes_sum, conf.level = 0.99167)