STATS 60 Rlab Session 6

Yuchen Wu

2020/7/29

Before class

Quiz 3 has been officially released! Due AUG 05 AT 5:00PM.

Recap of Session 5

Agenda for today

Example 1: is the coin fair?

Example 1: is the coin fair?

Example 1: is the coin fair?

Example 1: is the coin fair?

\[ P(X \geq 70 | p = 0.5) \]

Example 1: is the coin fair?

rbinom(n, size, prob)

Example 1: is the coin fair?

rbinom(n = 1, size = 1, prob = 0.5)
## [1] 0
rbinom(n = 10, size = 1, prob = 0.5)
##  [1] 0 0 1 1 1 1 0 1 0 1
rbinom(n = 10, size = 1, prob = 0.5)
##  [1] 1 0 0 1 1 0 1 0 0 1

Example 1: is the coin fair?

library(dplyr)
library(ggplot2)
# set the random seed so that the sample is identical each time
set.seed(1234567899)

#create a data frame with 1 variable representing the number of heads in 100 coin flips 
nSamples <- 500000
sampleData_df <- 
  data.frame(
    n_heads = rbinom(n = nSamples, size = 100, prob = 0.5)
  )

Example 1: is the coin fair?

#plot the data
sampleData_df %>% 
  ggplot(aes(n_heads)) +
  geom_histogram(bins = 100, binwidth = 0.5, center = 0) +
  geom_vline(aes(xintercept = 70), color = "blue") +
  labs(x = "Number of heads")

Example 1: is the coin fair?

sampleData <- sampleData_df$n_heads
#identify number of values where n_heads >= 70
print(paste("Number of random samples >= 70 heads):", sum(sampleData >= 70)))
## [1] "Number of random samples >= 70 heads): 18"
#identify proportion of values where n_heads >= 70
#an approximate p value
print(paste("Proportion of random samples >= 70 heads):", mean(sampleData >= 70)))
## [1] "Proportion of random samples >= 70 heads): 3.6e-05"

Example 1: is the coin fair?

The function pbinom returns the value of the cumulative density function (cdf) of the binomial distribution given a certain random variable q, number of trials (size) and probability of success on each trial (prob).

# P(X <= q | size, prob)
pbinom(q, size, prob) 
#identify p-value for when n_heads >= 70
print(paste("Binomial distribution: p(X>=70|p=0.5):", 1 - pbinom(69, 100, 0.5)))
## [1] "Binomial distribution: p(X>=70|p=0.5): 3.92506982279661e-05"

two sample t-test

t.test(x, y, alternative = c("two.sided", "less", "greater"))
t.test(formula, data, alternative = c("two.sided", "less", "greater"))

alternative: a character string specifying the alternative hypothesis, must be one of “two.sided” (default), “greater” or “less”. You can specify just the initial letter.

Example 2: BMI example

Let’s test a question about the NHANES dataset, using a sample of 250 adult individuals: Is regular physical activity related to body mass index? \[H_0: \mbox{BMI for active people} \geq \mbox{BMI for inactive people}\] \[H_A: \mbox{BMI for active people} < \mbox{BMI for inactive people}\]

Example 2: BMI example

library(NHANES)
# set the random seed so that the sample is identical each time
set.seed(1234567899)

#create an adults only NHANES data frame where BMI and PhysActive are not missing
NHANES_adult <-
  NHANES %>% 
  filter(Age > 18 & !is.na(BMI) & !is.na(PhysActive)) 

# take a sample from the NHANES dataset
sampSize <- 250
NHANES_sample <- sample_n(NHANES_adult, size = sampSize)

Example 2: BMI example

#plot the data
NHANES_sample %>% 
  ggplot(aes(x = PhysActive, y = BMI)) +
  geom_boxplot() + 
  labs(
    x = "Physically active?",
    y = "Body Mass Index (BMI)"
  )

Example 2: BMI example

#summarize the data 
sampleSummary <- 
  NHANES_sample %>% 
  group_by(PhysActive) %>% 
  summarize(
    n = n(),
    mean = mean(BMI), 
    sd = sd(BMI)
  )

print(sampleSummary)
## # A tibble: 2 x 4
##   PhysActive     n  mean    sd
##   <fct>      <int> <dbl> <dbl>
## 1 No           110  30.5  8.86
## 2 Yes          140  28.4  5.58

Example 2: BMI example

# Compute the test statistics
t.test(BMI~PhysActive,data=NHANES_sample,alternative='greater')
## 
##  Welch Two Sample t-test
## 
## data:  BMI by PhysActive
## t = 2.1285, df = 174.4, p-value = 0.01735
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.4594122       Inf
## sample estimates:
##  mean in group No mean in group Yes 
##          30.47691          28.41771

Example 2: BMI example

# Compute the test statistics
active = NHANES_sample$BMI[NHANES_sample$PhysActive == "Yes"]
inactive = NHANES_sample$BMI[NHANES_sample$PhysActive == "No"]
t.test(inactive, active, alternative='greater')
## 
##  Welch Two Sample t-test
## 
## data:  inactive and active
## t = 2.1285, df = 174.4, p-value = 0.01735
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  0.4594122       Inf
## sample estimates:
## mean of x mean of y 
##  30.47691  28.41771

Example 2: BMI example-randomization

Let’s perform permutation on the NHANES_sample data set to assess the distribution of proportions under the null hypothesis that there is no relation between PhysActive and BMI.

ttest <- t.test(BMI~PhysActive,data=NHANES_sample,alternative='greater')
ttestResult <- ttest$statistic
print(ttestResult)
##        t 
## 2.128516

Example 2: BMI example-randomization

nSamples=2500

bmiData=subset(NHANES_sample,select=c(BMI,PhysActive))

bmiDataShuffled=bmiData

ttestresultSim=array(NA,nSamples)

Example 2: BMI example-randomization

for (i in 1:nSamples){
  #shuffle the labels
  bmiDataShuffled$PhysActive=sample(bmiDataShuffled$PhysActive)
  # compute the test statistic
  simResult=t.test(bmiDataShuffled$BMI[bmiDataShuffled$PhysActive=='No'],
                   bmiDataShuffled$BMI[bmiDataShuffled$PhysActive=='Yes'],alternative = "greater")
  ttestresultSim[i]=simResult$statistic
}
ttestresultSim = data.frame(ttestresultSim = ttestresultSim)

Example 2: BMI example-randomization

ttestresultSim %>%
  ggplot(aes(ttestresultSim)) +
  geom_histogram(bins = 200) +
  geom_vline(aes(xintercept = ttestResult), color = "blue") +
  geom_histogram(
    data = ttestresultSim %>% filter(ttestresultSim >= ttestResult), 
    aes(ttestresultSim), 
    bins = 200, 
    fill = "orange"
  ) +
  labs(x = "T stat: BMI difference between groups")

Example 2: BMI example-randomization

#identify number of values greater or equal to ttestResult
print(paste("Number of random samples >= ttestResult):", sum(ttestresultSim >= ttestResult)))
## [1] "Number of random samples >= ttestResult): 33"
#identify proportion of values greater or equal to ttestResult
#an approximate p value
print(paste("Proportion of random samples >= greater or equal to ttestResult):", mean(ttestresultSim >= ttestResult)))
## [1] "Proportion of random samples >= greater or equal to ttestResult): 0.0132"