Goodness of Fit Tests

Author

Dr. Cohen

Speed Data

\(H_0:\) speed data is normally distributed

\(H_1:\) speed data is not normally distributed

str(cars)
'data.frame':   50 obs. of  2 variables:
 $ speed: num  4 4 7 7 8 9 10 10 10 11 ...
 $ dist : num  2 10 4 22 16 10 18 26 34 17 ...
hist(cars$speed)

mean_speed = mean(cars$speed)
mean_speed
[1] 15.4
sd_speed = sd(cars$speed)
sd_speed
[1] 5.287644
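As a quick visual check before the formal tests, a normal Q-Q plot can be drawn (not part of the original analysis; qqnorm and qqline are base R):

qqnorm(cars$speed) # sample quantiles vs theoretical normal quantiles
qqline(cars$speed) # points close to the line suggest normality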

Shapiro-Wilk Test for Normality

shapiro.test(cars$speed)

    Shapiro-Wilk normality test

data:  cars$speed
W = 0.97765, p-value = 0.4576

Decision: since the p-value (0.4576) is greater than 0.05, we fail to reject \(H_0\). There is not enough evidence to conclude that the speed data departs from a normal distribution with mean = 15.4 and variance = 27.9591837.

Lilliefors Test for Normality

library(nortest)
lillie.test(cars$speed)

    Lilliefors (Kolmogorov-Smirnov) normality test

data:  cars$speed
D = 0.068539, p-value = 0.8068

Decision: since the p-value (0.8068) is greater than 0.05, we again fail to reject \(H_0\). There is not enough evidence to conclude that the speed data departs from a normal distribution with mean = 15.4 and variance = 27.9591837.

Kolmogorov-Smirnov Test

# Test for N(mean_speed,sd_speed)
ks.test(cars$speed,'pnorm',mean_speed,sd_speed)
Warning in ks.test.default(cars$speed, "pnorm", mean_speed, sd_speed): ties
should not be present for the Kolmogorov-Smirnov test

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  cars$speed
D = 0.068539, p-value = 0.9729
alternative hypothesis: two-sided
# Test for N(0,1)
ks.test(cars$speed,'pnorm')
Warning in ks.test.default(cars$speed, "pnorm"): ties should not be present for
the Kolmogorov-Smirnov test

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  cars$speed
D = 0.99997, p-value < 2.2e-16
alternative hypothesis: two-sided
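The warnings appear because ks.test assumes a continuous distribution without ties, while the speeds are recorded as integers. Note also that the first test fails to reject normality (p = 0.9729), while the test against N(0,1) rejects overwhelmingly, since the speeds are nowhere near mean 0 and sd 1. Strictly speaking, plugging in mean_speed and sd_speed estimated from the same data also makes the KS p-value too large; the Lilliefors test above corrects exactly this. One possible workaround for the ties, sketched with base R's jitter() (the seed is our choice, not from the notes):

set.seed(1) # make the jitter reproducible
ks.test(jitter(cars$speed),'pnorm',mean_speed,sd_speed) # ties broken by small noise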

Chi-squared Test

h = hist(cars$speed) # hist of data

Ob = h$counts # observed frequencies in classes


p1 = pnorm(5,mean_speed,sd_speed)# P(X <= 5)
p2 = pnorm(10,mean_speed,sd_speed)-pnorm(5,mean_speed,sd_speed) # P(5<=X <= 10)
p3 = pnorm(15,mean_speed,sd_speed)-pnorm(10,mean_speed,sd_speed) # P(10<=X <= 15)
p4 = pnorm(20,mean_speed,sd_speed)-pnorm(15,mean_speed,sd_speed) # P(15<=X <= 20)
p5 = 1- pnorm(20,mean_speed,sd_speed) # P(X> 20)

Pj = c(p1,p2,p3,p4,p5) # put everything in one array
sum(Pj) # is it = 1?
[1] 1
Ej = Pj*50 # Expected counts
Ej
[1]  1.230015  6.448401 15.814062 16.899365  9.608158
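Note that the first expected count (1.23) is below 5, which is what triggers the warning from chisq.test below. The class probabilities can also be built in one step from the histogram breaks; a compact equivalent, assuming h$breaks is 0, 5, 10, 15, 20, 25 as produced above (Pj_alt is our own name):

br = h$breaks # histogram class boundaries
# replace the outer breaks by -Inf/Inf so the tail classes are open-ended
Pj_alt = diff(pnorm(c(-Inf, br[2:(length(br)-1)], Inf), mean_speed, sd_speed))
all.equal(Pj_alt, Pj) # should be TRUE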
# Chi-squared test for GOF
chisq.test(x=Ob,p = Pj)
Warning in chisq.test(x = Ob, p = Pj): Chi-squared approximation may be
incorrect

    Chi-squared test for given probabilities

data:  Ob
X-squared = 1.3267, df = 4, p-value = 0.8568

The degrees of freedom of the test are \(df = C - 1 - k = 5 - 1 - 2 = 2\), where \(k = 2\) because we estimated the mean and variance from the sample.

Adjust for degrees of freedom:

pvalue <- 1-pchisq(1.3267,df=2)
pvalue
[1] 0.5151228
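This adjustment can be wrapped in a small helper so it is not redone by hand each time; chisq_gof_adj is a hypothetical name of our own, a minimal sketch:

chisq_gof_adj = function(ob, pj, k){
  chi  = chisq.test(x = ob, p = pj)  # usual test, df = C - 1
  stat = unname(chi$statistic)
  df   = length(ob) - 1 - k          # subtract the k estimated parameters
  c(statistic = stat, df = df, p.value = 1 - pchisq(stat, df))
}
chisq_gof_adj(Ob, Pj, k = 2) # should reproduce the adjusted p-value above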

Example with Binomial distribution

Binomial example (p. 244): 18 baseball players, each with 45 at-bats. For each player we record the number of hits.

# hits     <=7   8   9  10  11  12  13  14  15  16  17  >=18
# players    1   1   1   5   2   1   1   2   1   1   1     1

Let \(X\) be the number of hits; then \(X \sim\) binom(n = 45, p = ?), with \(p\) estimated from the data.

Ob=c(1,1,1,5,2,1,1,2,1,1,1,1)

# Estimate p, the probability that a player gets a hit in one at-bat:
# p = total hits / total at-bats (the open classes <=7 and >=18 are scored as 7 and 18)
p = sum((7:18)*Ob) / (18*45) # 215/810, about 0.2654

p1 <- pbinom(7,45,prob = p) # P(X <= 7)
p2 <- dbinom(8:17,45,prob = p) # P(X = 8), ..., P(X = 17)
p12 <- pbinom(17,45,p,lower.tail = F) # P(X >= 18)

Pj <- c(p1,p2,p12)

Ej=Pj*18

# Chi-squared test 
chisq.test(x=Ob,p=Pj)
Warning in chisq.test(x = Ob, p = Pj): Chi-squared approximation may be
incorrect

    Chi-squared test for given probabilities

data:  Ob
X-squared = 6.7313, df = 11, p-value = 0.8204

Since some of the expected counts are less than 5, we combine classes.
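The first four classes (hits <= 10), the next four (11 to 14), and the last four (hits >= 15) are merged. A quick check that the combined counts used below follow from the original table:

Ob_orig = c(1,1,1,5,2,1,1,2,1,1,1,1) # the original 12 classes
c(sum(Ob_orig[1:4]), sum(Ob_orig[5:8]), sum(Ob_orig[9:12])) # 8 6 4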

Ob=c(8,6,4)
p1 <- pbinom(10,45,prob = p) # P(X<=10)
p2 <- sum(dbinom(11:14,45,prob = p)) # P(11<=X<=14)
p3 <- pbinom(14,45,p,lower.tail = F)# P(X>=15)
Pj <- c(p1,p2,p3)
sum(Pj)
[1] 1
Ej=Pj*18
Ej
[1] 5.769286 8.770815 3.459899
# Chi-squared test 
chisq.test(x=Ob,p=Pj)
Warning in chisq.test(x = Ob, p = Pj): Chi-squared approximation may be
incorrect

    Chi-squared test for given probabilities

data:  Ob
X-squared = 1.8222, df = 2, p-value = 0.4021

The degrees of freedom of the test are \(df = C - 1 - k = 3 - 1 - 1 = 1\), where \(k = 1\) because we estimated the hit probability \(p\) from the sample.

Adjust for degrees of freedom:

pvalue <- 1-pchisq(1.8222,df=1)
pvalue
[1] 0.1770516
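The hypothetical chisq_gof_adj helper sketched earlier gives the same result:

chisq_gof_adj(Ob, Pj, k = 1) # df = 3 - 1 - 1 = 1, p-value about 0.177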

Simulation Power Analysis - Example Code

We draw samples from a gamma distribution with shape = 1.5 and scale = 1.5, so the null hypothesis of normality is false, and estimate each test's power: the proportion of simulated samples in which the test (correctly) rejects \(H_0\) at the 5% level.
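To see what the tests are up against, the gamma density can be compared with a normal of matching mean and sd (a sketch, not in the original code; gamma(shape 1.5, scale 1.5) has mean 2.25 and variance 3.375):

curve(dgamma(x, shape = 1.5, scale = 1.5), from = 0, to = 10,
      ylab = 'density') # right-skewed gamma density
curve(dnorm(x, mean = 2.25, sd = sqrt(3.375)), add = TRUE, lty = 2) # matching normal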

library(nortest)
library(tidyverse)


# function that runs the tests 
find_pvalues = function(n){
  
x = rgamma(n,shape = 1.5,scale = 1.5) # gamma data

sw = shapiro.test(x) # shapiro-wilk test
ks = ks.test(x,'pnorm',mean(x),sd(x)) # k-s test
lf = lillie.test(x) # lilliefors test

h=hist(x,nclass = sqrt(n),plot=FALSE)
ob=h$counts
xv=h$breaks
pj = c(pnorm(xv[2],mean(x),sd(x)),
       diff(pnorm(xv[2:(length(xv)-1)],mean(x),sd(x))),
       pnorm(xv[length(xv)-1],mean(x),sd(x),lower.tail = F))
chi = chisq.test(ob,p = pj) # chi-square test
# note: this p-value is not adjusted for the 2 estimated parameters (df should be C - 1 - 2)

# rejection indicators: TRUE when a test rejects H0 at the 5% level
reject = c(SW = sw$p.value < 0.05, 
           KS = ks$p.value < 0.05, 
           LF = lf$p.value < 0.05, 
           Chi = chi$p.value < 0.05)
return(reject)
}
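The df adjustment flagged in the comment above could be applied before thresholding; a hedged helper of our own (k = 2 because the mean and sd are estimated):

# adjusted p-value for a chisq.test result 'chi' with n_classes classes
chi_p_adj = function(chi, n_classes, k = 2)
  1 - pchisq(unname(chi$statistic), df = n_classes - 1 - k)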

# n values : sample sizes
n_values= c(5,10,50,100)
# find p_values for different sample sizes
p_val = map_df(n_values,find_pvalues)

# repeat 1000 times
results = replicate(1000,map(n_values,find_pvalues))

# reshape into a data frame with a sample-size factor n
df.results = bind_rows(results) %>% 
  mutate(n=gl(n = length(n_values),k=1,length = length(results),labels = n_values))

# compute the power
power_tests = df.results %>% 
  group_by(n) %>% 
  summarise_all(mean)
power_tests
# A tibble: 4 × 5
  n        SW    KS    LF   Chi
  <fct> <dbl> <dbl> <dbl> <dbl>
1 5     0.125 0     0.108 0    
2 10    0.31  0.001 0.204 0.069
3 50    0.989 0.166 0.843 0.722
4 100   1     0.557 0.992 0.964
# plot the power
power_tests %>% 
  pivot_longer(cols = -n, names_to = "test", values_to = "power") %>%
  ggplot(aes(x=n,y=power,group=test,col=test)) +
  geom_point()+
  geom_line()

In these simulations the Shapiro-Wilk test has the highest power against the skewed gamma alternative at every sample size, while the Kolmogorov-Smirnov test with estimated parameters has the lowest.