# load all libraries for this tutorial
library(datarium)
library(dplyr)
library(emmeans)

Tutorial นี้จะแสดงวิธีการวิเคราะห์ t-test และ one-way ANOVA ในรูปแบบของสมการถดถอย

t-test

ในตัวอย่าง t-test เราจะใช้ข้อมูล genderweight จาก datarium package

ตัวแปรต่าง ๆ ถูกเข้ารหัสมาเป็น factor เรียบร้อยแล้ว

data(genderweight)
str(genderweight)

## tibble [40 × 3] (S3: tbl_df/tbl/data.frame)
##  $ id    : Factor w/ 40 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ group : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ weight: num [1:40] 61.6 64.6 66.2 59.3 64.9 ...

เราสามารถเขียนคำสั่ง t.test ได้สองแบบ คือแบบ formula หรือแบบเทียบ vector (เช่น เทียบกลุ่มหญิงกับชาย)

g <- t.test(weight ~ group, data = genderweight, var.equal = TRUE) # formula
g

## 
##  Two Sample t-test
## 
## data:  weight by group
## t = -20.791, df = 38, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -24.50140 -20.15349
## sample estimates:
## mean in group F mean in group M 
##        63.49867        85.82612

# use with() to create a data environment so that we do not have to type data frame name every time in the code.
with(genderweight, t.test(weight[group == "F"], weight[group == "M"], var.equal = TRUE)) # compare vector

## 
##  Two Sample t-test
## 
## data:  weight[group == "F"] and weight[group == "M"]
## t = -20.791, df = 38, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -24.50140 -20.15349
## sample estimates:
## mean of x mean of y 
##  63.49867  85.82612

เมื่อวิเคราะห์ด้วย t.test เราจะพบว่าค่าเฉลี่ยกลุ่มผู้หญิง = 63.4986719 และค่าเฉลี่ยกลุ่มผู้ชาย = 85.8261153 ความแตกต่างของค่าเฉลี่ย = 22.3274435

ค่าสถิติทดสอบ t = -20.7913777 พบผลว่าแตกต่างกันอย่างมีนัยสำคัญทางสถิติ

Manual coding

โดยปกติแล้ว หากนำตัวแปรที่เป็น factor ใส่ในสมการเส้นตรง R จะทำการ dummy code ให้อัตโนมัติโดยใช้ level แรกเป็นกลุ่มอ้างอิง

อย่างไรก็ดี เราสามารถเขียน dummy code ได้เอง โดยใช้คำสั่ง recode จาก dplyr package โดยกำหนดให้ผู้หญิงเป็นกลุ่มอ้างอิง

genderweight$dummy_gender <- dplyr::recode(genderweight$group, 'F' = 0, 'M' = 1)
str(genderweight) #notice that the dummy variable was treated as numeric

## tibble [40 × 4] (S3: tbl_df/tbl/data.frame)
##  $ id          : Factor w/ 40 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ group       : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ weight      : num [1:40] 61.6 64.6 66.2 59.3 64.9 ...
##  $ dummy_gender: num [1:40] 0 0 0 0 0 0 0 0 0 0 ...

Regression model

การสร้างสมการเส้นตรงทำโดยใช้คำสั่ง lm(formula, data) โดยเราจะใช้ตัวแปร $dummy_gender ที่เพิ่งสร้างเป็นตัวแปรทำนาย

mod1 <- lm(weight ~ dummy_gender, data = genderweight)
summary(mod1)

## 
## Call:
## lm(formula = weight ~ dummy_gender, data = genderweight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8163 -1.3647 -0.4869  1.3980  9.2365 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   63.4987     0.7593   83.62   <2e-16 ***
## dummy_gender  22.3274     1.0739   20.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.396 on 38 degrees of freedom
## Multiple R-squared:  0.9192, Adjusted R-squared:  0.9171 
## F-statistic: 432.3 on 1 and 38 DF,  p-value: < 2.2e-16

ผลจากการวิเคราะห์ regression จะแสดงการทดสอบค่าคงที่และความชันของตัวแปรแต่ละตัว

(Intercept) คือ ค่าทำนายตัวแปรตามเมื่อตัวแปรอิสระ (dummy_gender) เป็น 0 ในที่นี้คือ ค่าเฉลี่ยของเงื่อนไขอ้างอิง ("F")

Coefficient estimate ของ dummy_gender คือ ค่าความชัน หรือ ค่าทำนายตัวแปรตามที่เปลี่ยนไปเมื่อตัวแปรอิสระเปลี่ยนไป 1 หน่วย การที่ตัวแปรอิสระ dummy_gender เปลี่ยนไป 1 หน่วย (คือ 0 เป็น 1) คือการเปลี่ยนจากกลุ่ม "F" เป็น "M" ดังนั้นค่าความชัน ก็คือ ความแตกต่างของค่าเฉลี่ยทั้งสองกลุ่มนั่นเอง

ค่า t value ของ dummy_gender ก็มีค่าเท่ากับการทดสอบ t-test ด้านบน

นอกจากนี้ output ของ regression ยังมีค่าทดสอบ F มาด้วย ซึ่งจะมีค่าเท่ากับการวิเคราะห์ ANOVA ตามปกติ

summary(aov(weight ~ group, data = genderweight))

##             Df Sum Sq Mean Sq F value Pr(>F)    
## group        1   4985    4985   432.3 <2e-16 ***
## Residuals   38    438      12                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Default coding in R

ในทางปฏิบัติการสร้าง dummy coding เองนั้นอาจจะเสียเวลามาก การตั้งค่าพื้นฐานใน R จึงรองรับการใช้ตัวแปร factor ในสมการ และจะทำการ code รหัสให้เราอัตโนมัติ โดยใช้ level แรกของตัวแปร factor นั้น เป็นกลุ่มอ้างอิง (ในกรณีนี้คือ "F")

สังเกตว่าผลที่ได้เหมือนกับการวิเคราะห์ด้วย dummy code ที่สร้างเองทุกประการ

str(genderweight)

## tibble [40 × 4] (S3: tbl_df/tbl/data.frame)
##  $ id          : Factor w/ 40 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ group       : Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
##  $ weight      : num [1:40] 61.6 64.6 66.2 59.3 64.9 ...
##  $ dummy_gender: num [1:40] 0 0 0 0 0 0 0 0 0 0 ...

mod2 <- lm(weight ~ group, data = genderweight) # use $group in the formula
summary(mod2) # notice that a coefficient showed which group it represented

## 
## Call:
## lm(formula = weight ~ group, data = genderweight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8163 -1.3647 -0.4869  1.3980  9.2365 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  74.6624     0.5369  139.05   <2e-16 ***
## group1      -11.1637     0.5369  -20.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.396 on 38 degrees of freedom
## Multiple R-squared:  0.9192, Adjusted R-squared:  0.9171 
## F-statistic: 432.3 on 1 and 38 DF,  p-value: < 2.2e-16

Changing levels

หากต้องการจะเปลี่ยนระดับที่เป็นกลุ่มอ้างอิง สามารถทำได้โดยกำหนด level ของตัวแปรนั้นใหม่ ในที่นี้ เราจะสร้างตัวแปรชื่อ $gender2 ขึ้นมา แล้วใช้ option level ในคำสั่ง factor() เพื่อกำหนดให้ "M" เป็นระดับแรก และ "F" เป็นระดับสอง

genderweight$gender2 <- factor(genderweight$group, levels = c("M", "F"))
mod3 <- lm(weight ~ gender2, data = genderweight)
summary(mod3)

## 
## Call:
## lm(formula = weight ~ gender2, data = genderweight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8163 -1.3647 -0.4869  1.3980  9.2365 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  74.6624     0.5369  139.05   <2e-16 ***
## gender21     11.1637     0.5369   20.79   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.396 on 38 degrees of freedom
## Multiple R-squared:  0.9192, Adjusted R-squared:  0.9171 
## F-statistic: 432.3 on 1 and 38 DF,  p-value: < 2.2e-16

สังเกตว่า ค่าสถิติในภาพรวมอย่างค่า F และค่า t ไม่เปลี่ยนแปลง (ค่า t เปลี่ยนเฉพาะสัญลักษณ์จากบวกเป็นลบ) เนื่องจากเป็นการวิเคราะห์โมเดลเดียวกัน แต่ค่า intercept และ slope เปลี่ยนตามลำดับในการเข้ารหัส

ค่า intercept จะกลายเป็นค่าเฉลี่ยของกลุ่ม "M" ซึ่งเป็นกลุ่มอ้างอิงในครั้งนี้ และค่า slope คือ การเปลี่ยนจาก "M" เป็น "F" ซึ่งเป็นการเปรียบเทียบกลุ่มค่าเฉลี่ยสูงไปค่าเฉลี่ยต่ำกว่า ดังนั้น ค่า slope จึงติดลบ แต่ยังมีขนาดความแตกต่างเท่าเดิม (การเปลี่ยนสัญลักษณ์นี้ ทำให้ค่า t เปลี่ยนสัญลักษณ์ตามไปด้วย)

ANVOA as regression

การวิเคราะห์ ANOVA ก็อยู่ในรูปแบบของ regression ได้เช่นกัน โดยเราจะใช้ข้อมูล PlantGrowth เปรียบเทียบน้ำหนักของต้นไม้ในสามเงื่อนไขการทดลอง

data("PlantGrowth")
str(PlantGrowth)

## 'data.frame':    30 obs. of  2 variables:
##  $ weight: num  4.17 5.58 5.18 6.11 4.5 4.61 5.17 4.53 5.33 5.14 ...
##  $ group : Factor w/ 3 levels "ctrl","trt1",..: 1 1 1 1 1 1 1 1 1 1 ...

Manual dummy coding

การเข้ารหัส dummy มีลักษณะเป็น binary คือ 0 หรือ 1 เท่านั้น การเข้ารหัสตัวเแปรจัดประเภทที่มีมากกว่า 2 เงื่อนไขจึงต้องใช้ ตัวแปร dummy เพิ่ม โดยตัวแปร dummy ที่ต้องใช้ คือ $k -1$ หรือเท่ากับ df_btw นั่นเอง

ในกรณีที่มีสามระดับ เราจะต้องการตัวแปร dummy 2 ตัว โดยตัวแรกจะใช้แทนค่ากลุ่ม trt1 โดยเข้ารหัสให้กลุ่มตัวอย่างที่อยู่ในกลุ่ม tr1tมีค่าตัวแปร dummy นี้ = 1 แต่ถ้าอยู่กลุ่มอื่นให้เป็น 0

ตัวแปร dummy ตัวที่สองใช้แทนค่า trt2 หากกลุ่มตัวอย่างอยู่เงื่อนไขนี้ จะแทนค่าด้วย 1 แต่ถ้าอยู่กลุ่มอื่นจะแทนค่าด้วย 0

เนื่องจากตัวแปรกลุ่มนี้มี 3 ระดับจึงมี degrees of freedom = 2 กล่าว คือ หากไม่อยู่กลุ่ม trt1 หรือ trt2 ก็แสดงว่าต้องอยู่ ctrl เท่านั้น ดังนั้นกลุ่มตัวอย่างในเงื่อนไขอ้างอิงนี้ จะมีค่าตัวแปร dummy ทั้งสองตัวเป็น 0

PlantGrowth$dummy_trt1 <- dplyr::recode(PlantGrowth$group, 'ctrl' = 0, 'trt1' = 1, 'trt2' = 0)
PlantGrowth$dummy_trt2 <- dplyr::recode(PlantGrowth$group, 'ctrl' = 0, 'trt1' = 0, 'trt2' = 1)

Regression

ในสมการถดถอย ตัวแปร dummy ทั้งสองตัวจะถูกใส่เป็นตัวแปรทำนาย

plant.lm1 <- lm(weight ~ dummy_trt1 + dummy_trt2, data = PlantGrowth) 
summary(plant.lm1)

## 
## Call:
## lm(formula = weight ~ dummy_trt1 + dummy_trt2, data = PlantGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0710 -0.4180 -0.0060  0.2627  1.3690 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0320     0.1971  25.527   <2e-16 ***
## dummy_trt1   -0.3710     0.2788  -1.331   0.1944    
## dummy_trt2    0.4940     0.2788   1.772   0.0877 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6234 on 27 degrees of freedom
## Multiple R-squared:  0.2641, Adjusted R-squared:  0.2096 
## F-statistic: 4.846 on 2 and 27 DF,  p-value: 0.01591

ค่าสถิติ F คือค่าเดียวกับการวิเคราะห์ ANOVA ตามปกติ

ค่า intercept คือ ค่าเฉลี่ยของกลุ่มอ้างอิง ctrl

ค่า dummy_trt1 คือ ความแตกต่างของค่าเฉลี่ยระหว่าง trt1 กับกลุ่มอ้างอิง ctrl

ค่า dummy_trt2 คือ ความแตกต่างของค่าเฉลี่ยระหว่าง trt2 กับกลุ่มอ้างอิง ctrl

Default coding

หากเราปล่อยให้ R ทำการ code รหัสให้ โดยส่งตัวแปร factor group ลงไปในสมการเลย R จะใช้ระดับแรกในข้อมูลเป็นกลุ่มอ้างอิง ctrl

ในกรณีนี้ค่าที่ได้จึงเหมือนกับการวิเคราะห์ด้านบนทุกประการ

plant.lm2 <- lm(weight ~ group, data = PlantGrowth) # must be factor
summary(plant.lm2)

## 
## Call:
## lm(formula = weight ~ group, data = PlantGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0710 -0.4180 -0.0060  0.2627  1.3690 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.0730     0.1138  44.573   <2e-16 ***
## group1       -0.0410     0.1610  -0.255   0.8009    
## group2       -0.4120     0.1610  -2.560   0.0164 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6234 on 27 degrees of freedom
## Multiple R-squared:  0.2641, Adjusted R-squared:  0.2096 
## F-statistic: 4.846 on 2 and 27 DF,  p-value: 0.01591

เนื่องจาก ANOVA นั้นกรณีพิเศษของ linear model เราสามารถให้ R วิเคราะห์โมเดลจาก lm() ในแบบ ANOVA ได้ด้วยคำสั่ง anova()

(ในการทำงานของ R เมื่อเราสร้างโมเดลด้วย aov() R จะใช้คำสั่ง lm() ในเบื้องหลังการวิเคราะห์ จึงเรียกได้ว่า aov() เป็นกรณีเฉพาะของ lm())

anova(plant.lm2)

## Analysis of Variance Table
## 
## Response: weight
##           Df  Sum Sq Mean Sq F value  Pr(>F)  
## group      2  3.7663  1.8832  4.8461 0.01591 *
## Residuals 27 10.4921  0.3886                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

emmeans(plant.lm2, ~ group)

##  group emmean    SE df lower.CL upper.CL
##  ctrl    5.03 0.197 27     4.63     5.44
##  trt1    4.66 0.197 27     4.26     5.07
##  trt2    5.53 0.197 27     5.12     5.93
## 
## Confidence level used: 0.95

pairs(emmeans(plant.lm2, ~ group)) # notice that two multiple comparisons were the same as the slopes from regression analysis

##  contrast    estimate    SE df t.ratio p.value
##  ctrl - trt1    0.371 0.279 27   1.331  0.3909
##  ctrl - trt2   -0.494 0.279 27  -1.772  0.1980
##  trt1 - trt2   -0.865 0.279 27  -3.103  0.0120
## 
## P value adjustment: tukey method for comparing a family of 3 estimates

Lab 12: Means Comparison in Regression