Let's assume we have the data:
> dput(d)
structure(list(y = c(10.9510128075768, 9.61076281828162, 9.71566933820043,
10.8574097780798, 11.7196272991206, 10.2700549009372, 9.57781599021236,
8.81088670514042, 9.66896702112099, 9.06017067348998, 9.74106741688122,
10.3943791682216, 9.14814290797614, 12.6491668810949, 10.1560116756651,
11.1302072674549, 7.71087602015989, 10.7410011571954, 8.68375483954844,
10.9198036776091, 13.398130155452, 12.5924714207302, 14.3242586301773,
12.2987683307531, 12.4193856957595, 11.9989278189746, 12.3318213932466,
13.9451849533731, 13.4337021495452, 14.005159217677, 12.6098813359463,
13.3763702917746, 13.2441649244865, 11.5737426576175, 14.7784292874755,
13.1344476609337, 13.7655989991579, 13.955136676909, 12.9494342985577,
12.694184580233, 13.8936737024255, 11.9527018509387, 14.9713373862242,
12.6163678937116, 14.654145302275, 14.5122126939506, 13.0829657335859,
13.5672209148515, 11.9754515204655, 13.3230065030224, 14.0436124583562,
13.0990784868972, 12.5458630908456, 12.3442181475496, 12.9640775773775,
14.0691614606768, 12.5160250696987, 12.8789898886726, 11.7058599961792,
13.4943128360149, 4.30790152011415, 4.49704100940278, 3.81470273089736,
1.13021120979739, 3.48202950412376, 3.4561356033012, 2.64659971417009,
3.17048947094798, 2.1359640458731, 3.67923077401566), x1 = c("T1",
"T1", "T1", "T1", "T1", "T1", "T1", "T1", "T1", "T1", "T1", "T1",
"T1", "T1", "T1", "T1", "T1", "T1", "T1", "T1", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2", "T2",
"T2", "T2", "T2", "T2", "T3", "T3", "T3", "T3", "T3", "T3", "T3",
"T3", "T3", "T3"), x2 = c(0.672898985346896, -0.569082185143952,
0.632549243829518, 2.36443492906985, 0.665718635268836, 1.7327500422091,
1.94658564019779, 1.00439870432637, 0.647677694450151, 0.470304490866496,
1.7395892255748, -0.0634574154828123, 1.2462108435364, 0.710500633435688,
-1.26488935648794, -0.408850456073194, 1.91601932879257, 0.808721049464801,
1.80328321613365, 2.8874744633086, 13.4738811811091, 12.677268492313,
12.3799626865667, 11.8072015735427, 13.5778917949044, 12.5962341093185,
10.8264230591286, 11.8443574651097, 10.0810901797302, 11.8047411538894,
9.40767233005401, 13.3140021671981, 11.3644569989679, 11.5700211613058,
11.830681667698, 12.6122181739891, 12.6783401772227, 12.5679519724717,
11.4274573960739, 10.6367087437217, 11.6112777556621, 12.2779141324505,
11.176918878428, 11.9311590655215, 10.8323376738702, 11.9916909857844,
12.1288554015974, 11.854124371539, 11.8360890432639, 13.7635520027849,
12.7625865124183, 13.1114310807306, 11.0767930471702, 12.164341838428,
13.1548251870973, 11.9434785754735, 9.87063935176535, 12.3448457620995,
10.0950445544145, 11.1888298468598, 4.3240043212996, 3.61563684930267,
4.09166895553536, 3.30660486151363, 2.88984123751714, 2.07568722687272,
4.59291375371919, 3.04501059812188, 2.28487159933212, 3.86522309971714
), ID = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L,
20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L,
33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 1L, 2L, 3L, 4L, 5L, 6L,
7L, 8L, 9L, 10L)), .Names = c("y", "x1", "x2", "ID"), row.names = c(NA,
-70L), class = "data.frame")
I'm interested in values of y at X1=T1...T3 adjusted for X2.
The means taken from lm():
> summary(lm(y~x1+x2,data=d))
Call:
lm(formula = y ~ x1 + x2, data = d)
Residuals:
Min 1Q Median 3Q Max
-2.39798 -0.60836 0.01702 0.73175 2.58139
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.04356 0.24784 40.524 <0.0000000000000002 ***
x1T2 2.72726 1.33580 2.042 0.0452 *
x1T3 -6.92770 0.48213 -14.369 <0.0000000000000002 ***
x2 0.03408 0.11955 0.285 0.7765
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9859 on 66 degrees of freedom
Multiple R-squared: 0.9265, Adjusted R-squared: 0.9232
F-statistic: 277.3 on 3 and 66 DF, p-value: < 0.00000000000000022
are:
> with(as.data.frame(t(coef(lm(y~x1+x2,data=d)))), data.frame(x1T1=`(Intercept)`, x1T2=`(Intercept)`+x1T2, x1T3=`(Intercept)`+x1T3))
x1T1 x1T2 x1T3
1 10.04356 12.77083 3.11586
This differs from raw means, so I assume this is corrected for X2:
> tapply(d$y, d$x1, mean)
T1 T2 T3
10.075839 13.175987 3.232031
So why the LS-Means are different from the lm()? Isn't lm() corrected for X2?
emmeans(lm(y~x1+x2,data=d), "x1")
x1 emmean SE df lower.CL upper.CL
T1 10.30 0.820 66 8.66 11.94
T2 13.03 0.542 66 11.95 14.11
T3 3.37 0.585 66 2.20 4.54
So what are the means taken from lm(), if not raw means and not LS-means? I am interested in the means of y at each time point, corrected for x2
I think I found the answer: Why are emmeans package means different than regular means?
So lm() gives me the ordinary marginal means, based on data, and EM - estimated ones, based on a model. But which one are better in my case? If you are asked to report adjusted means, would you pick those form lm or emmeans? I believe SAS users choose LS-means (emmeans), while R users lm()? Or is there any good guideline?