Likan Zhan

高斯线性模型

Likan Zhan · 2018-12-18

  1. QR 分解
(lm1 <- lm(optden ~ carb, data = Formaldehyde))
(X <- model.matrix(lm1))
(class(qrlm1 <- lm1$qr))

(R  <- qr.R(qrlm1))
(Q1 <- qr.Q(qrlm1))

(Q1R <- Q1 %*% R)

all.equal(X, Q1R, check.attributes = FALSE)
all(X == Q1R)

(y <- model.response(model.frame(lm1)))
(Q <- qr.Q(qrlm1, complete = TRUE))

qty1 <- lm1$effects
qty2 <- crossprod(Q, y)
qty3  <- qr.qty(qrlm1, y)
all.equal(cbind(qty1), qty2, check.attributes = FALSE)
all.equal(qty1, qty3, check.attributes = FALSE)
## 
## Call:
## lm(formula = optden ~ carb, data = Formaldehyde)
## 
## Coefficients:
## (Intercept)         carb  
##    0.005086     0.876286  
## 
##   (Intercept) carb
## 1           1  0.1
## 2           1  0.3
## 3           1  0.5
## 4           1  0.6
## 5           1  0.7
## 6           1  0.9
## attr(,"assign")
## [1] 0 1
## [1] "qr"
##   (Intercept)       carb
## 1    -2.44949 -1.2655697
## 2     0.00000  0.6390097
##            [,1]        [,2]
## [1,] -0.4082483 -0.65205066
## [2,] -0.4082483 -0.33906635
## [3,] -0.4082483 -0.02608203
## [4,] -0.4082483  0.13041013
## [5,] -0.4082483  0.28690229
## [6,] -0.4082483  0.59988661
##      (Intercept) carb
## [1,]           1  0.1
## [2,]           1  0.3
## [3,]           1  0.5
## [4,]           1  0.6
## [5,]           1  0.7
## [6,]           1  0.9
## [1] TRUE
## [1] FALSE
##     1     2     3     4     5     6 
## 0.086 0.269 0.446 0.538 0.626 0.782 
##            [,1]        [,2]        [,3]       [,4]       [,5]       [,6]
## [1,] -0.4082483 -0.65205066 -0.37370452 -0.3405290 -0.3073534 -0.2410023
## [2,] -0.4082483 -0.33906635  0.05460995  0.2207196  0.3868293  0.7190487
## [3,] -0.4082483 -0.02608203  0.86857638 -0.1439791 -0.1565346 -0.1816455
## [4,] -0.4082483  0.13041013 -0.15359661  0.8125532 -0.2212971 -0.2889976
## [5,] -0.4082483  0.28690229 -0.17576960 -0.2309146  0.7139404 -0.3963496
## [6,] -0.4082483  0.59988661 -0.22011559 -0.3178501 -0.4155847  0.3889463
## [1] TRUE
## [1] TRUE
  1. Q 的正交性
crsQ1 <- crossprod(Q1)
crsQ  <- crossprod(Q)
crsQt <- tcrossprod(Q)

all.equal(crsQ1, diag(nrow = ncol(Q1)))
all.equal(crsQ,  diag(nrow = ncol(Q)))
all.equal(crsQt, diag(nrow = ncol(Q)))
## [1] TRUE
## [1] TRUE
## [1] TRUE

查看 Q 的内容

crsQ1
crsQ
##               [,1]          [,2]
## [1,]  1.000000e+00 -2.775558e-17
## [2,] -2.775558e-17  1.000000e+00
##               [,1]          [,2]         [,3]         [,4]          [,5]
## [1,]  1.000000e+00 -2.775558e-17 8.326673e-17 1.110223e-16  2.775558e-17
## [2,] -2.775558e-17  1.000000e+00 0.000000e+00 0.000000e+00 -2.775558e-17
## [3,]  8.326673e-17  0.000000e+00 1.000000e+00 2.775558e-17  0.000000e+00
## [4,]  1.110223e-16  0.000000e+00 2.775558e-17 1.000000e+00  2.775558e-17
## [5,]  2.775558e-17 -2.775558e-17 0.000000e+00 2.775558e-17  1.000000e+00
## [6,]  0.000000e+00 -1.110223e-16 0.000000e+00 0.000000e+00  5.551115e-17
##               [,6]
## [1,]  0.000000e+00
## [2,] -1.110223e-16
## [3,]  0.000000e+00
## [4,]  0.000000e+00
## [5,]  5.551115e-17
## [6,]  1.000000e+00

zapsmall() 进行优化

zapsmall(crsQ1)
zapsmall(crsQ)
##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1
##      [,1] [,2] [,3] [,4] [,5] [,6]
## [1,]    1    0    0    0    0    0
## [2,]    0    1    0    0    0    0
## [3,]    0    0    1    0    0    0
## [4,]    0    0    0    1    0    0
## [5,]    0    0    0    0    1    0
## [6,]    0    0    0    0    0    1

回归系数的计算

coef1 <- backsolve(R, crossprod(Q1, y))
coef2 <- qr.coef(qrlm1, y)
coef3 <- coef(lm1)
all.equal(coef3, as.vector(coef1), check.attributes = FALSE)
## [1] TRUE

列的重新组合,例如

set.seed(1234)
badDat <- within(data.frame(
  x1 = 1:20,
  x2 = rnorm(20, mean = 6, sd = 0.2),
  x4 = rexp(20, rate = 0.02),
  y  = runif(20, min = 18, max = 24)),
  {x3 = x1 + 2 * x2} # 一个存在完美共线性的回归因子
  )
badDat
##    x1       x2          x4        y       x3
## 1   1 5.758587   5.3333591 18.72888 12.51717
## 2   2 6.055486  14.6406094 23.35702 14.11097
## 3   3 6.216888  47.0222204 18.08776 15.43378
## 4   4 5.530860  12.1819198 22.69873 15.06172
## 5   5 6.085825  50.6113942 18.53977 17.17165
## 6   6 6.101211   0.1997473 21.11514 18.20242
## 7   7 5.885052  17.7094527 20.30560 18.77010
## 8   8 5.890674  37.2143748 18.42031 19.78135
## 9   9 5.887110  46.5946794 19.92387 20.77422
## 10 10 5.821992  21.7271743 22.01097 21.64398
## 11 11 5.904561   0.4545912 23.55840 22.80912
## 12 12 5.800323  80.5143016 20.83146 23.60065
## 13 13 5.844749   0.3933488 18.85569 24.68950
## 14 14 6.012892  40.7102607 21.26562 26.02578
## 15 15 6.191899   1.4506626 19.17705 27.38380
## 16 16 5.977943 216.2281546 23.39148 27.95589
## 17 17 5.897798  46.2763105 20.33700 28.79560
## 18 18 5.817761   0.8547566 19.86522 29.63552
## 19 19 5.832566 171.2640271 18.96017 30.66513
## 20 20 6.483167   6.4569840 23.37712 32.96633

建构回归模型

lm2 <- lm(y ~ x1 + x2 + x3 + x4, data = badDat)
summary(lm2)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = badDat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3444 -1.7670 -0.3585  1.6159  3.0292 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.793e+01  1.390e+01   1.290    0.215
## x1          4.140e-02  8.553e-02   0.484    0.635
## x2          3.822e-01  2.358e+00   0.162    0.873
## x3                 NA         NA      NA       NA
## x4          3.901e-05  8.680e-03   0.004    0.996
## 
## Residual standard error: 2.02 on 16 degrees of freedom
## Multiple R-squared:  0.021,  Adjusted R-squared:  -0.1626 
## F-statistic: 0.1144 on 3 and 16 DF,  p-value: 0.9504

查看分解结果

(lm2qr <- lm2$qr)$rank
## [1] 4
qr.R(lm2qr) # 矩阵的列进行了重新组合
##   (Intercept)        x1          x2         x4            x3
## 1   -4.472136 -46.95743 -26.6086150 -182.87421 -1.001747e+02
## 2    0.000000  25.78759   0.1721295   83.85993  2.613185e+01
## 3    0.000000   0.00000  -0.8668932   35.62409 -1.733786e+00
## 4    0.000000   0.00000   0.0000000  232.75139  2.417072e-15
## 5    0.000000   0.00000   0.0000000    0.00000  1.241057e-14
lm2qr$pivot # 用于排列的数组
## [1] 1 2 3 5 4