Gaussian Linear Models
Likan Zhan · 2018-12-18
Introduction
The Gaussian linear model
The minimum-variance method
Matrix decomposition
- QR decomposition
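As a quick reminder of the algebra the code below exercises (standard least-squares notation): with $n$ observations and $p$ coefficients,

$$
y = X\beta + \varepsilon, \qquad \varepsilon \sim \mathcal{N}(0, \sigma^2 I_n), \qquad
X = Q_1 R, \qquad Q_1^{\top} Q_1 = I_p, \ R \text{ upper triangular},
$$

so the least-squares estimate $\hat\beta$ solves the triangular system $R\hat\beta = Q_1^{\top} y$. The chunk below fits the model with lm(), extracts the model matrix $X$, and recovers $R$, the thin factor $Q_1$, and the full orthogonal matrix $Q$ from the QR object stored in the fitted model.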
(lm1 <- lm(optden ~ carb, data = Formaldehyde))
(X <- model.matrix(lm1))
(class(qrlm1 <- lm1$qr))
(R <- qr.R(qrlm1))
(Q1 <- qr.Q(qrlm1))
(Q1R <- Q1 %*% R)
all.equal(X, Q1R, check.attributes = FALSE)
all(X == Q1R)
(y <- model.response(model.frame(lm1)))
(Q <- qr.Q(qrlm1, complete = TRUE))
qty1 <- lm1$effects
qty2 <- crossprod(Q, y)
qty3 <- qr.qty(qrlm1, y)
all.equal(cbind(qty1), qty2, check.attributes = FALSE)
all.equal(qty1, qty3, check.attributes = FALSE)
##
## Call:
## lm(formula = optden ~ carb, data = Formaldehyde)
##
## Coefficients:
## (Intercept) carb
## 0.005086 0.876286
##
## (Intercept) carb
## 1 1 0.1
## 2 1 0.3
## 3 1 0.5
## 4 1 0.6
## 5 1 0.7
## 6 1 0.9
## attr(,"assign")
## [1] 0 1
## [1] "qr"
## (Intercept) carb
## 1 -2.44949 -1.2655697
## 2 0.00000 0.6390097
## [,1] [,2]
## [1,] -0.4082483 -0.65205066
## [2,] -0.4082483 -0.33906635
## [3,] -0.4082483 -0.02608203
## [4,] -0.4082483 0.13041013
## [5,] -0.4082483 0.28690229
## [6,] -0.4082483 0.59988661
## (Intercept) carb
## [1,] 1 0.1
## [2,] 1 0.3
## [3,] 1 0.5
## [4,] 1 0.6
## [5,] 1 0.7
## [6,] 1 0.9
## [1] TRUE
## [1] FALSE
## 1 2 3 4 5 6
## 0.086 0.269 0.446 0.538 0.626 0.782
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.4082483 -0.65205066 -0.37370452 -0.3405290 -0.3073534 -0.2410023
## [2,] -0.4082483 -0.33906635 0.05460995 0.2207196 0.3868293 0.7190487
## [3,] -0.4082483 -0.02608203 0.86857638 -0.1439791 -0.1565346 -0.1816455
## [4,] -0.4082483 0.13041013 -0.15359661 0.8125532 -0.2212971 -0.2889976
## [5,] -0.4082483 0.28690229 -0.17576960 -0.2309146 0.7139404 -0.3963496
## [6,] -0.4082483 0.59988661 -0.22011559 -0.3178501 -0.4155847 0.3889463
## [1] TRUE
## [1] TRUE
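A small additional check, using the objects created above: the entries of Q'y beyond the first p = 2 carry only residual information, so their sum of squares should reproduce the residual sum of squares of the fit.

p <- lm1$rank                                  # number of estimated coefficients (2)
all.equal(sum(qty3[-(1:p)]^2), deviance(lm1))  # expected: TRUE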
- Orthogonality of Q
crsQ1 <- crossprod(Q1)
crsQ <- crossprod(Q)
crsQt <- tcrossprod(Q)
all.equal(crsQ1, diag(nrow = ncol(Q1)))
all.equal(crsQ, diag(nrow = ncol(Q)))
all.equal(crsQt, diag(nrow = ncol(Q)))
## [1] TRUE
## [1] TRUE
## [1] TRUE
Inspect the contents of the cross products of Q:
crsQ1
crsQ
## [,1] [,2]
## [1,] 1.000000e+00 -2.775558e-17
## [2,] -2.775558e-17 1.000000e+00
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1.000000e+00 -2.775558e-17 8.326673e-17 1.110223e-16 2.775558e-17
## [2,] -2.775558e-17 1.000000e+00 0.000000e+00 0.000000e+00 -2.775558e-17
## [3,] 8.326673e-17 0.000000e+00 1.000000e+00 2.775558e-17 0.000000e+00
## [4,] 1.110223e-16 0.000000e+00 2.775558e-17 1.000000e+00 2.775558e-17
## [5,] 2.775558e-17 -2.775558e-17 0.000000e+00 2.775558e-17 1.000000e+00
## [6,] 0.000000e+00 -1.110223e-16 0.000000e+00 0.000000e+00 5.551115e-17
## [,6]
## [1,] 0.000000e+00
## [2,] -1.110223e-16
## [3,] 0.000000e+00
## [4,] 0.000000e+00
## [5,] 5.551115e-17
## [6,] 1.000000e+00
Clean up the display with zapsmall(), which rounds the near-zero entries to zero:
zapsmall(crsQ1)
zapsmall(crsQ)
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1 0 0 0 0 0
## [2,] 0 1 0 0 0 0
## [3,] 0 0 1 0 0 0
## [4,] 0 0 0 1 0 0
## [5,] 0 0 0 0 1 0
## [6,] 0 0 0 0 0 1
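A note on why all.equal() reported TRUE above even though the raw cross products contain entries of order 1e-16 and 1e-17: all.equal() compares numeric objects up to a tolerance (roughly 1.5e-8 by default), so such entries count as zero. The largest deviation from the identity can also be inspected directly:

max(abs(crsQ - diag(nrow = ncol(Q))))  # a value around 1e-16 is expected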
Computing the regression coefficients:
coef1 <- backsolve(R, crossprod(Q1, y))
coef2 <- qr.coef(qrlm1, y)
coef3 <- coef(lm1)
all.equal(coef3, as.vector(coef1), check.attributes = FALSE)
## [1] TRUE
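For comparison, a sketch of the textbook normal-equations solution, (X'X) b = X'y, using the X and y extracted earlier; it gives the same coefficients here, but solving the triangular system with backsolve() as above is numerically preferable because it avoids forming X'X.

coef4 <- solve(crossprod(X), crossprod(X, y))  # normal-equations solution
all.equal(as.vector(coef4), as.vector(coef1))  # expected: TRUE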
Column pivoting (reordering of columns), illustrated with an example:
set.seed(1234)
badDat <- within(data.frame(
  x1 = 1:20,
  x2 = rnorm(20, mean = 6, sd = 0.2),
  x4 = rexp(20, rate = 0.02),
  y  = runif(20, min = 18, max = 24)),
  {x3 = x1 + 2 * x2}  # a regressor that is perfectly collinear with x1 and x2
)
badDat
## x1 x2 x4 y x3
## 1 1 5.758587 5.3333591 18.72888 12.51717
## 2 2 6.055486 14.6406094 23.35702 14.11097
## 3 3 6.216888 47.0222204 18.08776 15.43378
## 4 4 5.530860 12.1819198 22.69873 15.06172
## 5 5 6.085825 50.6113942 18.53977 17.17165
## 6 6 6.101211 0.1997473 21.11514 18.20242
## 7 7 5.885052 17.7094527 20.30560 18.77010
## 8 8 5.890674 37.2143748 18.42031 19.78135
## 9 9 5.887110 46.5946794 19.92387 20.77422
## 10 10 5.821992 21.7271743 22.01097 21.64398
## 11 11 5.904561 0.4545912 23.55840 22.80912
## 12 12 5.800323 80.5143016 20.83146 23.60065
## 13 13 5.844749 0.3933488 18.85569 24.68950
## 14 14 6.012892 40.7102607 21.26562 26.02578
## 15 15 6.191899 1.4506626 19.17705 27.38380
## 16 16 5.977943 216.2281546 23.39148 27.95589
## 17 17 5.897798 46.2763105 20.33700 28.79560
## 18 18 5.817761 0.8547566 19.86522 29.63552
## 19 19 5.832566 171.2640271 18.96017 30.66513
## 20 20 6.483167 6.4569840 23.37712 32.96633
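A quick confirmation (an added check; X2 is introduced here only for this purpose) that the design is indeed rank deficient: x3 is an exact linear combination of x1 and x2, so the five-column model matrix has rank four.

with(badDat, all.equal(x3, x1 + 2 * x2))                # expected: TRUE
X2 <- model.matrix(~ x1 + x2 + x3 + x4, data = badDat)  # helper matrix for the rank check
c(columns = ncol(X2), rank = qr(X2)$rank)               # expected: 5 and 4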
Fit the regression model:
lm2 <- lm(y ~ x1 + x2 + x3 + x4, data = badDat)
summary(lm2)
##
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4, data = badDat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3444 -1.7670 -0.3585 1.6159 3.0292
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.793e+01 1.390e+01 1.290 0.215
## x1 4.140e-02 8.553e-02 0.484 0.635
## x2 3.822e-01 2.358e+00 0.162 0.873
## x3 NA NA NA NA
## x4 3.901e-05 8.680e-03 0.004 0.996
##
## Residual standard error: 2.02 on 16 degrees of freedom
## Multiple R-squared: 0.021, Adjusted R-squared: -0.1626
## F-statistic: 0.1144 on 3 and 16 DF, p-value: 0.9504
Inspect the decomposition:
(lm2qr <- lm2$qr)$rank
## [1] 4
qr.R(lm2qr) # the columns of the matrix have been pivoted (reordered)
## (Intercept) x1 x2 x4 x3
## 1 -4.472136 -46.95743 -26.6086150 -182.87421 -1.001747e+02
## 2 0.000000 25.78759 0.1721295 83.85993 2.613185e+01
## 3 0.000000 0.00000 -0.8668932 35.62409 -1.733786e+00
## 4 0.000000 0.00000 0.0000000 232.75139 2.417072e-15
## 5 0.000000 0.00000 0.0000000 0.00000 1.241057e-14
lm2qr$pivot # the permutation vector used for the pivoting
## [1] 1 2 3 5 4
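Putting the pivot to use, a short sketch not in the original code: the pivoted column positions beyond the computed rank identify the aliased columns, which is why x3 received an NA coefficient in the summary above.

piv <- lm2qr$pivot
rnk <- lm2qr$rank
colnames(model.matrix(lm2))[piv[-(1:rnk)]]  # expected: "x3"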