R 语言中的 lm 函数
战立侃 · 2018-06-17
1.
1. R 语言层次
R 语言中一个最基本的函数是 lm
, 即线性回归模型。线性回归的计算过程不是很复杂,涉及一些线性代数知识和一些简单的微积分知识。
2. C 语言层次
3. Fortran 语言层次
4. 附录
4.1. R 语言代码
4.1.1. R 语言中 lm
函数代码
function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
{
ret.x <- x
ret.y <- y
cl <- match.call()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "weights", "na.action",
"offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE
mf[[1L]] <- quote(stats::model.frame)
mf <- eval(mf, parent.frame())
if (method == "model.frame")
return(mf)
else if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
mt <- attr(mf, "terms")
y <- model.response(mf, "numeric")
w <- as.vector(model.weights(mf))
if (!is.null(w) && !is.numeric(w))
stop("'weights' must be a numeric vector")
offset <- as.vector(model.offset(mf))
if (!is.null(offset)) {
if (length(offset) != NROW(y))
stop(gettextf("number of offsets is %d, should equal %d (number of observations)",
length(offset), NROW(y)), domain = NA)
}
if (is.empty.model(mt)) {
x <- NULL
z <- list(coefficients = if (is.matrix(y)) matrix(, 0,
3) else numeric(), residuals = y, fitted.values = 0 *
y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w !=
0) else if (is.matrix(y)) nrow(y) else length(y))
if (!is.null(offset)) {
z$fitted.values <- offset
z$residuals <- y - offset
}
}
else {
x <- model.matrix(mt, mf, contrasts)
z <- if (is.null(w))
lm.fit(x, y, offset = offset, singular.ok = singular.ok,
...)
else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok,
...)
}
class(z) <- c(if (is.matrix(y)) "mlm", "lm")
z$na.action <- attr(mf, "na.action")
z$offset <- offset
z$contrasts <- attr(x, "contrasts")
z$xlevels <- .getXlevels(mt, mf)
z$call <- cl
z$terms <- mt
if (model)
z$model <- mf
if (ret.x)
z$x <- x
if (ret.y)
z$y <- y
if (!qr)
z$qr <- NULL
z
}
<bytecode: 0x7fd841700050>
<environment: namespace:stats>
4.1.2. R 语言 lm.fit
函数代码
function (x, y, offset = NULL, method = "qr", tol = 1e-07, singular.ok = TRUE,
...)
{
if (is.null(n <- nrow(x)))
stop("'x' must be a matrix")
if (n == 0L)
stop("0 (non-NA) cases")
p <- ncol(x)
if (p == 0L) {
return(list(coefficients = numeric(), residuals = y,
fitted.values = 0 * y, rank = 0, df.residual = length(y)))
}
ny <- NCOL(y)
if (is.matrix(y) && ny == 1)
y <- drop(y)
if (!is.null(offset))
y <- y - offset
if (NROW(y) != n)
stop("incompatible dimensions")
if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
chkDots(...)
z <- .Call(C_Cdqrls, x, y, tol, FALSE)
if (!singular.ok && z$rank < p)
stop("singular fit encountered")
coef <- z$coefficients
pivot <- z$pivot
r1 <- seq_len(z$rank)
dn <- colnames(x)
if (is.null(dn))
dn <- paste0("x", 1L:p)
nmeffects <- c(dn[pivot[r1]], rep.int("", n - z$rank))
r2 <- if (z$rank < p)
(z$rank + 1L):p
else integer()
if (is.matrix(y)) {
coef[r2, ] <- NA
if (z$pivoted)
coef[pivot, ] <- coef
dimnames(coef) <- list(dn, colnames(y))
dimnames(z$effects) <- list(nmeffects, colnames(y))
}
else {
coef[r2] <- NA
if (z$pivoted)
coef[pivot] <- coef
names(coef) <- dn
names(z$effects) <- nmeffects
}
z$coefficients <- coef
r1 <- y - z$residuals
if (!is.null(offset))
r1 <- r1 + offset
if (z$pivoted)
colnames(z$qr) <- colnames(x)[z$pivot]
qr <- z[c("qr", "qraux", "pivot", "tol", "rank")]
c(z[c("coefficients", "residuals", "effects", "rank")], list(fitted.values = r1,
assign = attr(x, "assign"), qr = structure(qr, class = "qr"),
df.residual = n - z$rank))
}
<bytecode: 0x7fd841c4bb10>
<environment: namespace:stats>
4.1.3. R 语言 lm.wfit
函数代码
function (x, y, w, offset = NULL, method = "qr", tol = 1e-07,
singular.ok = TRUE, ...)
{
if (is.null(n <- nrow(x)))
stop("'x' must be a matrix")
if (n == 0)
stop("0 (non-NA) cases")
ny <- NCOL(y)
if (is.matrix(y) && ny == 1L)
y <- drop(y)
if (!is.null(offset))
y <- y - offset
if (NROW(y) != n | length(w) != n)
stop("incompatible dimensions")
if (any(w < 0 | is.na(w)))
stop("missing or negative weights not allowed")
if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
chkDots(...)
x.asgn <- attr(x, "assign")
zero.weights <- any(w == 0)
if (zero.weights) {
save.r <- y
save.f <- y
save.w <- w
ok <- w != 0
nok <- !ok
w <- w[ok]
x0 <- x[!ok, , drop = FALSE]
x <- x[ok, , drop = FALSE]
n <- nrow(x)
y0 <- if (ny > 1L)
y[!ok, , drop = FALSE]
else y[!ok]
y <- if (ny > 1L)
y[ok, , drop = FALSE]
else y[ok]
}
p <- ncol(x)
if (p == 0) {
return(list(coefficients = numeric(), residuals = y,
fitted.values = 0 * y, weights = w, rank = 0L, df.residual = length(y)))
}
if (n == 0) {
return(list(coefficients = rep(NA_real_, p), residuals = y,
fitted.values = 0 * y, weights = w, rank = 0L, df.residual = 0L))
}
wts <- sqrt(w)
z <- .Call(C_Cdqrls, x * wts, y * wts, tol, FALSE)
if (!singular.ok && z$rank < p)
stop("singular fit encountered")
coef <- z$coefficients
pivot <- z$pivot
r1 <- seq_len(z$rank)
dn <- colnames(x)
if (is.null(dn))
dn <- paste0("x", 1L:p)
nmeffects <- c(dn[pivot[r1]], rep.int("", n - z$rank))
r2 <- if (z$rank < p)
(z$rank + 1L):p
else integer()
if (is.matrix(y)) {
coef[r2, ] <- NA
if (z$pivoted)
coef[pivot, ] <- coef
dimnames(coef) <- list(dn, colnames(y))
dimnames(z$effects) <- list(nmeffects, colnames(y))
}
else {
coef[r2] <- NA
if (z$pivoted)
coef[pivot] <- coef
names(coef) <- dn
names(z$effects) <- nmeffects
}
z$coefficients <- coef
z$residuals <- z$residuals/wts
z$fitted.values <- y - z$residuals
z$weights <- w
if (zero.weights) {
coef[is.na(coef)] <- 0
f0 <- x0 %*% coef
if (ny > 1) {
save.r[ok, ] <- z$residuals
save.r[nok, ] <- y0 - f0
save.f[ok, ] <- z$fitted.values
save.f[nok, ] <- f0
}
else {
save.r[ok] <- z$residuals
save.r[nok] <- y0 - f0
save.f[ok] <- z$fitted.values
save.f[nok] <- f0
}
z$residuals <- save.r
z$fitted.values <- save.f
z$weights <- save.w
}
if (!is.null(offset))
z$fitted.values <- z$fitted.values + offset
if (z$pivoted)
colnames(z$qr) <- colnames(x)[z$pivot]
qr <- z[c("qr", "qraux", "pivot", "tol", "rank")]
c(z[c("coefficients", "residuals", "fitted.values", "effects",
"weights", "rank")], list(assign = x.asgn, qr = structure(qr,
class = "qr"), df.residual = n - z$rank))
}
<bytecode: 0x7fd83eae2a28>
<environment: namespace:stats>
4.3. C 语言 Cdqrls
函数代码
/* R : A Computer Language for Statistical Data Analysis
*
* Copyright (C) 2012-2013 The R Core Team
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, a copy is available at
* https://www.R-project.org/Licenses/.
*/
#include <R.h>
#include <Rinternals.h>
#include <R_ext/Applic.h>
#include "statsR.h"
#ifdef ENABLE_NLS
#include <libintl.h>
#define _(String) dgettext ("stats", String)
#else
#define _(String) (String)
#endif
/* A wrapper to replace
z <- .Fortran("dqrls",
qr = x, n = n, p = p,
y = y, ny = ny,
tol = as.double(tol),
coefficients = mat.or.vec(p, ny),
residuals = y, effects = y, rank = integer(1L),
pivot = 1L:p, qraux = double(p), work = double(2*p),
PACKAGE="base")
with less allocation.
*/
SEXP Cdqrls(SEXP x, SEXP y, SEXP tol, SEXP chk)
{
SEXP ans;
SEXP qr, coefficients, residuals, effects, pivot, qraux;
int n, ny = 0, p, rank, nprotect = 4, pivoted = 0;
double rtol = asReal(tol), *work;
Rboolean check = asLogical(chk);
ans = getAttrib(x, R_DimSymbol);
if(check && length(ans) != 2) error(_("'x' is not a matrix"));
int *dims = INTEGER(ans);
n = dims[0]; p = dims[1];
if(n) ny = (int)(XLENGTH(y)/n); /* y : n x ny, or an n - vector */
if(check && n * ny != XLENGTH(y))
error(_("dimensions of 'x' (%d,%d) and 'y' (%d) do not match"),
n,p, XLENGTH(y));
/* These lose attributes, so do after we have extracted dims */
if (TYPEOF(x) != REALSXP) {
PROTECT(x = coerceVector(x, REALSXP));
nprotect++;
}
if (TYPEOF(y) != REALSXP) {
PROTECT(y = coerceVector(y, REALSXP));
nprotect++;
}
double *rptr = REAL(x);
for (R_xlen_t i = 0 ; i < XLENGTH(x) ; i++)
if(!R_FINITE(rptr[i])) error(_("NA/NaN/Inf in '%s'"), "x");
rptr = REAL(y);
for (R_xlen_t i = 0 ; i < XLENGTH(y) ; i++)
if(!R_FINITE(rptr[i])) error(_("NA/NaN/Inf in '%s'"), "y");
const char *ansNms[] = {"qr", "coefficients", "residuals", "effects",
"rank", "pivot", "qraux", "tol", "pivoted", ""};
PROTECT(ans = mkNamed(VECSXP, ansNms));
SET_VECTOR_ELT(ans, 0, qr = shallow_duplicate(x));
coefficients = (ny > 1) ? allocMatrix(REALSXP, p, ny) : allocVector(REALSXP, p);
PROTECT(coefficients);
SET_VECTOR_ELT(ans, 1, coefficients);
SET_VECTOR_ELT(ans, 2, residuals = shallow_duplicate(y));
SET_VECTOR_ELT(ans, 3, effects = shallow_duplicate(y));
PROTECT(pivot = allocVector(INTSXP, p));
int *ip = INTEGER(pivot);
for(int i = 0; i < p; i++) ip[i] = i+1;
SET_VECTOR_ELT(ans, 5, pivot);
PROTECT(qraux = allocVector(REALSXP, p));
SET_VECTOR_ELT(ans, 6, qraux);
SET_VECTOR_ELT(ans, 7, tol);
work = (double *) R_alloc(2 * p, sizeof(double));
F77_CALL(dqrls)(REAL(qr), &n, &p, REAL(y), &ny, &rtol,
REAL(coefficients), REAL(residuals), REAL(effects),
&rank, INTEGER(pivot), REAL(qraux), work);
SET_VECTOR_ELT(ans, 4, ScalarInteger(rank));
for(int i = 0; i < p; i++)
if(ip[i] != i+1) { pivoted = 1; break; }
SET_VECTOR_ELT(ans, 8, ScalarLogical(pivoted));
UNPROTECT(nprotect);
return ans;
}
4.4. Fortran 语言 dqrls
函数代码
c
c dqrfit is a subroutine to compute least squares solutions
c to the system
c
c (1) x * b = y
c
c which may be either under-determined or over-determined.
c the user must supply a tolerance to limit the columns of
c x used in computing the solution. in effect, a set of
c columns with a condition number approximately bounded by
c 1/tol is used, the other components of b being set to zero.
c
c on entry
c
c x double precision(n,p).
c x contains n-by-p coefficient matrix of
c the system (1), x is destroyed by dqrfit.
c
c n the number of rows of the matrix x.
c
c p the number of columns of the matrix x.
c
c y double precision(n,ny)
c y contains the right hand side(s) of the system (1).
c
c ny the number of right hand sides of the system (1).
c
c tol double precision
c tol is the nonnegative tolerance used to
c determine the subset of columns of x included
c in the solution. columns are pivoted out of
c decomposition if
c
c jpvt integer(p)
c the values in jpvt are permuted in the same
c way as the columns of x. this can be useful
c in unscrambling coefficients etc.
c
c work double precision(2*p)
c work is an array used by dqrdc2 and dqrsl.
c
c on return
c
c x contains the output array from dqrdc2.
c namely the qr decomposition of x stored in
c compact form.
c
c b double precision(p,ny)
c b contains the solution vectors with rows permuted
c in the same way as the columns of x. components
c corresponding to columns not used are set to zero.
c
c rsd double precision(n,ny)
c rsd contains the residual vectors y-x*b.
c
c qty double precision(n,ny) t
c qty contains the vectors q y. note that
c the initial p elements of this vector are
c permuted in the same way as the columns of x.
c
c k integer
c k contains the number of columns used in the
c solution.
c
c jpvt has its contents permuted as described above.
c
c qraux double precision(p)
c qraux contains auxiliary information on the
c qr decomposition of x.
c
c
c on return the arrays x, jpvt and qraux contain the
c usual output from dqrdc, so that the qr decomposition
c of x with pivoting is fully available to the user.
c in particular, columns jpvt(1), jpvt(2),...,jpvt(k)
c were used in the solution, and the condition number
c associated with those columns is estimated by
c abs(x(1,1)/x(k,k)).
c
c dqrfit uses the linpack routines dqrdc and dqrsl.
c
subroutine dqrls(x,n,p,y,ny,tol,b,rsd,qty,k,jpvt,qraux,work)
integer n,p,ny,k,jpvt(p)
double precision x(n,p),y(n,ny),tol,b(p,ny),rsd(n,ny),
. qty(n,ny),qraux(p),work(p)
c
c internal variables.
c
integer info,j,jj,kk
c
c reduce x.
c
call dqrdc2(x,n,n,p,tol,k,qraux,jpvt,work)
c
c solve the truncated least squares problem for each rhs.
c
if(k .gt. 0) then
do 20 jj=1,ny
call dqrsl(x,n,n,k,qraux,y(1,jj),rsd(1,jj),qty(1,jj),
1 b(1,jj),rsd(1,jj),rsd(1,jj),1110,info)
20 continue
else
do 35 i=1,n
do 30 jj=1,ny
rsd(i,jj) = y(i,jj)
30 continue
35 continue
endif
c
c set the unused components of b to zero.
c
kk = k + 1
do 50 j=kk,p
do 40 jj=1,ny
b(j,jj) = 0.d0
40 continue
50 continue
return
end