# 1. R 语言层次

R 语言中一个最基本的函数是 lm, 即线性回归模型。线性回归的计算过程不是很复杂，涉及一些线性代数知识和一些简单的微积分知识。

# 4. 附录

## 4.1. R 语言代码

### 4.1.1. R 语言中 lm 函数代码

function (formula, data, subset, weights, na.action, method = "qr",
model = TRUE, x = FALSE, y = FALSE, qr = TRUE, singular.ok = TRUE,
contrasts = NULL, offset, ...)
{
ret.x <- x
ret.y <- y
cl <- match.call()
mf <- match.call(expand.dots = FALSE)
m <- match(c("formula", "data", "subset", "weights", "na.action",
"offset"), names(mf), 0L)
mf <- mf[c(1L, m)]
mf$drop.unused.levels <- TRUE mf[[1L]] <- quote(stats::model.frame) mf <- eval(mf, parent.frame()) if (method == "model.frame") return(mf) else if (method != "qr") warning(gettextf("method = '%s' is not supported. Using 'qr'", method), domain = NA) mt <- attr(mf, "terms") y <- model.response(mf, "numeric") w <- as.vector(model.weights(mf)) if (!is.null(w) && !is.numeric(w)) stop("'weights' must be a numeric vector") offset <- as.vector(model.offset(mf)) if (!is.null(offset)) { if (length(offset) != NROW(y)) stop(gettextf("number of offsets is %d, should equal %d (number of observations)", length(offset), NROW(y)), domain = NA) } if (is.empty.model(mt)) { x <- NULL z <- list(coefficients = if (is.matrix(y)) matrix(, 0, 3) else numeric(), residuals = y, fitted.values = 0 * y, weights = w, rank = 0L, df.residual = if (!is.null(w)) sum(w != 0) else if (is.matrix(y)) nrow(y) else length(y)) if (!is.null(offset)) { z$fitted.values <- offset
z$residuals <- y - offset } } else { x <- model.matrix(mt, mf, contrasts) z <- if (is.null(w)) lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) else lm.wfit(x, y, w, offset = offset, singular.ok = singular.ok, ...) } class(z) <- c(if (is.matrix(y)) "mlm", "lm") z$na.action <- attr(mf, "na.action")
z$offset <- offset z$contrasts <- attr(x, "contrasts")
z$xlevels <- .getXlevels(mt, mf) z$call <- cl
z$terms <- mt if (model) z$model <- mf
if (ret.x)
z$x <- x if (ret.y) z$y <- y
if (!qr)
z$qr <- NULL z } <bytecode: 0x7fd841700050> <environment: namespace:stats> ### 4.1.2. R 语言 lm.fit 函数代码 function (x, y, offset = NULL, method = "qr", tol = 1e-07, singular.ok = TRUE, ...) { if (is.null(n <- nrow(x))) stop("'x' must be a matrix") if (n == 0L) stop("0 (non-NA) cases") p <- ncol(x) if (p == 0L) { return(list(coefficients = numeric(), residuals = y, fitted.values = 0 * y, rank = 0, df.residual = length(y))) } ny <- NCOL(y) if (is.matrix(y) && ny == 1) y <- drop(y) if (!is.null(offset)) y <- y - offset if (NROW(y) != n) stop("incompatible dimensions") if (method != "qr") warning(gettextf("method = '%s' is not supported. Using 'qr'", method), domain = NA) chkDots(...) z <- .Call(C_Cdqrls, x, y, tol, FALSE) if (!singular.ok && z$rank < p)
stop("singular fit encountered")
coef <- z$coefficients pivot <- z$pivot
r1 <- seq_len(z$rank) dn <- colnames(x) if (is.null(dn)) dn <- paste0("x", 1L:p) nmeffects <- c(dn[pivot[r1]], rep.int("", n - z$rank))
r2 <- if (z$rank < p) (z$rank + 1L):p
else integer()
if (is.matrix(y)) {
coef[r2, ] <- NA
if (z$pivoted) coef[pivot, ] <- coef dimnames(coef) <- list(dn, colnames(y)) dimnames(z$effects) <- list(nmeffects, colnames(y))
}
else {
coef[r2] <- NA
if (z$pivoted) coef[pivot] <- coef names(coef) <- dn names(z$effects) <- nmeffects
}
z$coefficients <- coef r1 <- y - z$residuals
if (!is.null(offset))
r1 <- r1 + offset
if (z$pivoted) colnames(z$qr) <- colnames(x)[z$pivot] qr <- z[c("qr", "qraux", "pivot", "tol", "rank")] c(z[c("coefficients", "residuals", "effects", "rank")], list(fitted.values = r1, assign = attr(x, "assign"), qr = structure(qr, class = "qr"), df.residual = n - z$rank))
}
<bytecode: 0x7fd841c4bb10>
<environment: namespace:stats>

### 4.1.3. R 语言 lm.wfit 函数代码

function (x, y, w, offset = NULL, method = "qr", tol = 1e-07,
singular.ok = TRUE, ...)
{
if (is.null(n <- nrow(x)))
stop("'x' must be a matrix")
if (n == 0)
stop("0 (non-NA) cases")
ny <- NCOL(y)
if (is.matrix(y) && ny == 1L)
y <- drop(y)
if (!is.null(offset))
y <- y - offset
if (NROW(y) != n | length(w) != n)
stop("incompatible dimensions")
if (any(w < 0 | is.na(w)))
stop("missing or negative weights not allowed")
if (method != "qr")
warning(gettextf("method = '%s' is not supported. Using 'qr'",
method), domain = NA)
chkDots(...)
x.asgn <- attr(x, "assign")
zero.weights <- any(w == 0)
if (zero.weights) {
save.r <- y
save.f <- y
save.w <- w
ok <- w != 0
nok <- !ok
w <- w[ok]
x0 <- x[!ok, , drop = FALSE]
x <- x[ok, , drop = FALSE]
n <- nrow(x)
y0 <- if (ny > 1L)
y[!ok, , drop = FALSE]
else y[!ok]
y <- if (ny > 1L)
y[ok, , drop = FALSE]
else y[ok]
}
p <- ncol(x)
if (p == 0) {
return(list(coefficients = numeric(), residuals = y,
fitted.values = 0 * y, weights = w, rank = 0L, df.residual = length(y)))
}
if (n == 0) {
return(list(coefficients = rep(NA_real_, p), residuals = y,
fitted.values = 0 * y, weights = w, rank = 0L, df.residual = 0L))
}
wts <- sqrt(w)
z <- .Call(C_Cdqrls, x * wts, y * wts, tol, FALSE)
if (!singular.ok && z$rank < p) stop("singular fit encountered") coef <- z$coefficients
pivot <- z$pivot r1 <- seq_len(z$rank)
dn <- colnames(x)
if (is.null(dn))
dn <- paste0("x", 1L:p)
nmeffects <- c(dn[pivot[r1]], rep.int("", n - z$rank)) r2 <- if (z$rank < p)
(z$rank + 1L):p else integer() if (is.matrix(y)) { coef[r2, ] <- NA if (z$pivoted)
coef[pivot, ] <- coef
dimnames(coef) <- list(dn, colnames(y))
dimnames(z$effects) <- list(nmeffects, colnames(y)) } else { coef[r2] <- NA if (z$pivoted)
coef[pivot] <- coef
names(coef) <- dn
names(z$effects) <- nmeffects } z$coefficients <- coef
z$residuals <- z$residuals/wts
z$fitted.values <- y - z$residuals
z$weights <- w if (zero.weights) { coef[is.na(coef)] <- 0 f0 <- x0 %*% coef if (ny > 1) { save.r[ok, ] <- z$residuals
save.r[nok, ] <- y0 - f0
save.f[ok, ] <- z$fitted.values save.f[nok, ] <- f0 } else { save.r[ok] <- z$residuals
save.r[nok] <- y0 - f0
save.f[ok] <- z$fitted.values save.f[nok] <- f0 } z$residuals <- save.r
z$fitted.values <- save.f z$weights <- save.w
}
if (!is.null(offset))
z$fitted.values <- z$fitted.values + offset
if (z$pivoted) colnames(z$qr) <- colnames(x)[z$pivot] qr <- z[c("qr", "qraux", "pivot", "tol", "rank")] c(z[c("coefficients", "residuals", "fitted.values", "effects", "weights", "rank")], list(assign = x.asgn, qr = structure(qr, class = "qr"), df.residual = n - z$rank))
}
<bytecode: 0x7fd83eae2a28>
<environment: namespace:stats>

## 4.3. C 语言 Cdqrls 函数代码

/*  R : A Computer Language for Statistical Data Analysis
*
*  Copyright (C) 2012-2013  The R Core Team
*
*  This program is free software; you can redistribute it and/or modify
*  the Free Software Foundation; either version 2 of the License, or
*  (at your option) any later version.
*
*  This program is distributed in the hope that it will be useful,
*  but WITHOUT ANY WARRANTY; without even the implied warranty of
*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*  GNU General Public License for more details.
*
*  You should have received a copy of the GNU General Public License
*  along with this program; if not, a copy is available at
*/

#include <R.h>
#include <Rinternals.h>
#include <R_ext/Applic.h>

#include "statsR.h"

#ifdef ENABLE_NLS
#include <libintl.h>
#define _(String) dgettext ("stats", String)
#else
#define _(String) (String)
#endif

/* A wrapper to replace

z <- .Fortran("dqrls",
qr = x, n = n, p = p,
y = y, ny = ny,
tol = as.double(tol),
coefficients = mat.or.vec(p, ny),
residuals = y, effects = y, rank = integer(1L),
pivot = 1L:p, qraux = double(p), work = double(2*p),
PACKAGE="base")

with less allocation.
*/

SEXP Cdqrls(SEXP x, SEXP y, SEXP tol, SEXP chk)
{
SEXP ans;
SEXP qr, coefficients, residuals, effects, pivot, qraux;
int n, ny = 0, p, rank, nprotect = 4, pivoted = 0;
double rtol = asReal(tol), *work;
Rboolean check = asLogical(chk);

ans = getAttrib(x, R_DimSymbol);
if(check && length(ans) != 2) error(_("'x' is not a matrix"));
int *dims = INTEGER(ans);
n = dims[0]; p = dims[1];
if(n) ny = (int)(XLENGTH(y)/n); /* y :  n x ny, or an n - vector */
if(check && n * ny != XLENGTH(y))
error(_("dimensions of 'x' (%d,%d) and 'y' (%d) do not match"),
n,p, XLENGTH(y));

/* These lose attributes, so do after we have extracted dims */
if (TYPEOF(x) != REALSXP) {
PROTECT(x = coerceVector(x, REALSXP));
nprotect++;
}
if (TYPEOF(y) != REALSXP) {
PROTECT(y = coerceVector(y, REALSXP));
nprotect++;
}

double *rptr = REAL(x);
for (R_xlen_t i = 0 ; i < XLENGTH(x) ; i++)
if(!R_FINITE(rptr[i])) error(_("NA/NaN/Inf in '%s'"), "x");

rptr = REAL(y);
for (R_xlen_t i = 0 ; i < XLENGTH(y) ; i++)
if(!R_FINITE(rptr[i])) error(_("NA/NaN/Inf in '%s'"), "y");

const char *ansNms[] = {"qr", "coefficients", "residuals", "effects",
"rank", "pivot", "qraux", "tol", "pivoted", ""};
PROTECT(ans = mkNamed(VECSXP, ansNms));
SET_VECTOR_ELT(ans, 0, qr = shallow_duplicate(x));
coefficients = (ny > 1) ? allocMatrix(REALSXP, p, ny) : allocVector(REALSXP, p);
PROTECT(coefficients);
SET_VECTOR_ELT(ans, 1, coefficients);
SET_VECTOR_ELT(ans, 2, residuals = shallow_duplicate(y));
SET_VECTOR_ELT(ans, 3, effects = shallow_duplicate(y));
PROTECT(pivot = allocVector(INTSXP, p));
int *ip = INTEGER(pivot);
for(int i = 0; i < p; i++) ip[i] = i+1;
SET_VECTOR_ELT(ans, 5, pivot);
PROTECT(qraux = allocVector(REALSXP, p));
SET_VECTOR_ELT(ans, 6, qraux);
SET_VECTOR_ELT(ans, 7, tol);

work = (double *) R_alloc(2 * p, sizeof(double));
F77_CALL(dqrls)(REAL(qr), &n, &p, REAL(y), &ny, &rtol,
REAL(coefficients), REAL(residuals), REAL(effects),
&rank, INTEGER(pivot), REAL(qraux), work);
SET_VECTOR_ELT(ans, 4, ScalarInteger(rank));
for(int i = 0; i < p; i++)
if(ip[i] != i+1) { pivoted = 1; break; }
SET_VECTOR_ELT(ans, 8, ScalarLogical(pivoted));
UNPROTECT(nprotect);

return ans;
}

## 4.4. Fortran 语言 dqrls 函数代码

c
c     dqrfit is a subroutine to compute least squares solutions
c     to the system
c
c     (1)               x * b = y
c
c     which may be either under-determined or over-determined.
c     the user must supply a tolerance to limit the columns of
c     x used in computing the solution.  in effect, a set of
c     columns with a condition number approximately bounded by
c     1/tol is used, the other components of b being set to zero.
c
c     on entry
c
c        x      double precision(n,p).
c               x contains n-by-p coefficient matrix of
c               the system (1), x is destroyed by dqrfit.
c
c        n      the number of rows of the matrix x.
c
c        p      the number of columns of the matrix x.
c
c        y      double precision(n,ny)
c               y contains the right hand side(s) of the system (1).
c
c        ny     the number of right hand sides of the system (1).
c
c        tol    double precision
c               tol is the nonnegative tolerance used to
c               determine the subset of columns of x included
c               in the solution.  columns are pivoted out of
c               decomposition if
c
c        jpvt   integer(p)
c               the values in jpvt are permuted in the same
c               way as the columns of x.  this can be useful
c               in unscrambling coefficients etc.
c
c        work   double precision(2*p)
c               work is an array used by dqrdc2 and dqrsl.
c
c     on return
c
c        x      contains the output array from dqrdc2.
c               namely the qr decomposition of x stored in
c               compact form.
c
c        b      double precision(p,ny)
c               b contains the solution vectors with rows permuted
c               in the same way as the columns of x.  components
c               corresponding to columns not used are set to zero.
c
c        rsd    double precision(n,ny)
c               rsd contains the residual vectors y-x*b.
c
c        qty    double precision(n,ny)     t
c               qty contains the vectors  q y.   note that
c               the initial p elements of this vector are
c               permuted in the same way as the columns of x.
c
c        k      integer
c               k contains the number of columns used in the
c               solution.
c
c        jpvt   has its contents permuted as described above.
c
c        qraux  double precision(p)
c               qraux contains auxiliary information on the
c               qr decomposition of x.
c
c
c     on return the arrays x, jpvt and qraux contain the
c     usual output from dqrdc, so that the qr decomposition
c     of x with pivoting is fully available to the user.
c     in particular, columns jpvt(1), jpvt(2),...,jpvt(k)
c     were used in the solution, and the condition number
c     associated with those columns is estimated by
c     abs(x(1,1)/x(k,k)).
c
c     dqrfit uses the linpack routines dqrdc and dqrsl.
c
subroutine dqrls(x,n,p,y,ny,tol,b,rsd,qty,k,jpvt,qraux,work)
integer n,p,ny,k,jpvt(p)
double precision x(n,p),y(n,ny),tol,b(p,ny),rsd(n,ny),
.                 qty(n,ny),qraux(p),work(p)
c
c     internal variables.
c
integer info,j,jj,kk
c
c     reduce x.
c
call dqrdc2(x,n,n,p,tol,k,qraux,jpvt,work)
c
c     solve the truncated least squares problem for each rhs.
c
if(k .gt. 0) then
do 20 jj=1,ny
call dqrsl(x,n,n,k,qraux,y(1,jj),rsd(1,jj),qty(1,jj),
1           b(1,jj),rsd(1,jj),rsd(1,jj),1110,info)
20       continue
else
do 35 i=1,n
do 30 jj=1,ny
rsd(i,jj) = y(i,jj)
30       continue
35   continue
endif
c
c     set the unused components of b to zero.
c
kk = k + 1
do 50 j=kk,p
do 40 jj=1,ny
b(j,jj) = 0.d0
40    continue
50 continue
return
end