library(magrittr)

rm(list=ls())

第10章：函数式编程

函数式编程：创建和操作函数的工具。

适用于向量的所有操作也都适用于函数：

将函数赋值给变量；
将函数存储在列表；
函数作为参数传递给其他函数；
函数内创建函数；
把函数作为函数的结果返回；

10.1 动机

DRY: Do Not Repeat Yourself

函数组合

将两个简单函数组合起来；

lapply以函数作为输入，所以它是泛函functional。

df <- data.frame(replicate(6,sample(c(1:10,-99),6,replace = T)))
df

##    X1 X2 X3 X4 X5  X6
## 1   2  4  3 10  2   9
## 2   4  1  2  6  6   3
## 3   3 10  9  4 10  10
## 4   1  2  1  8  7   6
## 5 -99  6  7  4  9 -99
## 6   6  5  2  9 10   8

fix_missing <- function(x) {
  x[x == -99] <- NA
  x
}
df[] <- lapply(df, fix_missing)
df

##   X1 X2 X3 X4 X5 X6
## 1  2  4  3 10  2  9
## 2  4  1  2  6  6  3
## 3  3 10  9  4 10 10
## 4  1  2  1  8  7  6
## 5 NA  6  7  4  9 NA
## 6  6  5  2  9 10  8

函数工厂

基于模板来创建函数

missing_fixer <- function(na_value) {
  function(x) {
    x[x == na_value] <- NA
    x
  }
}
fix_missing_99 <- missing_fixer(-99)
fix_missing_999 <- missing_fixer(-999)
fix_missing_99(c(-99,-999))

## [1]   NA -999

fix_missing_999(c(-99,-999))

## [1] -99  NA

函数列表

多个相似函数的同时使用

summary_new <- function(x) {
  funs <- c(mean, median, sd, mad, IQR)
  lapply(funs, function(f) f(x, na.rm = T))
}
sapply(mtcars, summary_new)

##      mpg      cyl      disp     hp       drat      wt        qsec    
## [1,] 20.09062 6.1875   230.7219 146.6875 3.596563  3.21725   17.84875
## [2,] 19.2     6        196.3    123      3.695     3.325     17.71   
## [3,] 6.026948 1.785922 123.9387 68.56287 0.5346787 0.9784574 1.786943
## [4,] 5.41149  2.9652   140.4764 77.0952  0.704235  0.7672455 1.415883
## [5,] 7.375    4        205.175  83.5     0.84      1.02875   2.0075  
##      vs        am        gear      carb  
## [1,] 0.4375    0.40625   3.6875    2.8125
## [2,] 0         0         4         2     
## [3,] 0.5040161 0.4989909 0.7378041 1.6152
## [4,] 0         0         1.4826    1.4826
## [5,] 1         1         1         2

10.2 匿名函数

函数本身就是对象，不会自带函数名。使用常规赋值运算符命名，得到匿名函数。

当觉得没有必要给函数命名时，使用匿名函数。

创建小函数；
创建闭包；

10.3 闭包

闭包：将父函数的环境封装并可以访问它所有变量。

两个层次的参数：

父层次控制运算；
子层次进行工作；

power <- function(exponent) {
  function(x) {
    x ^ exponent
  }
}

square <- power(2)
square(2)

## [1] 4

square(4)

## [1] 16

cube <- power(3)
cube(2)

## [1] 8

cube(4)

## [1] 64

10.4 函数列表

函数可以更容易地与一组相关函数一起运行

比较算法时间

compute_mean <- list(
  base = function(x) mean(x),
  sum = function(x) sum(x) / length(x),
  manual = function(x) {
    total <- 0
    n <- length(x)
    for(i in seq_along(x)) {
      total <- total + x[i] / n
    }
    total
  }
)

x <- runif(1e5)
system.time(compute_mean$base(x))

## 用户 系统 流逝 
##    0    0    0

system.time(compute_mean[[2]](x))

## 用户 系统 流逝 
##    0    0    0

system.time(compute_mean[["manual"]](x))

## 用户 系统 流逝 
## 0.01 0.00 0.02

lapply(compute_mean, function(f) system.time(f(x)))

## $base
## 用户 系统 流逝 
##    0    0    0 
## 
## $sum
## 用户 系统 流逝 
##    0    0    0 
## 
## $manual
## 用户 系统 流逝 
##    0    0    0

对一个对象进行多方面汇总

x <- 1:10
funs <- list(
  sum = sum,
  mean = mean,
  median = median
)
lapply(funs, function(f) f(x))

## $sum
## [1] 55
## 
## $mean
## [1] 5.5
## 
## $median
## [1] 5.5

lapply(funs, function(f) f(x, na.rm = T))

## $sum
## [1] 55
## 
## $mean
## [1] 5.5
## 
## $median
## [1] 5.5

10.5 案例研究：数值积分

函数名作为参数1

midpoint <- function(f,a,b){
  (b-a) * f((a+b)/2)
}
trapezoid <- function(f,a,b){
  (b-a)/2 * (f(a)+f(b))
}

midpoint(sin,0,pi)

## [1] 3.141593

trapezoid(sin,0,pi)

## [1] 1.923607e-16

两个函数都没有给出正确答案。

为了使答案更加准确，先将整个积分区间分成小区间，再对小区间积分。组合积分

midpoint_composite <- function(f,a,b,n=10){
  points <- seq(a,b,length = n+1)
  h <- (b-a)/n
  
  area <- 0
  for(i in seq_len(n)) {
    area <- area+ h * f((points[i]+points[i+1])/2)
  }
  area
}

trapezoid_composite <- function(f,a,b,n=10){
  points <- seq(a,b,length = n+1)
  h <- (b-a)/n
  
  area <- 0
  for(i in seq_len(n)) {
    area <- area+ h/2* (f(points[i])+f(points[i+1]))
  }
  area
}

midpoint_composite(sin,0,pi,n=10)

## [1] 2.008248

trapezoid_composite(sin,0,pi,n=10)

## [1] 1.983524

两个函数有大量相似，考虑提取更通用的组合积分函数

函数名作为参数2

composite <- function(f, a, b, n=10, rule) {
  points <- seq(a, b, length = n+1)
  
  area <- 0
  for(i in seq_len(n)){
    area <- area + rule(f, points[i], points[i+1])
  }
  area
}

composite(sin,0,pi,n=10,rule=midpoint)

## [1] 2.008248

composite(sin,0,pi,n=10,rule=trapezoid)

## [1] 1.983524

这个函数以两个函数作为参数：要积分的函数和积分规则。

现在可以在小区间内添加更好的积分规则

simpson <- function(f,a,b){
  (b-a)/6*(f(a)+4*f((a+b)/2)+f(b))
}

boole <- function(f,a,b){
  pos <- function(i) a+i*(b-a)/4
  fi <- function(i) f(pos(i))
  
  (b-a)/90*
    (7*fi(0)+32*fi(1)+12*fi(2)+32*fi(3)+7*fi(4))
}

composite(sin,0,pi,n=10,rule=simpson)

## [1] 2.000007

composite(sin,0,pi,n=10,rule=boole)

## [1] 2

更普遍的牛顿科特斯规则

函数工厂：闭包

newton_cotes <- function(coef,open=FALSE){
  n <- length(coef)+open
  
  function(f,a,b){
    pos <- function(i) a+i*(b-a)/n
    points <- pos(seq.int(0,length(coef)-1))
    
    (b-a)/sum(coef)*sum(f(points)*coef)
  }
}

milne <- newton_cotes(c(2,-1,2),open=T)
composite(sin,0,pi,n=10,rule=milne)

## [1] 1.993829

问题1：函数列表

funs <- list(
  midpoint,
  trapezoid,
  simpson,
  boole
)
lapply(funs,function(f) f(sin,0,pi))

## [[1]]
## [1] 3.141593
## 
## [[2]]
## [1] 1.923607e-16
## 
## [[3]]
## [1] 2.094395
## 
## [[4]]
## [1] 1.998571

lapply(funs, composite, f=sin, a=0, b=pi, n=10)

## [[1]]
## [1] 2.008248
## 
## [[2]]
## [1] 1.983524
## 
## [[3]]
## [1] 2.000007
## 
## [[4]]
## [1] 2

问题2：根据系数列表产生函数列表

arglist <- list(
  boole = c(7,32,12,32,7),
  milne = c(2, -1, 2)
)
arglist

## $boole
## [1]  7 32 12 32  7
## 
## $milne
## [1]  2 -1  2

funs <- lapply(arglist, newton_cotes)
funs

## $boole
## function (f, a, b) 
## {
##     pos <- function(i) a + i * (b - a)/n
##     points <- pos(seq.int(0, length(coef) - 1))
##     (b - a)/sum(coef) * sum(f(points) * coef)
## }
## <bytecode: 0x0000000008783868>
## <environment: 0x000000000ac81878>
## 
## $milne
## function (f, a, b) 
## {
##     pos <- function(i) a + i * (b - a)/n
##     points <- pos(seq.int(0, length(coef) - 1))
##     (b - a)/sum(coef) * sum(f(points) * coef)
## }
## <bytecode: 0x0000000008783868>
## <environment: 0x000000000877ff08>

lapply(funs, composite, f=sin,a=0, b=pi, n=10)

## $boole
## [1] 2.001979
## 
## $milne
## [1] 1.990848

第11章：泛函

泛函functional，以函数作为输入并返回一个向量的函数。

randomise <- function(f) f(runif(1e3))

randomise(mean)

## [1] 0.4844264

randomise(mean)

## [1] 0.5140983

randomise(sum)

## [1] 492.6028

3个最常用的泛函：lapply/apply/tapply。

循环最大的缺点：表达不够清晰，不能表达更高层次的目的。

每个泛函都是为特殊任务量身定做，认识了泛函之后也就知道了为什么使用它。

以函数的方式思考并处理函数。清晰的表达我们的想法，并创建解决问题的工具。

闭包频繁地与泛函联合使用。

11.1 lapply

lapply是常见for循环的包装器，更容易地处理列表，将注意力集中在要应用的函数上。

l <- replicate(20,runif(sample(1:10,1)),simplify = F)

unlist(lapply(l, length))

##  [1]  8  2  8  6  6  8  7  3  4  8  7 10  5  9  8  7  1  2  9  1

lapply(x,f)中，x总是作为f的第一个参数，如果想改变，可以使用匿名函数

trims <- c(0,0.1,0.2,0.5)
x <- rcauchy(1000)
unlist(lapply(trims, function(trim) mean(x,trim=trim)))

## [1]  1.08999465 -0.01621282 -0.02831401 -0.04607358

11.1.1 循环模式

有三种方法对向量循环：

对每个元素循环 for(x in xs);
根据元素的数值索引循环 for(i in seq_along(xs));
根据元素的名字进行循环 for(nm in names(xs))；

方法一存储效率低，每次对向量扩展时，R不得不复制所有的现有元素

xs <- runif(1e3)
res <- c()
for(x in xs){
  res <- c(res,sqrt(x))
}

最好先为输出创建一个空向量，再向其中添加元素。即方法二。

res <- numeric(length(xs))
for(i in seq_along(xs)){
  res[i] <- sqrt(xs[i])
}

lapply的三种循环方法：

lapply(xs, function(x) {})
lapply(seq_along(xs), function(i) {})
lapply(names(xs), function(nm) {})

通常使用第一种方法，如果需要知道元素的位置或名字，应该使用第二种或第三种方法。

11.1.2 练习

第一题

使用匿名函数

trims <- c(0,0.1,0.2,0.5)
x <- rcauchy(100)

lapply(trims,function(trim) mean(x, trim = trim))

## [[1]]
## [1] -0.07787933
## 
## [[2]]
## [1] 0.02905711
## 
## [[3]]
## [1] 0.04665869
## 
## [[4]]
## [1] 0.0435092

指定了参数名x，只能对trim参数循环。

lapply(trims,mean,x=x)

## [[1]]
## [1] -0.07787933
## 
## [[2]]
## [1] 0.02905711
## 
## [[3]]
## [1] 0.04665869
## 
## [[4]]
## [1] 0.0435092

第二题

scale01 <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1])/ (rng[2] - rng[1])
}

m <- lapply(mtcars, scale01)
as.data.frame(m) %>% head()

##         mpg cyl      disp        hp      drat        wt      qsec vs am
## 1 0.4510638 0.5 0.2217511 0.2049470 0.5253456 0.2830478 0.2333333  0  1
## 2 0.4510638 0.5 0.2217511 0.2049470 0.5253456 0.3482485 0.3000000  0  1
## 3 0.5276596 0.0 0.0920429 0.1448763 0.5023041 0.2063411 0.4892857  1  1
## 4 0.4680851 0.5 0.4662010 0.2049470 0.1474654 0.4351828 0.5880952  1  0
## 5 0.3531915 1.0 0.7206286 0.4346290 0.1797235 0.4927129 0.3000000  0  0
## 6 0.3276596 0.5 0.3838863 0.1872792 0.0000000 0.4978266 0.6809524  1  0
##   gear      carb
## 1  0.5 0.4285714
## 2  0.5 0.4285714
## 3  0.5 0.0000000
## 4  0.0 0.0000000
## 5  0.0 0.1428571
## 6  0.0 0.0000000

head(CO2)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

c <- lapply(CO2, function(x) if(is.numeric(x)) scale01(x) else x)
as.data.frame(c) %>% head()

##   Plant   Type  Treatment       conc    uptake
## 1   Qn1 Quebec nonchilled 0.00000000 0.2195767
## 2   Qn1 Quebec nonchilled 0.08839779 0.6005291
## 3   Qn1 Quebec nonchilled 0.17127072 0.7169312
## 4   Qn1 Quebec nonchilled 0.28176796 0.7804233
## 5   Qn1 Quebec nonchilled 0.44751381 0.7301587
## 6   Qn1 Quebec nonchilled 0.64088398 0.8333333

第三题

formulas <- list(
  mpg ~ disp,
  mpg ~ I(1 / disp),
  mpg ~ disp + wt,
  mpg ~ I(1 / disp) +wt
)

(res1 <- lapply(formulas, lm, mtcars))

## [[1]]
## 
## Call:
## FUN(formula = X[[i]], data = ..1)
## 
## Coefficients:
## (Intercept)         disp  
##    29.59985     -0.04122  
## 
## 
## [[2]]
## 
## Call:
## FUN(formula = X[[i]], data = ..1)
## 
## Coefficients:
## (Intercept)    I(1/disp)  
##       10.75      1557.67  
## 
## 
## [[3]]
## 
## Call:
## FUN(formula = X[[i]], data = ..1)
## 
## Coefficients:
## (Intercept)         disp           wt  
##    34.96055     -0.01772     -3.35083  
## 
## 
## [[4]]
## 
## Call:
## FUN(formula = X[[i]], data = ..1)
## 
## Coefficients:
## (Intercept)    I(1/disp)           wt  
##      19.024     1142.560       -1.798

第四题

bootstraps <- lapply(1:10, function(i) {
  rows <- sample(1:nrow(mtcars),rep=T)
  mtcars[rows,]
})

(res2 <- lapply(bootstraps, lm, formula = mpg~disp))

## [[1]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    30.22549     -0.04199  
## 
## 
## [[2]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    32.00998     -0.04744  
## 
## 
## [[3]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    28.81025     -0.03968  
## 
## 
## [[4]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    30.81237     -0.04381  
## 
## 
## [[5]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    28.10920     -0.03607  
## 
## 
## [[6]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    29.77142     -0.04229  
## 
## 
## [[7]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    29.02102     -0.04103  
## 
## 
## [[8]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##      30.985       -0.047  
## 
## 
## [[9]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##     30.6842      -0.0475  
## 
## 
## [[10]]
## 
## Call:
## FUN(formula = ..1, data = X[[i]])
## 
## Coefficients:
## (Intercept)         disp  
##    29.94205     -0.04309

第五题

rsq <- function(mod) summary(mod)$r.squared

lapply(res1,rsq)

## [[1]]
## [1] 0.7183433
## 
## [[2]]
## [1] 0.8596865
## 
## [[3]]
## [1] 0.7809306
## 
## [[4]]
## [1] 0.8838038

lapply(res2, rsq)

## [[1]]
## [1] 0.7044791
## 
## [[2]]
## [1] 0.7707638
## 
## [[3]]
## [1] 0.5978648
## 
## [[4]]
## [1] 0.6325821
## 
## [[5]]
## [1] 0.7424659
## 
## [[6]]
## [1] 0.7413616
## 
## [[7]]
## [1] 0.7956857
## 
## [[8]]
## [1] 0.785494
## 
## [[9]]
## [1] 0.6139113
## 
## [[10]]
## [1] 0.7630576

11.2 lapply的相似函数

常见的循环模式已经在现有的基本泛函中实现了。

一旦已经掌握现有的泛函，下一步就是编写自己的泛函，将代码中出现了相同的循环模式提取出来，使用泛函来替代。

11.2.1 向量输出 sapply和vapply

sapply通过猜测设定输出类型；
vapply通过附加参数来设定输出类型。

sapply(mtcars,is.numeric)

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

vapply(mtcars, is.numeric, FUN.VALUE = logical(1))

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

如果函数返回不同类型或不同长度的结果，sapply返回列表，vapply抛出错误。

sapply更适合在交互式分析中。

df <- data.frame(x=1:10,y=letters[1:10])
sapply(df, class)

##         x         y 
## "integer"  "factor"

vapply(df, class, FUN.VALUE = character(1))

##         x         y 
## "integer"  "factor"

df <- data.frame(x=1:10,y=Sys.time()+1:10)
sapply(df, class)

## $x
## [1] "integer"
## 
## $y
## [1] "POSIXct" "POSIXt"

#vapply(df, class, FUN.VALUE = character(1))

sapply是对lapply的简单包装；
vapply是将结果赋值给适当类型的向量（矩阵）；

两者与lapply的输出不同。

11.2.2 多重输入Map和mapply

lapply只有一个参数可以改变，其他参数固定。

如果要用lapply实现加权平均，看起来有些笨拙

xs <- replicate(5,runif(10),simplify = F)
ws <- replicate(5,rpois(10,5)+1,simplify = F)
lapply(seq_along(xs), function(i) {
  weighted.mean(xs[[i]], ws[[i]])
})

## [[1]]
## [1] 0.5909633
## 
## [[2]]
## [1] 0.5104818
## 
## [[3]]
## [1] 0.4697772
## 
## [[4]]
## [1] 0.3998523
## 
## [[5]]
## [1] 0.3409802

Map是lapply的变体，其中所有参数都可以改变

Map(weighted.mean,xs,ws)

## [[1]]
## [1] 0.5909633
## 
## [[2]]
## [1] 0.5104818
## 
## [[3]]
## [1] 0.4697772
## 
## [[4]]
## [1] 0.3998523
## 
## [[5]]
## [1] 0.3409802

函数是Map的第一个参数

当需要对两个列表进行同时处理时，Map很有用。

Map是simplify=FALSE的mapply，总能给出想要结果。

11.2.4 并行化

lapply的顺序可以是任意的，所以可以分配给多个CPU实现并行计算。

parallel::mclapply和parallel::mcMap （不能在windows上运行）

11.2.5 练习

第一题

vapply(mtcars,sd,numeric(1))

##         mpg         cyl        disp          hp        drat          wt 
##   6.0269481   1.7859216 123.9386938  68.5628685   0.5346787   0.9784574 
##        qsec          vs          am        gear        carb 
##   1.7869432   0.5040161   0.4989909   0.7378041   1.6152000

vapply(CO2,function(x) if(is.numeric(x)) sd(x) else NA,numeric(1))

##     Plant      Type Treatment      conc    uptake 
##        NA        NA        NA 295.92412  10.81441

第三题

trials <- replicate(
  100,
  t.test(rpois(10,10),rpois(7,10)),
  simplify = F
)

sapply(trials,function(x) x[["p.value"]])

##   [1] 0.650643251 0.153491856 0.544047667 0.908921663 0.110744868
##   [6] 0.895083844 0.217034617 0.212959030 0.368265969 0.704764482
##  [11] 0.733391414 0.691808232 0.776493427 0.842816718 0.963672637
##  [16] 0.183047665 0.653482561 0.850390177 0.811199913 0.252423907
##  [21] 0.631798644 0.441093217 0.751151984 0.235394291 0.483803662
##  [26] 0.480660933 0.041528097 0.357001731 0.161541833 0.800836078
##  [31] 0.705894815 0.678569832 0.110825635 0.580227306 0.302214486
##  [36] 0.087091388 0.836519894 0.016202557 0.569533890 0.541240680
##  [41] 0.698658596 0.212445590 0.204742195 0.531493571 0.356480695
##  [46] 0.290108237 0.555742575 0.368632904 0.337285669 0.095670439
##  [51] 0.254081244 0.378855296 0.051037009 0.306723163 0.077182628
##  [56] 0.797770515 0.857599194 0.077140840 0.920514799 0.163912430
##  [61] 0.897718466 0.143666217 0.885375975 0.794168206 0.406766517
##  [66] 0.822398592 0.488747043 0.878946703 0.182315725 0.203060059
##  [71] 0.455040038 0.436871145 0.970322414 0.019242829 0.163338035
##  [76] 0.571548256 0.377801368 0.001135937 0.782087669 0.382289087
##  [81] 0.057595977 0.555084647 0.433972765 0.195674923 0.864260609
##  [86] 0.114759543 0.386882574 0.873645937 0.526725879 0.749292688
##  [91] 0.011965459 0.077420362 0.952657664 0.263079820 0.681106119
##  [96] 0.094403630 0.670419821 0.293979046 0.774170389 0.822929043

sapply(trials,'[[',"p.value")

##   [1] 0.650643251 0.153491856 0.544047667 0.908921663 0.110744868
##   [6] 0.895083844 0.217034617 0.212959030 0.368265969 0.704764482
##  [11] 0.733391414 0.691808232 0.776493427 0.842816718 0.963672637
##  [16] 0.183047665 0.653482561 0.850390177 0.811199913 0.252423907
##  [21] 0.631798644 0.441093217 0.751151984 0.235394291 0.483803662
##  [26] 0.480660933 0.041528097 0.357001731 0.161541833 0.800836078
##  [31] 0.705894815 0.678569832 0.110825635 0.580227306 0.302214486
##  [36] 0.087091388 0.836519894 0.016202557 0.569533890 0.541240680
##  [41] 0.698658596 0.212445590 0.204742195 0.531493571 0.356480695
##  [46] 0.290108237 0.555742575 0.368632904 0.337285669 0.095670439
##  [51] 0.254081244 0.378855296 0.051037009 0.306723163 0.077182628
##  [56] 0.797770515 0.857599194 0.077140840 0.920514799 0.163912430
##  [61] 0.897718466 0.143666217 0.885375975 0.794168206 0.406766517
##  [66] 0.822398592 0.488747043 0.878946703 0.182315725 0.203060059
##  [71] 0.455040038 0.436871145 0.970322414 0.019242829 0.163338035
##  [76] 0.571548256 0.377801368 0.001135937 0.782087669 0.382289087
##  [81] 0.057595977 0.555084647 0.433972765 0.195674923 0.864260609
##  [86] 0.114759543 0.386882574 0.873645937 0.526725879 0.749292688
##  [91] 0.011965459 0.077420362 0.952657664 0.263079820 0.681106119
##  [96] 0.094403630 0.670419821 0.293979046 0.774170389 0.822929043

11.3 操作矩阵和数据框

11.3.1 矩阵和数组运算

apply是sapply的变体，用一个数来描述矩阵或数组的每一列或每一行的汇总操作。

sweep对统计汇总值扫描，下例将量纲调整到0~1之间

x <- matrix(rnorm(20,0,10),nrow = 4)
x1 <- sweep(x,1,apply(x, 1, min),`-`)
x2 <- sweep(x1,1,apply(x1, 1, max),`/`)

outer接受多个向量输入，并创建一个矩阵或数组输出，其中输入函数对输入的所有可能组合。

outer(1:3,1:10,"*")

##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,]    1    2    3    4    5    6    7    8    9    10
## [2,]    2    4    6    8   10   12   14   16   18    20
## [3,]    3    6    9   12   15   18   21   24   27    30

outer(1:5,1:5,FUN="paste",sep=",")

##      [,1]  [,2]  [,3]  [,4]  [,5] 
## [1,] "1,1" "1,2" "1,3" "1,4" "1,5"
## [2,] "2,1" "2,2" "2,3" "2,4" "2,5"
## [3,] "3,1" "3,2" "3,3" "3,4" "3,5"
## [4,] "4,1" "4,2" "4,3" "4,4" "4,5"
## [5,] "5,1" "5,2" "5,3" "5,4" "5,5"

11.3.2 组应用tapply

tapply实际上是split+sapply

tapply(mtcars$mpg,mtcars$cyl,length)

##  4  6  8 
## 11  7 14

split(mtcars$mpg,mtcars$cyl)

## $`4`
##  [1] 22.8 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
## 
## $`6`
## [1] 21.0 21.0 21.4 18.1 19.2 17.8 19.7
## 
## $`8`
##  [1] 18.7 14.3 16.4 17.3 15.2 10.4 10.4 14.7 15.5 15.2 13.3 19.2 15.8 15.0

split(mtcars$mpg,mtcars$cyl) %>% 
  sapply(length)

##  4  6  8 
## 11  7 14

11.3.3 plyr包

《The Split-Apply-Combine Strategy for Data Analysis》

11.4 列表操作

11.4.1 Reduce

Reduce通过地柜调用函数f，每次有两个参数，首先使用f对x中前两个元素计算，得到结果，再使用结果和x第三个元素进行计算。

Map是对循环的简化；
Reduce是对地柜的简化；

l <- replicate(5,sample(1:10,15,replace = T),simplify = F)
str(l)

## List of 5
##  $ : int [1:15] 2 7 2 3 4 4 3 5 7 7 ...
##  $ : int [1:15] 1 9 3 1 2 5 4 7 3 4 ...
##  $ : int [1:15] 2 8 2 3 7 8 5 9 1 6 ...
##  $ : int [1:15] 9 4 5 7 3 2 2 7 3 6 ...
##  $ : int [1:15] 6 5 10 8 9 4 8 8 1 1 ...

Reduce(intersect,l)

## [1] 2 7 4 5 6

11.4.2 判断泛函

Filter 只选择满足判断条件的元素；
Find 返回满足判断条件的第一个元素；
Position 返回满足判断条件的第一个元素位置；
where 根据列表返回一个逻辑向量；

where <- function(f,x) {
  vapply(x, f, FUN.VALUE = logical(1))
}

df <- data.frame(x=1:3,y=letters[1:3])

where(is.factor,df)

##     x     y 
## FALSE  TRUE

Filter(is.factor,df)

##   y
## 1 a
## 2 b
## 3 c

Find(is.factor,df)

## [1] a b c
## Levels: a b c

Position(is.factor,df)

## [1] 2

11.4.3 练习

第一题

complete.cases(mtcars)

##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [15] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [29] TRUE TRUE TRUE TRUE

where(function(x) sum(is.na(x))==0,mtcars)

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE

第二题

numsummary <- function(df){
  f <- Filter(is.numeric,df)
  vapply(f, summary, FUN.VALUE = numeric(6))
}

numsummary(CO2)

##         conc  uptake
## Min.      95  7.7000
## 1st Qu.  175 17.9000
## Median   350 28.3000
## Mean     435 27.2131
## 3rd Qu.  675 37.1250
## Max.    1000 45.5000

第三题

head(CO2)

##   Plant   Type  Treatment conc uptake
## 1   Qn1 Quebec nonchilled   95   16.0
## 2   Qn1 Quebec nonchilled  175   30.4
## 3   Qn1 Quebec nonchilled  250   34.8
## 4   Qn1 Quebec nonchilled  350   37.2
## 5   Qn1 Quebec nonchilled  500   35.3
## 6   Qn1 Quebec nonchilled  675   39.2

Position(is.numeric,CO2)

## [1] 4

where(is.numeric,CO2) %>% which()

##   conc uptake 
##      4      5

which(c(F,F,F,T,T))

## [1] 4 5

Position是which的泛函版本

CO2[where(is.numeric,CO2)] %>% head()

##   conc uptake
## 1   95   16.0
## 2  175   30.4
## 3  250   34.8
## 4  350   37.2
## 5  500   35.3
## 6  675   39.2

Filter(is.numeric,CO2) %>% head()

##   conc uptake
## 1   95   16.0
## 2  175   30.4
## 3  250   34.8
## 4  350   37.2
## 5  500   35.3
## 6  675   39.2

where得到T,F；Filter直接选择子集

第四题

Any <- function(f,l){
  sum(where(f,l))!=0
}

Any(is.numeric,mtcars)

## [1] TRUE

Any(is.numeric,CO2)

## [1] TRUE

All <- function(f,l) {
  sum(where(f,l))==length(l)
}

All(is.numeric,mtcars)

## [1] TRUE

All(is.numeric,CO2)

## [1] FALSE

11.5 数学泛函

integrate 计算曲线下面积
uniroot 计算根集
optimise 计算最高点和最低点位置

integrate(sin,0,pi)

## 2 with absolute error < 2.2e-14

uniroot(sin,pi*c(1/2,3/2))

## $root
## [1] 3.141593
## 
## $f.root
## [1] 1.224606e-16
## 
## $iter
## [1] 2
## 
## $init.it
## [1] NA
## 
## $estim.prec
## [1] 6.103516e-05

optimise(sin,c(0,2*pi))

## $minimum
## [1] 4.712391
## 
## $objective
## [1] -1

optimise(sin,c(0,pi),maximum = T)

## $maximum
## [1] 1.570796
## 
## $objective
## [1] 1

11.6 应该保留的循环

原位修改
递归函数
while循环

第12章函数运算符

泛函：提取循环使用的通用模式；
函数运算符，提取匿名函数使用的通用模式；

四种重要类型的函数运算符：

行为；
输入；
输出；
组合；

对多个函数进行组合而不是对参数进行组合

12.1 行为函数运算符

行为函数运算符不会改变函数的输入和输出，但给函数添加一些附加行为。

学习三个：

1，增加延迟来避免服务器被请求淹没； 2. 每n次调用将信息输出到控制台，帮助检查长时间运行的进程； 3. 缓存上一步计算结果来改善性能。

urls <- list(
  a1 <- "https://cran.r-project.org/doc/manuals/r-patched/R-intro.pdf",
  a2 <- "https://cran.r-project.org/doc/manuals/r-patched/R-data.pdf",
  a3 <- "https://cran.r-project.org/doc/manuals/r-release/R-lang.pdf",
  a4 <- "https://cran.r-project.org/doc/manuals/r-release/R-ints.pdf"
)

download_file <- function(url,...) {
  download.file(url, basename(url), ...)
}
lapply(urls, download_file)

想要添加行为：

每10个URL输出一个点，知道程序还在运行；
两个请求之间增加一个小延迟；

如果用for循环写

i <- 1
for(url in urls) {
  i <- i+1
  if (i %% 10 == 0) cat(".")
  Sys.delay(1)
  download_file(url)
}

创建函数运算符，对行为封装

lapply(urls, dot_every(10, delay_by(1, download_file)))

可以很直接的实现delay_by

delay_by <- function(delay, f){
  function(...) {
    Sys.sleep(delay)
    f(...)
  }
}

system.time(runif(100))

## 用户 系统 流逝 
##    0    0    0

system.time(delay_by(1, runif)(100))

## 用户 系统 流逝 
## 0.00 0.00 1.02

dot_every()需要一个计数器

dot_every <- function(n, f) {
  i <- 1
  function(...) {
    if (i %% n == 0) cat(".")
    i <<- i+1
    f(...)
  }
}

x <- lapply(1:100,runif)

x <- lapply(1:100,dot_every(4,runif))

## .........................

在每个函数运算符中，都把函数设置为最后一个参数，这样在组合多个函数运算符时，使得代码更容易读。

12.1.1 缓存

library(memoise)

slow_function <- function(x) {
  Sys.sleep(1)
  10
}
system.time(slow_function())

## 用户 系统 流逝 
## 0.00 0.00 1.02

system.time(slow_function())

## 用户 系统 流逝 
## 0.00 0.00 1.01

fast_function <- memoise(slow_function)
system.time(fast_function())

## 用户 系统 流逝 
## 0.00 0.00 1.17

system.time(fast_function())

## 用户 系统 流逝 
## 0.06 0.00 0.06

缓存是用内存换速度。一个被缓存的函数可以运行的非常快，因为它存储以前的输入和输出，使用更多的内存。

fib <- function(n) {
  if (n<2) return(1)
  fib(n-2)+fib(n-1)
}

system.time(fib(33))

## 用户 系统 流逝 
## 6.49 0.00 6.49

system.time(fib(34))

##  用户  系统  流逝 
## 10.42  0.00 10.47

fib2 <- memoise(fib)
system.time(fib2(33))

## 用户 系统 流逝 
## 6.49 0.00 6.51

system.time(fib2(34))

##  用户  系统  流逝 
## 10.39  0.02 10.44

12.1.2 捕获函数调用

ignore <- function(...) NULL
tee <- function(f, on_input = ignore, on_output = ignore){
  function(...) {
    on_input(...)
    output <- f(...)
    on_output(output)
    output
  }
}

可以使用tee来查看泛函uniroot的内部，如何通过迭代得到结果。

g <- function(x) cos(x) - x
zero <- uniroot(g, c(-5,5))
zero

## $root
## [1] 0.7390853
## 
## $f.root
## [1] -2.603993e-07
## 
## $iter
## [1] 6
## 
## $init.it
## [1] NA
## 
## $estim.prec
## [1] 6.103516e-05

show_x <- function(x, ... ) cat(sprintf("%+.08f",x),"\n")
zero <- uniroot(tee(g, on_input = show_x), c(-5,5))

## -5.00000000 
## +5.00000000 
## +0.28366219 
## +0.87520341 
## +0.72298040 
## +0.73863091 
## +0.73908529 
## +0.73902425 
## +0.73908529

zero <- uniroot(tee(g, on_output = show_x), c(-5,5))

## +5.28366219 
## -4.71633781 
## +0.67637474 
## -0.23436269 
## +0.02685676 
## +0.00076012 
## -0.00000026 
## +0.00010189 
## -0.00000026

12.2.3 惰性

funs <- list(mean=mean,sum=sum)
funs_m <- lapply(funs, delay_by, delay=0.1)

funs_m$mean(1:10)

## [1] 5.5

funs_m$sum(1:10)

## [1] 55

惰性问题好像不存在了，不用刻意使用force了。

12.1.4 练习

ftimestep <- function(f){
  function(...){
    cat(as.character(Sys.time()))
    cat("\n")
    res <- f(...)
    res
  }
}

ftimestep(sum)(1:10)

## 2018-11-08 11:32:21

## [1] 55

ftimestep(mean)(1:100)

## 2018-11-08 11:32:21

## [1] 50.5

12.2 输出函数运算符

Negate <- function(f) {
  force(f)
  function(...) !f(...)
}

Negate(is.null)(NULL)

## [1] FALSE

failwith <- function(default = NULL, f, quiet = FALSE) {
  force(f)
  function(...) {
    out <- default
    try(out <- f(...), silent = quiet)
    out
  }
}

failwith(f=log)("a")

## NULL

12.3 输入函数运算符

12.3.1 预填充函数参数：拒不函数应用

library(pryr)

## 
## 载入程辑包：'pryr'

## The following object is masked _by_ '.GlobalEnv':
## 
##     where

f <- function(a) g(a, b=1)
compact <- function(x) Filter(Negate(is.null), x)
Map(function(x,y) f(x, y, zs), xs, ys)

f <- partial(g, b=1)
compact <- partial(Filter, Negate(is.null))
Map(partial(f, zs = zs), xs, ys)

对于函数列表

funs2 <- list(
  sum = function(...) sum(..., na.rm = T),
  mean = function(...) mean(..., na.rm = T),
  median = function(...) median(..., na.rm = T)
)

改写成

funs2 <- list(
  sum = partial(sum, na.rm = T),
  mean = partial(mean, na.rm = T),
  median= partial(median, na.rm = T)
)

12.3.2 改变输入类型

base::Vectorize() 可以将一个标量函数转换成一个向量函数；
splat将接收多个参数的函数转换成只接收一个参数列表的函数；
plyr::colwise() 将向量函数转换成处理数据框的函数；

12.4 组合函数运算符

plyr::each接收一个向量化函数的列表并将它们组合为一个函数

summarise <- plyr::each(mean,sd,median)
summarise(1:10)

##    mean      sd  median 
## 5.50000 3.02765 5.50000

12.4.1 函数复合

sapply(mtcars, function(x) length(unique(x)))

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##   25    3   27   22   22   29   30    2    2    3    6

compose <- function(f,g) {
  function(...) f(g(...))
}

library(pryr)
sapply(mtcars,compose(length,unique))

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##   25    3   27   22   22   29   30    2    2    3    6

"%o%" <- compose
sapply(mtcars, length %o% unique)

##  mpg  cyl disp   hp drat   wt qsec   vs   am gear carb 
##   25    3   27   22   22   29   30    2    2    3    6

sqrt(1+8)

## [1] 3

compose(sqrt, `+`)(1+8)

## [1] 3

(sqrt %o% `+`)(1+8)

## [1] 3

`+`(1,8) %>% sqrt

## [1] 3

square <- function(x) x^2
deviation <- function(x) x - mean(x)
sd2 <- sqrt %o% mean %o% square%o% deviation
sd2(1:10)

## [1] 2.872281

12.4.2 逻辑判断和布尔代数

and <- function(f1, f2) {
  force(f1); force(f2)
  function(...) {
    f1(...) && f2(...)
  }
}


or <- function(f1, f2) {
  force(f1); force(f2)
  function(...) {
    f1(...) || f2(...)
  }
}

not <- function(f) {
  force(f)
  function(...) {
    !f(...)
  }
}

Filter(or(is.character,is.factor),iris) %>% head()

##   Species
## 1  setosa
## 2  setosa
## 3  setosa
## 4  setosa
## 5  setosa
## 6  setosa

Advanced R(4)——FP

ZhiYuan Wang

2018/11/08