代码之家  ›  专栏  ›  技术社区  ›  Guido Berning

用purrr代替sapply计算移动平均值

  •  1
  • Guido Berning  · 技术社区  · 7 年前

    pmap_dbl但结果错误-比较ra和lt&燃气轮机;purrr_ra1。

    map_dbl,但产生了一个错误。

    “mutate_impl(.data,dots)中出错: 柱 purr_ra2

    像zoo和RcppRoll这样具有滚动/窗口操作的包正在考虑窗口的“左”、“右”、“中”对齐,在我的情况下不是这样。

    有人能帮忙吗?

    library(tidyverse)
      df <- tribble(            
        ~Day,   ~val,   ~bw,    ~fw,
        '01-01-2020',   0,  8,  4,
        '02-01-2020',   73.5,   8,  4,
        '03-01-2020',   540,    8,  4,
        '04-01-2020',   0,  8,  4,
        '05-01-2020',   57, 8,  4,
        '06-01-2020',   20, 8,  4,
        '07-01-2020',   690,    8,  4,
        '08-01-2020',   40, 8,  4,
        '09-01-2020',   38, 8,  4,
        '10-01-2020',   60, 8,  4,
        '11-01-2020',   0,  8,  4,
        '12-01-2020',   40, 8,  4,
        '13-01-2020',   40, 8,  4,
        '14-01-2020',   225,    8,  4,
        '15-01-2020',   77, 8,  4,
        '16-01-2020',   0,  8,  4,
        '17-01-2020',   153,    8,  4,
        '18-01-2020',   950,    8,  4,
        '19-01-2020',   124,    8,  4,
        '20-01-2020',   80, 8,  4,
        '21-01-2020',   0,  8,  4,
        '22-01-2020',   80, 8,  4,
        '23-01-2020',   766.5,  8,  4,
        '24-01-2020',   334,    8,  4,
        '25-01-2020',   660,    8,  4,
        '26-01-2020',   120,    8,  4,
        '27-01-2020',   545,    8,  4,
        '28-01-2020',   145,    8,  4,
        '29-01-2020',   38.5,   8,  4,
        '30-01-2020',   20, 8,  4,
        '31-01-2020',   760,    8,  4)
      df <- df %>% mutate(Day = as.Date(Day,"%d-%m-%Y"),
                          fw = as.integer(fw),
                          bw = as.integer(bw))
      df <- df %>% mutate(ra = sapply(seq_along(df$Day), function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
      df <- df %>% mutate(purrr_ra1 = pmap_dbl(., function(x,val, Day, fw, bw, ...) mean(val[Day <= Day[x] + fw[x] & Day > Day[x] - bw[x]])))
      # df <- df %>% mutate(purrr_ra2 = map_dbl(., function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
    
    3 回复  |  直到 7 年前
        1
  •  1
  •   G. Grothendieck    7 年前

    事实上 rollapply

    答案1使用单个偏移向量,适用于问题中每行偏移相同的情况。

    答案2比这里需要的更具一般性,但如果偏移量因行而异,则将非常有用。

    width=list(...) 通过在输入两侧填充适当数量的NAs来实现功能。

    library(zoo)
    
    # baseline for comparison - from question
    ans0 <- sapply(seq_along(df$Day), function(x) {
     mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])
    })
    
    # 1
    ans1 <- rollapply(df$val, list(seq(-7, 4)), mean, partial = TRUE)
    
    # 2
    w <- Map(seq, -df$bw + 1, df$fw)
    ans2 <- rollapply(df$val, w, mean, partial = TRUE)
    
    # 3
    ans3 <- rollapply(c(rep(NA, 7), df$val, rep(NA, 4)), 12, mean, na.rm = TRUE)
    
    identical(ans0, ans1)
    ## [1] TRUE
    
    identical(ans0, ans2)
    ## [1] TRUE
    
    identical(ans0, ans3)
    ## [1] TRUE
    

    df

    df <- structure(list(Day = structure(c(18262, 18263, 18264, 18265, 
    18266, 18267, 18268, 18269, 18270, 18271, 18272, 18273, 18274, 
    18275, 18276, 18277, 18278, 18279, 18280, 18281, 18282, 18283, 
    18284, 18285, 18286, 18287, 18288, 18289, 18290, 18291, 18292
    ), class = "Date"), val = c(0, 73.5, 540, 0, 57, 20, 690, 40, 
    38, 60, 0, 40, 40, 225, 77, 0, 153, 950, 124, 80, 0, 80, 766.5, 
    334, 660, 120, 545, 145, 38.5, 20, 760), bw = c(8L, 8L, 8L, 8L, 
    8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 
    8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), fw = c(4L, 4L, 4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L)), .Names = c("Day", 
    "val", "bw", "fw"), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
    -31L))
    
        2
  •  1
  •   Davis Vaughan    7 年前

    对于这个特定的问题,我们可以利用常数偏移并使用 tidyquant

    library(tidyquant)
    
    df$ra2 <- df %>%
      tq_transmute(val, lag.xts, k = -4:7) %>%
      select(-Day) %>%
      rowMeans(na.rm = TRUE)
    

    假设 df 如原问题所示。对于灵活的偏移,我喜欢@g-grothendieck的方法

        3
  •  0
  •   Guido Berning    7 年前

    这是历史和基准。

    你的解决方案是 最好的!

      library(tidyverse)
      library(zoo)
      library(microbenchmark)
    
      # df as in the initial coding       
    
      # history
      # 1. version with lapply
      calc_ra = function(x, df) {
        begin_date = x - df$bw[df$Day == x]
        end_date = x + df$fw[df$Day == x]
        res <- df %>% filter(Day > begin_date &
                                    Day <= end_date) %>%
          summarize(mv = mean(val))
        return(res)
      }
      ra_lapply <- function(df) {
        df <- data.frame(df, ra_lapply = unlist(lapply(df$Day, function(x)
          calc_ra(x, df))))
      }
      # 2. version with zoo 3 month ago
      ra_rollapply1 <- function(df){      
        df <- df %>% mutate(w1 = as.double.difftime(bw - 1 + fw))
        df <- df %>% mutate(ra_rollapply1 = rollapply(val, w1, mean, partial = TRUE))
      }
      # 3. version with sapply
      ra_sapply <- function(df){
        df <-
          df %>% mutate(ra_sapply = sapply(seq_along(df$Day), function(x)
            mean(df$val[df$Day <= df$Day[x] + df$fw[x] &
                          df$Day > df$Day[x] - df$bw[x]])))
      }
      # 4. version from yesterday
      ra_map_dbl <- function(df){
        df <- df %>% mutate(ra_map_dbl = map_dbl(seq_along(df$Day), function(x) mean(df$val[df$Day <= df$Day[x] + df$fw[x] & df$Day > df$Day[x] - df$bw[x]])))
      }
      # 5. version with zoo from yesterday
      ra_rollapply2 <- function(df){
        w <- Map(seq, -df$bw + 1, df$fw)
        df <- df %>% mutate(ra_rollapply2 = rollapply(val, w, mean, partial = TRUE))
      }
    
      df1 <- ra_lapply(df)
      df2 <- ra_rollapply1(df1)
      df3 <- ra_sapply(df1)
      df4 <- ra_map_dbl(df1)
      df5 <- ra_rollapply2(df1)
      identical(df2$ra_lapply, df2$ra_rollapply1)
      [1] FALSE
      identical(df3$ra_lapply, df3$ra_sapply)
      [1] TRUE
      identical(df4$ra_lapply, df4$ra_map_dbl)
      [1] TRUE
      identical(df5$ra_lapply, df5$ra_rollapply2)
      [1] TRUE
    
      res <- microbenchmark(
        ra_lapply(df), 
        ra_rollapply1(df),
        ra_sapply(df), 
        ra_map_dbl(df), 
        ra_rollapply2(df), 
        times=1000L)
    
      print(res)
    
    Unit: milliseconds
                  expr        min         lq       mean     median         uq        max neval
         ra_lapply(df) 104.205800 111.077701 119.316653 113.290395 116.749113 287.685832  1000
     ra_rollapply1(df)   4.318322   4.606702   5.140784   4.744533   5.017736  17.593661  1000
         ra_sapply(df)  15.383019  16.301282  17.992554  16.738366  18.629451  83.400164  1000
        ra_map_dbl(df)  15.418707  16.352354  17.965034  16.823075  18.628220 106.660109  1000
     ra_rollapply2(df)   2.629061   2.825758   3.229295   2.926465   3.099371   9.891077  1000