代码之家  ›  专栏  ›  技术社区  ›  Nicolas2

最快的方法来重新编码范围的一年中的一天到一个月和计数?

  •  4
  • Nicolas2  · 技术社区  · 6 年前

    我有一个包含两组天数的数据集(一天被编码为一年中的天数)。对于每一行,我要计算这些范围对应的每月总天数。

    d <- data.frame(deb = c(1, 32, 90, 91), fin = c(31, 59, 91, 91),
                    deb2 = c(50, 0, 0, 0), fin2 = c(60, 0, 0, 0))
    
    d
    #  deb fin deb2 fin2
    #1   1  31   50   60
    #2  32  59    0    0
    #3  90  91    0    0
    #4  91  91    0    0
    

    例如,对于第1行,第一个范围(从“deb”到“fin”)从第1天到第31天,第二个范围从第50天到第60天。

    在计算了这两个范围内每月的天数之后,我希望得到如下结果:

    #     jan feb  mar
    #[1,]  31  10    1
    #[2,]   0  28    0
    #[3,]   0   0    2
    #[4,]   0   0    1
    

    (NAs而不是零不是问题)

    f1<-function(deb,fin,deb2,fin2,...) {
      f<-factor(c(deb:fin,deb2:fin2))
      levels(f)<-list(jan=1:31,feb=32:59,mar=60:91)
      table(f)
    }
    g1 <- function() do.call(rbind,d %>% pmap(f1))
    
    K <- vector(10,mode="character")
    K[1:31] <- "jan"; K[32:59] <- "feb"; K[60:91] <- "mar"
    f2 <- Vectorize(function(deb,fin,deb2,fin2) table(c(K[deb:fin],K[deb2:fin2])))
    g2 <- function() do.call(bind_rows,f2(d$deb,d$fin,d$deb2,d$fin2))
    
    L <- K
    names(L) <- 1:91
    f3 <- Vectorize(function(deb,fin,deb2,fin2) c(L[deb:fin],L[deb2:fin2]))
    g3 <- function() {
      as.matrix(do.call(bind_rows,f3(d$deb,d$fin,d$deb2,d$fin2))) -> m
      z <- unlist(map(list("jan","feb","mar"),
                       function(y) apply(m,1,function(x) sum(x==y,na.rm=TRUE))))
      dim(z)<-c(nrow(d),3)
      z
    

    }

    更新

    firstOfMths <- seq(as.Date("2018-01-01"), as.Date("2019-01-01"), by="month")
    daysPerMth <- c(1L, cumsum(as.integer(diff(firstOfMths))))
    chinsoon12 <- function() 
      t(apply(d, 1, function(x)
          table(cut(c(x["deb"]:x["fin"],x["deb2"]:x["fin2"]), daysPerMth, labels=month.abb, include.lowest=TRUE, right=TRUE))
    

    N <- 500
    d<-data.frame(deb=rep(c(1,32,90,91),N),fin=rep(c(31,59,91,91),N),deb2=rep(c(50,0,0,0),N),fin2=rep(c(60,0,0,0),N))
    microbenchmark(g1(),g2(),g3(),chinsoon12())
    #Unit: milliseconds
    # expr              min       lq     mean   median       uq      max neval
    # g1()         571.3890 615.1020 649.7619 639.6632 662.4808 976.9566   100
    # g2()         306.7141 341.3056 360.9687 353.1227 373.8194 505.0882   100
    # g3()         282.2767 304.4331 320.4908 314.2377 325.8846 543.4680   100
    # chinsoon12() 429.7627 469.6998 500.6289 488.5176 512.0520 729.0995   100
    
    2 回复  |  直到 6 年前
        1
  •  1
  •   Henrik plannapus    6 年前

    使用 findInterval Map table

    # create breaks to be used in findInterval
    b <- <- as.numeric(format(seq(as.Date("2018-01-01"), as.Date("2018-12-31"), by = "month"), "%j"))
    
    # use Map to expand the day of year ranges by row
    # use findInterval to convert day of year to month number
    # use the month numbers to index month.abb 
    l <- Map(function(from, to, from2, to2) month.abb[findInterval(c(from:to, from2:to2), b)], d$deb, d$fin, d$deb2, d$fin2)
    
    # create a row index
    i <- rep(1:nrow(d), lengths(l))
    
    # use table to get a contigency table of row indices and months
    table(i, factor(unlist(l), levels = month.abb))
    # i   Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
    #   1  31  10   1   0   0   0   0   0   0   0   0   0
    #   2   0  28   0   0   0   0   0   0   0   0   0   0
    #   3   0   0   1   1   0   0   0   0   0   0   0   0
    #   4   0   0   0   1   0   0   0   0   0   0   0   0
    

    似乎比 g3() 在更大的数据集上( d <- d[rep(1:nrow(d), 1e4), ]

        2
  •  0
  •   Community CDub    4 年前

    行动组要求找到 最快的方法来重新编码的范围从一年中的一天到一个月和计数 而且已经 mentioned

    虽然这个问题有一个公认的答案,但我还是很纳闷

    1. 如何 方法使用 melt() , foverlaps() ,和 dcast()
    2. 以及一个具有不同问题规模的更现实的基准是什么样的。

    library(data.table)
    library(magrittr)
    cols <- c("deb", "fin")
    # reshape sub ranges from wide to long format
    long <- melt(setDT(d)[, rn := .I], id.vars = "rn", measure.vars = patterns(cols),
         value.name = cols)[deb > 0]
    # create data.table with monthly breaks and set keys
    b <- seq(as.IDate("2018-01-01"), as.IDate("2019-01-01"), "month")
    mDT <- data.table(abb = forcats::fct_inorder(month.abb), 
                      deb = yday(head(b, 12L)), 
                      fin = yday(tail(b, 12L) - 1L),
                      key = c("deb", "fin"))
    # find overlaps between sub ranges and monthly breaks
    foverlaps(long, mDT)[
      # compute days in overlaps
      , days := pmin(fin, i.fin) - pmax(deb, i.deb) + 1L] %>%
      # reshape to wide format for final result
      dcast(rn ~ abb, sum, value.var = "days", fill = 0L, drop = FALSE)
    
       rn Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
    1:  1  31  10   1   0   0   0   0   0   0   0   0   0
    2:  2   0  28   0   0   0   0   0   0   0   0   0   0
    3:  3   0   0   1   1   0   0   0   0   0   0   0   0
    4:  4   0   0   0   1   0   0   0   0   0   0   0   0
    

    基准

    下面的基准代码比较

    AntoniosK's tidyverse answer (now deleted) 没有被考虑,因为OP commented 这种方法比他自己的慢30倍

    对于基准,问题的大小是不同的( n_rows

    不同的方法在结果的类别上也不同(即。, data.table , matrix , tibble ,和 table

    library(bench)
    library(data.table)
    library(magrittr)
    library(ggplot2)
    
    bm2 <- press(
      p_sr_2 = c(0.1, 0.5, 1), # share or rows with 2nd sub range ]0, 1]
      n_rows = 10^seq(2, 4, 1),
      { # create test data
        set.seed(1L)
        d0 <- t(replicate(n_rows, sample(365L, 4L) %>% sort())) %>% data.table()
        setnames(d0, c("deb", "fin", "deb2", "fin2"))
        idx <- sample(nrow(d0), (1 - p_sr_2) * nrow(d0))
        d0[idx, c("deb2", "fin2") := 0L]
        str(d0)
        
        mark(
          foverlaps = {
            d <- copy(d0)
            cols <- c("deb", "fin")
            long <- melt(setDT(d)[, rn := .I], id.vars = "rn", measure.vars = patterns(cols),
                         value.name = cols)[deb > 0]
            b <- seq(as.IDate("2018-01-01"), as.IDate("2019-01-01"), "month")
            mDT <- data.table(abb = forcats::fct_inorder(month.abb), 
                              deb = yday(head(b, 12L)), 
                              fin = yday(tail(b, 12L) - 1L),
                              key = c("deb", "fin"))
            foverlaps(long, mDT)[, days := pmin(fin, i.fin) - pmax(deb, i.deb) + 1L] %>% 
              dcast(rn ~ abb, sum, value.var = "days", fill = 0L, drop = FALSE)
            # returns a data.table with 13 columns and 0 for missing values
          },
          g1 = {
            f1 <- function(deb, fin, deb2, fin2, ...) {
              f <- factor(c(deb:fin, deb2:fin2))
              levels(f) <- list(jan = 1:31,
                                feb = 32:59,
                                mar = 60:90,
                                apr = 91:120,
                                may = 121:151,
                                jun = 152:181,
                                jul = 182:212,
                                aug = 213:243,
                                sep = 244:273,
                                oct = 274:304,
                                nov = 305:334,
                                dec = 335:365)
              table(f)
            }
            do.call(rbind, d %>% purrr::pmap(f1))
            # returns a matrix with 12 named columns and 0 for missing values
          },
          g2 = {
            K <- vector(10, mode = "character")
            K[1:31]    <- "jan"
            K[32:59]   <- "feb"
            K[60:90]   <- "mar"
            K[91:120]  <- "apr"
            K[121:151] <- "may"
            K[152:181] <- "jun"
            K[182:212] <- "jul"
            K[213:243] <- "aug"
            K[244:273] <- "sep"
            K[274:304] <- "oct"
            K[305:334] <- "nov"
            K[335:365] <- "dec"
            f2 <-
              Vectorize(function(deb, fin, deb2, fin2)
                table(c(K[deb:fin], K[deb2:fin2])))
            template <- matrix(
              integer(0), ncol = 12L, 
              dimnames = list(NULL, c("jan", "feb", "mar", "apr", "may", "jun", 
                                      "jul", "aug", "sep", "oct", "nov", "dec"))) %>% 
              tibble::as.tibble()
            do.call(dplyr::bind_rows, c(list(template), f2(d$deb, d$fin, d$deb2, d$fin2))) 
            # returns a tibble with 12 columns and NA for missing values 
          },
          g3 = {
            K <- vector(10, mode = "character")
            K[1:31]    <- "jan"
            K[32:59]   <- "feb"
            K[60:90]   <- "mar"
            K[91:120]  <- "apr"
            K[121:151] <- "may"
            K[152:181] <- "jun"
            K[182:212] <- "jul"
            K[213:243] <- "aug"
            K[244:273] <- "sep"
            K[274:304] <- "oct"
            K[305:334] <- "nov"
            K[335:365] <- "dec"
            names(K) <- 1:365
            f3 <-
              Vectorize(function(deb, fin, deb2, fin2)
                c(K[deb:fin], K[deb2:fin2]))
            m <- as.matrix(do.call(dplyr::bind_rows, f3(d$deb, d$fin, d$deb2, d$fin2)))
            z <- unlist(purrr::map(list("jan", "feb", "mar", "apr", "may", "jun", 
                                        "jul", "aug", "sep", "oct", "nov", "dec"),
                                   function(y)
                                     apply(m, 1, function(x)
                                       sum(x == y, na.rm = TRUE))))
            dim(z) <- c(nrow(d), 12)
            z
            # returns a matrix with 12 columns and 0 for missing values
          },
          henrik = {
            d <- copy(d0)
            b <- as.numeric(format(seq(as.Date("2018-01-01"), as.Date("2018-12-31"), 
                                       by = "month"), "%j"))
            l <- Map(
              function(from, to, from2, to2) month.abb[findInterval(c(from:to, from2:to2), b)], 
              d$deb, d$fin, d$deb2, d$fin2)
            i <- rep(1:nrow(d), lengths(l))
            table(i, factor(unlist(l), levels = month.abb))
            # returns an object of class table with 12 columns and 0 for missing values
          },
          chinsoon12 = {
            d <- copy(d0)
            firstOfMths <- seq(as.Date("2018-01-01"), as.Date("2019-01-01"), by="month")
            daysPerMth <- c(1L, cumsum(as.integer(diff(firstOfMths))))
            g <- ceiling(seq(1, ncol(d)) / 2)
            t(apply(d, 1, function(x) {
              x <- unlist(by(x, g, function(k) seq(k[1L], k[2L])), use.names=FALSE)
              table(cut(x, daysPerMth, labels=month.abb, include.lowest=TRUE, right=TRUE))
            }))
            # returns a matrix with 12 named columns and 0 for missing values
          },
          check = function(x, y) {
            cat("Run check: ")
            xdt <- as.data.table(x) %>% .[, .SD, .SDcols = tail(seq_len(ncol(.)), 12L)] 
            if (tibble::is_tibble(y)) {
              y <- dplyr::mutate_all(y, function(x) dplyr::if_else(is.na(x), 0L, x))
            }
            if (is.table(y)) y <- matrix(y, ncol = 12L)
            ydt <- as.data.table(y) %>% .[, .SD, .SDcols = tail(seq_len(ncol(.)), 12L)]
            result <- all.equal(xdt, ydt, check.attributes = FALSE)
            if (!isTRUE(result)) {
              print(result)
            } else cat("OK\n")
            return(result)
          }
        )
      }
    )
    

    library(ggplot2)
    autoplot(bm)
    

    enter image description here

    henrik foverlaps 方法明显更快。对于10000行, 弗雷普斯 比其他方法快一到两个数量级。

    bm %>%
      tidyr::unnest() %>%
      ggplot(aes(expression, mem_alloc, color = gc)) +
      ggbeeswarm::geom_quasirandom() +
      coord_flip() +
      facet_grid(p_sr_2 ~ n_rows, labeller = label_both)
      
    

    enter image description here

    同样,请注意对数刻度。

    这个 弗雷普斯 这种方法比其他方法分配的内存少一到两个数量级。

    基准第2部分

    由于运行时间太长(而且我很不耐烦),上面的基准测试最多只能执行10000行。阿尔索 Henrik 只测试了40K行。所以,我想知道 弗雷普斯 该方法将能够在合理的时间内处理10m行(OP的生产数据集的大小)。

    弗雷普斯 方法以OP规定的第二个范围的份额固定为25%作为基准。

    library(bench)
    library(data.table)
    library(magrittr)
    library(ggplot2)
    bm5 <- press(
      n_rows = 10^seq(2, 7, 1),
      { # create test data
        cat("Start to create test data", format(Sys.time()), "\n")
        p_sr_2 <- 0.25 # share or rows with 2nd sub range ]0, 1]
        set.seed(1L)
        long <- data.table(rn = rep(seq_len(n_rows), each = 4L),
                          day = sample(365L, 4L * n_rows, replace = TRUE))
        setorder(long, rn, day)
        dups <- long[, which(anyDuplicated(day) > 0), by = rn]$rn
        if (length(dups) > 0) long[
          rn %in% dups, 
          day := replicate(length(dups), sample(365L, 4L) %>% sort(), simplify = FALSE) %>% unlist()]
        d0 <- dcast(long, rn ~ rowid(rn), value.var = "day")[, rn := NULL]
        setnames(d0, c("deb", "fin", "deb2", "fin2"))
        idx <- sample(nrow(d0), (1 - p_sr_2) * nrow(d0))
        d0[idx, c("deb2", "fin2") := 0L]
        str(d0)
        rm(long)  # free the memory
        tables()
        cat("Test data created", format(Sys.time()), "\n")
        
        mark(
          foverlaps = {
            d <- copy(d0)
            cols <- c("deb", "fin")
            long <- melt(setDT(d)[, rn := .I], id.vars = "rn", measure.vars = patterns(cols),
                         value.name = cols)[deb > 0]
            b <- seq(as.IDate("2018-01-01"), as.IDate("2019-01-01"), "month")
            mDT <- data.table(abb = forcats::fct_inorder(month.abb), 
                              deb = yday(head(b, 12L)), 
                              fin = yday(tail(b, 12L) - 1L),
                              key = c("deb", "fin"))
            foverlaps(long, mDT)[, days := pmin(fin, i.fin) - pmax(deb, i.deb) + 1L] %>% 
              dcast(rn ~ abb, sum, value.var = "days", fill = 0L, drop = FALSE)
            # returns a data.table with 13 columns and 0 for missing values
          },
          min_time = 2
        )
      }
    )
    

    在我的系统上,10m rows案例的运行时间约为23.7秒。对于超过1000行的问题,运行时间几乎呈线性增长。10m行的轻微向上弯曲可能是由于系统内存限制。

    bm4 %>% 
      tidyr::unnest() %>% 
      ggplot(aes(n_rows, time, colour = expression)) +
      geom_smooth(data = . %>% dplyr::filter(n_rows > 10^3), 
                  method = "lm", se = FALSE, colour = "blue", size = 1) + 
      geom_point(size = 2) + 
      scale_x_log10() +
      stat_summary(fun.y = median, geom = "line", size = 1) +
      ggtitle("time vs n_rows")
    

    enter image description here