代码之家 › 专栏 › 技术社区 › George Shimanovsky

分组累计识别新值data.table 在r

tidyverse data.table dplyr dataframe r

George Shimanovsky · 技术社区 · 6 年前

如何创建一个新列来标识外观中的新值 Letter 由一组独特的梳子累积而成的列 Year + Month ?

数据样本。

require(data.table)
dt <- data.table(Letter = c(LETTERS[c(5, 1:2, 1:2, 1:4, 3:6)]),
                 Year = 2018,
                 Month = c(rep(5,5), rep(6,4), rep(7,4)))

打印。

    Letter Year Month
 1:      E 2018     5
 2:      A 2018     5
 3:      B 2018     5
 4:      A 2018     5
 5:      B 2018     5
 6:      A 2018     6
 7:      B 2018     6
 8:      C 2018     6
 9:      D 2018     6
10:      C 2018     7
11:      D 2018     7
12:      E 2018     7
13:      F 2018     7

    Letter Year Month   New
 1:      E 2018     5  TRUE
 2:      A 2018     5  TRUE
 3:      B 2018     5  TRUE
 4:      A 2018     5  TRUE
 5:      B 2018     5  TRUE
 6:      A 2018     6 FALSE
 7:      B 2018     6 FALSE
 8:      C 2018     6  TRUE
 9:      D 2018     6  TRUE
10:      C 2018     7 FALSE
11:      D 2018     7 FALSE
12:      E 2018     7 FALSE
13:      F 2018     7  TRUE

详细问题:

第2组中的哪个字母(“A”、“B”、“C”、“D”)在第1组中不重复。

3 回复 | 直到 6 年前

Frank 6 年前

初始化为FALSE;然后将每个字母加入到第一年的月份,并更新为TRUE:

dt[, v := FALSE]
dt[unique(dt, by="Letter"), on=.(Letter, Year, Month), v := TRUE][]

    Letter Year Month     v
 1:      E 2018     5  TRUE
 2:      A 2018     5  TRUE
 3:      B 2018     5  TRUE
 4:      A 2018     5  TRUE
 5:      B 2018     5  TRUE
 6:      A 2018     6 FALSE
 7:      B 2018     6 FALSE
 8:      C 2018     6  TRUE
 9:      D 2018     6  TRUE
10:      C 2018     7 FALSE
11:      D 2018     7 FALSE
12:      E 2018     7 FALSE
13:      F 2018     7  TRUE

Andre Elrico 6 年前

简单地说:

 # dt[,new := ifelse(Letter %in% dt$Letter[dt$Month<Month],F,T), by="Month"][]

 #   Letter Year Month   new
 #1:      E 2018     5  TRUE
 #2:      A 2018     5  TRUE
 #3:      B 2018     5  TRUE
 #4:      A 2018     5  TRUE
 #5:      B 2018     5  TRUE
 #6:      A 2018     6 FALSE
 #7:      B 2018     6 FALSE
 #8:      C 2018     6  TRUE
 #9:      D 2018     6  TRUE
#10:      C 2018     7 FALSE
#11:      D 2018     7 FALSE
#12:      E 2018     7 FALSE
#13:      F 2018     7  TRUE

David A.的评论非常有效,这是一个更快、更不冗长的版本:(

dt[, new := !(Letter %in% dt$Letter[dt$Month<Month]), by=Month][]

chinsoon12 6 年前

另一种可能的方法:

dupes <- c()
dt[, New := {
    x <- !Letter %chin% dupes
    dupes <- c(dupes, unique(Letter[x]))
    x
}, by=.(Year, Month)]

如果字母是整数:

library(microbenchmark)
microbenchmark(mtd0=dt0[, New := !(Letter %in% dt0$Letter[dt0$Month<Month]), by=Month],
    mtd1={
        dt1[, v := FALSE]
        dt1[unique(dt1, by="Letter"), on=.(Letter, Year, Month), v := TRUE]
    },
    mtd2={
        dupes <- c()
        dt2[, New := {
            x <- !Letter %in% dupes
            dupes <- c(dupes, unique(Letter[x]))
            x
        }, by=.(Year, Month)]        
    },
    times=3L)

整数定时输出:

Unit: milliseconds
 expr       min       lq      mean    median        uq      max neval
 mtd0 1293.3100 1318.775 1331.7129 1344.2398 1350.9143 1357.589     3
 mtd1  377.1534  391.178  402.4423  405.2026  415.0868  424.971     3
 mtd2 2015.2115 2020.926 2023.7209 2026.6400 2027.9756 2029.311     3

如果字母是字符:

microbenchmark(mtd0=dt0[, New := !(Letter %chin% dt0$Letter[dt0$Month<Month]), by=Month],
    mtd1={
        dt1[, v := FALSE]
        dt1[unique(dt1, by="Letter"), on=.(Letter, Year, Month), v := TRUE]
    },
    mtd2={
        dupes <- c()
        dt2[, New := {
            x <- !Letter %chin% dupes
            dupes <- c(dupes, unique(Letter[x]))
            x
        }, by=.(Year, Month)]        
    },
    times=3L)

Unit: milliseconds
 expr       min        lq      mean    median        uq       max neval
 mtd0 1658.5806 1689.8941 1765.9329 1721.2076 1819.6090 1918.0105     3
 mtd1  849.2361  851.1807  852.8632  853.1253  854.6768  856.2283     3
 mtd2  420.1013  426.0941  433.9202  432.0869  440.8296  449.5723     3

> identical(dt2$New, dt1$v)
[1] TRUE
> identical(dt0$New, dt1$v)
[1] FALSE

数据:

set.seed(0L)
nr <- 1e7
dt <- unique(data.table(Letter=sample(nr/1e2, nr, replace=TRUE),
    Year=sample(2014:2018, nr, replace=TRUE),
    Month=sample(1:12, nr, replace=TRUE)))
setorder(dt, Year, Month)#[, Letter := as.character(Letter)]
dt0 <- copy(dt)
dt1 <- copy(dt)
dt2 <- copy(dt)

#for seed=0L, dt has about 4.8mio rows