代码之家 › 专栏 › 技术社区 › ira

如何找到购买物品及其组合的次数?

market-basket-analysis data.table r

ira · 技术社区 · 6 年前

我有一个 data.table,记录了客户购买的项目。每行代表一个客户,每列代表一个项目。每个客户的列数相同,item* 列中的值根据客户是否购买了给定的项目,为 1 或 0。该表的简单版本如下所示:

# Toy purchase table: one row per customer, one 0/1 indicator column per item.
data.table(
  customerID = c(1, 2, 3, 4, 5),
  item1      = c(1, 0, 0, 1, 1),
  item2      = c(1, 0, 1, 1, 1),
  item3      = c(1, 0, 0, 0, 1),
  item4      = c(0, 1, 1, 1, 1)
)

表中显示,客户1购买了项目1、2、3,项目3由客户1和5购买。

在实际案例中数据表有太多的列,在代码中按名称引用它们是不切实际的,但将数据改为长格式是可以的。

我需要知道每件商品的购买次数和组合购买的次数。在这种情况下,我希望得到类似于:

item1 3
item2 4
item3 2
item4 4
item1;item2 3
item1;item3 2
item1;item4 2
...
(same for other combinations of length 2)
...
item1;item2;item3 2
item1;item2;item4 2

...
up to combinations of 4 items.

此外,我还需要为每个客户提供一个表格,说明他或她购买了哪些产品组合。

编辑:

0 回复 | 直到 6 年前

Ronak Shah 6 年前

这是一个完全使用 base R 的方案,因此需要先把数据转换为 data.frame

# Work on a plain data.frame so that df[y] selects columns by name.
# as.data.frame() keeps the column names as they are, whereas data.frame()
# runs check.names and could rename non-syntactic item columns.
df <- as.data.frame(df)

# Every column except the first one (customerID) is a 0/1 item indicator.
unique_product <- names(df)[-1L]

# For each combination size m and each combination y of m items, count the
# rows in which all columns of y equal 1. Each count is a length-one vector
# named "itemA;itemB;...". lapply() is used instead of sapply() so the
# result is always a list, whatever the number of items.
combo_counts <- lapply(seq_along(unique_product), function(m) {
  combn(unique_product, m, FUN = function(y) {
    setNames(
      sum(rowSums(df[y] == 1) == length(y)),
      paste0(y, collapse = ";")
    )
  }, simplify = FALSE)
})

# Flatten to a named vector, then to a two-column data.frame (values, ind).
stack(unlist(combo_counts))


#   values                     ind
#1       3                   item1
#2       4                   item2
#3       2                   item3
#4       4                   item4
#5       3             item1;item2
#6       2             item1;item3
#7       2             item1;item4
#8       2             item2;item3
#9       3             item2;item4
#10      1             item3;item4
#11      2       item1;item2;item3
#12      2       item1;item2;item4
#13      1       item1;item3;item4
#14      1       item2;item3;item4
#15      1 item1;item2;item3;item4

用 combn 生成每一种组合,再从 data.frame 中取出相应的列,统计这些项目被同时购买的次数。

为了得到购买了各个组合的客户,我们可以继续采用相同的方法

# For every combination y of items, list the customers who bought all of
# them. The row filter must be built from y (the item names of the current
# combination), not from x (the combination size): df[x] would select the
# x-th column of df, i.e. customerID when x is 1.
stack(unlist(lapply(seq_along(unique_product), function(x) {
  combn(unique_product, x, FUN = function(y) {
    # TRUE for customers whose indicator is 1 in every column of y
    inds <- rowSums(df[y] == 1) == length(y)
    # one entry per matching customer, all named after the combination
    setNames(df$customerID[inds],
             rep(paste0(y, collapse = ";"), sum(inds)))
  }, simplify = FALSE)
})))

#   values                     ind
#1       1                   item1
#2       4                   item1
#3       5                   item1
#4       1                   item2
#5       3                   item2
#6       4                   item2
#7       5                   item2
#8       1                   item3
#9       5                   item3
#10      2                   item4
#....

如果需要,可以重命名这些列:values 是客户 ID,ind 是该客户购买的组合。

Wimpel 6 年前

样本数据

# Sample purchases: one row per customer, one 0/1 indicator column per item.
DT <- data.table(
  customerID = c(1, 2, 3, 4, 5),
  item1      = c(1, 0, 0, 1, 1),
  item2      = c(1, 0, 1, 1, 1),
  item3      = c(1, 0, 0, 0, 1),
  item4      = c(0, 1, 1, 1, 1)
)

代码

# Item columns are everything after the first column (customerID);
# keep only their names.
cols <- names(DT)[-1L]

在下面的代码中:如果只想要最多 n 个产品的组合,请把组合大小的上限(默认为 length(cols))改为 n

# Every non-empty combination of item names, as a flat list of character
# vectors. seq_along() stays empty when there are no item columns, whereas
# 1:length(cols) would yield c(1, 0). To cap combinations at n items,
# replace seq_along(cols) with seq_len(n).
combos <- unlist(
  lapply(seq_along(cols), function(m) combn(cols, m, simplify = FALSE)),
  recursive = FALSE
)

# Number of customers who bought every item of each combination: a row
# matches when its indicators over the combo's columns sum to the combo
# size. Summing the logical mask avoids building a subset of DT just to
# count rows; na.rm = TRUE mirrors data.table dropping NA rows in `i`.
l <- lapply(combos, function(x) {
  sum(rowSums(DT[, x, with = FALSE]) == length(x), na.rm = TRUE)
})

# Label each count "itemA;itemB;..."; names must be a character vector.
names(l) <- vapply(combos, paste0, character(1L), collapse = ";")

输出

# Compact overview of the named list of counts.
str(l)

List of 15
$ item1                  : int 3
$ item2                  : int 4
$ item3                  : int 2
$ item4                  : int 4
$ item1;item2            : int 3
$ item1;item3            : int 2
$ item1;item4            : int 2
$ item2;item3            : int 2
$ item2;item4            : int 3
$ item3;item4            : int 1
$ item1;item2;item3      : int 2
$ item1;item2;item4      : int 2
$ item1;item3;item4      : int 1
$ item2;item3;item4      : int 1
$ item1;item2;item3;item4: int 1

或者创建一个数据表

# Turn the named counts into a two-column data.table. as.matrix() on a named
# vector gives a one-column matrix whose row names are the combo labels;
# keep.rownames = TRUE moves them into column "rn" and the counts land in
# "V1". The former ncol/nrow arguments were silently ignored by as.matrix(),
# so they are dropped.
as.data.table(as.matrix(unlist(l)), keep.rownames = TRUE)

#                         rn V1
# 1:                   item1  3
# 2:                   item2  4
# 3:                   item3  2
# 4:                   item4  4
# 5:             item1;item2  3
# 6:             item1;item3  2
# 7:             item1;item4  2
# 8:             item2;item3  2
# 9:             item2;item4  3
#10:             item3;item4  1
#11:       item1;item2;item3  2
#12:       item1;item2;item4  2
#13:       item1;item3;item4  1
#14:       item2;item3;item4  1
#15: item1;item2;item3;item4  1

s_baldur 6 年前

n_items 控制组合(购物篮)的最大大小:

library(magrittr)

# Long format: one row per (customer, purchased item). Item names are reduced
# to their integer suffix -- assumes item columns are named "item<integer>".
DT_melt <- DT[, melt(.SD, id.vars = "customerID", variable.factor = FALSE)
              ][value == 1
                ][, variable := as.integer(sub("item", "", variable))]

# Maximum bundle size to report.
n_items <- 4L

# The item universe comes from DT's columns rather than from n_items, so
# lowering n_items only caps the bundle size and no longer drops items.
item_ids <- sort(as.integer(sub("item", "", setdiff(names(DT), "customerID"))))

keep_track <- vector("list", min(n_items, length(item_ids)))
for (i in seq_along(keep_track)) {
  # Combine positions and map them back to ids: combn() would expand a
  # length-one numeric x into seq_len(x).
  combs <- combn(length(item_ids), i, FUN = function(idx) item_ids[idx],
                 simplify = FALSE)
  keep_track[[i]] <- combs %>%
    lapply(function(x) {
      # one row per customer; second column flags "bought every item in x"
      bought_all <- DT_melt[, all(x %in% variable), by = customerID]
      sum(bought_all[[2L]])
    }) %>%
    setNames(vapply(combs,
                    function(x) paste(paste0("item", x), collapse = ";"),
                    character(1L)))
}

# Named vector of counts, ordered by bundle size.
unlist(keep_track)

返回计数的命名向量:

#                   item1                   item2 
#                       3                       4 
#                   item3                   item4 
#                       2                       4 
#             item1;item2             item1;item3 
#                       3                       2 
#             item1;item4             item2;item3 
#                       2                       2 
#             item2;item4             item3;item4 
#                       3                       1 
#       item1;item2;item3       item1;item2;item4 
#                       2                       2 
#       item1;item3;item4       item2;item3;item4 
#                       1                       1 
# item1;item2;item3;item4 
#                       1