代码之家  ›  专栏  ›  技术社区  ›  arg0naut91

创建具有优化功能的因素组合

  •  3
  • arg0naut91  · 技术社区  · 6 年前
    library(dplyr)
    library(tidyr)
    
    df <- data.frame(
      First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8"),
      Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
                 "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
                 "MW3; MW4; MW5", "MW6; MW3; MW7")
    )
    
    df <- df %>%
      mutate(
        ID = row_number(),
        lmt = n_distinct(ID)
      ) %>%
      separate_rows(Second, sep = "; ") %>%
      group_by(ID) %>%
      mutate(
        wgt = row_number()
      ) %>% ungroup()
    

    假设对于每个id,我只想保留1个 First Second (即 df 应始终等于 lmt )中。

    不过,我想通过优化某些参数来实现这一点。解决方案的设计应确保:

    • wgt 应尽可能选择1,也可选择2,但应避免选择3(即 WGT 应该是最小的);

    • 中某个值的频率之间的差异 第二 和频率 第一 应该接近0。

    关于如何在r中处理这个问题有什么想法吗?

    上述情况的预期输出为:

         ID First Second   wgt   lmt
    1     1   MW3    MW4     1     8
    2     2   MW3    MW7     3     8
    3     3   MW4    MW7     2     8
    4     4   MW5    MW5     1     8
    5     5   MW6    MW3     1     8
    6     6   MW7    MW8     2     8
    7     7   MW7    MW3     1     8
    8     8   MW8    MW6     1     8
    

    为什么?仅仅因为有了这个组合,右边就没有更多的元素了( 第二 )它在左边( 弗斯特 )例如,在右侧和左侧都有两个MW3元素。

    然而,这里要付出的代价是 工作组 不总是1(总和 工作组 不是8而是12)。

    澄清:如果两个标准不能同时最小化,则应优先考虑最小化第二个标准(频率之间的差异)。

    2 回复  |  直到 6 年前
        1
  •  3
  •   pieca    6 年前

    我反复考虑这个问题,我可以使用 小冲突 算法。这里的关键是找到一个结合您的需求的评分函数。下面的实现遵循您的建议' 假设目标应该是将第二个标准(频率之间的差异)最小化。 '.在你的实际数据上尝试其他的评分函数,让我们看看你能走多远。

    根据你的原始数据(8个ID),我得到的解决方案和你发布的一样好:

    > solution_summary(current_solution)
       Name FirstCount SecondCount diff
    1:  MW3          2           2    0
    2:  MW4          1           1    0
    3:  MW5          1           1    0
    4:  MW6          1           1    0
    5:  MW7          2           2    0
    6:  MW8          1           1    0
    [1] "Total freq diff:  0"
    [1] "Total wgt:  12"
    

    对于10000 ids的随机数据,该算法能够找到第一/第二频率无差异的解(但wgt之和大于最小值):

    > solution_summary(current_solution)
       Name FirstCount SecondCount diff
    1:  MW3       1660        1660    0
    2:  MW4       1762        1762    0
    3:  MW5       1599        1599    0
    4:  MW6       1664        1664    0
    5:  MW7       1646        1646    0
    6:  MW8       1669        1669    0
    [1] "Total freq diff:  0"
    [1] "Total wgt:  19521"
    

    代码如下:

    library(data.table)
    df <- as.data.table(df)
    df <- df[, .(ID, First, Second, wgt)]
    
    # PLAY AROUND WITH THIS PARAMETER
    freq_weight <- 0.9
    
    wgt_min <- df[, uniqueN(ID)]
    wgt_max <- df[, uniqueN(ID) * 3]
    
    freq_min <- 0
    freq_max <- df[, uniqueN(ID) * 2] #verify if this is the worst case scenario
    
    score <- function(solution){
      # compute raw scores
      current_wgt <- solution[, sum(wgt)]
      second_freq <- solution[, .(SecondCount = .N), by = Second]
      names(second_freq)[1] <- "Name"
      compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
      compare[is.na(compare)] <- 0
      compare[, diff := abs(FirstCount - SecondCount)]
      current_freq <- compare[, sum(diff)]
    
      # normalize
      wgt_score <- (current_wgt - wgt_min) / (wgt_max - wgt_min)
      freq_score <- (current_freq - freq_min) / (freq_max - freq_min)
    
      #combine
      score <- (freq_weight * freq_score) + ((1 - freq_weight) * wgt_score)
      return(score)
    }
    
    #initialize random solution
    current_solution <- df[, .SD[sample(.N, 1)], by = ID]
    
    #get freq of First (this does not change)
    First_freq <- current_solution[, .(FirstCount = .N), by = First]
    names(First_freq)[1] <- "Name"
    
    #get mincoflict to be applied on each iteration
    minconflict <- function(df, solution){
      #pick ID
      change <- solution[, sample(unique(ID), 1)]
    
      #get permissible values
      values <- df[ID == change, .(Second, wgt)]
    
      #assign scores
      values[, score := NA_real_]
      for (i in 1:nrow(values)) {
        solution[ID == change, c("Second", "wgt") := values[i, .(Second, wgt)]]
        set(values, i, "score", score(solution))
      }
    
      #return the best combination
      scores <<- c(scores, values[, min(score)])
      solution[ID == change, c("Second", "wgt") := values[which.min(score), .(Second, wgt)]]
    }
    
    #optimize
    scores <- 1
    iter <- 0
    while(TRUE){
      minconflict(df, current_solution)
      iter <- iter + 1
      #SET MAX NUMBER OF ITERATIONS HERE
      if(scores[length(scores)] == 0 | iter >= 1000) break
    }
    
    # summarize obtained solution
    solution_summary <- function(solution){
      second_freq <- solution[, .(SecondCount = .N), by = Second]
      names(second_freq)[1] <- "Name"
      compare <- merge(First_freq, second_freq, by = "Name", all = TRUE)
      compare[is.na(compare)] <- 0
      compare[, diff := abs(FirstCount - SecondCount)]
      print(compare)
      print(paste("Total freq diff: ", compare[, sum(diff)]))
      print(paste("Total wgt: ", solution[, sum(wgt)]))
    }
    solution_summary(current_solution)
    
        2
  •  1
  •   kuppern87    6 年前

    这基本上是一个二部图匹配问题,因此可以通过maxflow或线性规划在合理的时间内精确求解。( bipartite graph matching to match two sets )中。

    library(lpSolve)
    MISMATCH.COST <- 1000
    
    .create.row <- function(row.names, first) {
        row <- vector(mode="numeric", length=length(first))
        for (i in 1:length(row.names))
            row = row + (-MISMATCH.COST+i)*(row.names[i]==first)
        return(row)
    }
    
    find.pairing <- function(First, Second) {
        row.names = sapply(Second, strsplit, "; ")
    
        # Create cost matrix for assignment
        mat = sapply(row.names, .create.row, First)
        assignment <- lp.assign(mat)
        print("Total cost:")
        print(assignment$objval+length(First)*MISMATCH.COST)
        solution <- lp.assign(mat)$solution
        pairs <- which(solution>0, arr.ind=T)
        matches = First[pairs[,1]]
    
        # Find out where a mismatch has occured, and replace match
        for (i in 1:length(matches)) {
            if (!(matches[i] %in% row.names[[i]])) {
                matches[i] = row.names[[i]][1]
            }
        }
        result = data.frame(
            First[pairs[,2]],
            matches)
        return(result)
    }
    

    在你的例子中运行它会给出一个最佳的解决方案(它应该一直这样做)

    > First = c("MW3", "MW3", "MW4", "MW5", "MW6", "MW7", "MW7", "MW8")
    > Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
               "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
               "MW3; MW4; MW5", "MW6; MW3; MW7")
    Second = c("MW4; MW5; MW6", "MW5; MW3; MW7", "MW8; MW7; MW3",
    +            "MW5; MW6; MW4", "MW3; MW7; MW8", "MW6; MW8; MW4",
    +            "MW3; MW4; MW5", "MW6; MW3; MW7")
    > find.pairing(First, Second)
    [1] "Total cost:"
    [1] 12
      First.pairs...2.. matches
    1               MW3     MW4
    2               MW3     MW3
    3               MW4     MW7
    4               MW5     MW5
    5               MW6     MW7
    6               MW7     MW8
    7               MW7     MW3
    8               MW8     MW6