代码之家  ›  专栏  ›  技术社区  ›  Catalyst

df1和df2之间的反连接,但如何将df2中的所有不匹配更改为NA

  •  2
  • Catalyst  · 技术社区  · 2 年前

    下面是我的两个数据帧,df1和df2

    df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),response=c("y","y","y","n","n","y","y","n","n","y"))
    
    id     text response
    1  632592651     asdf        y
    2  633322173      cat        y
    3  634703802      dog        y
    4  634927873    mouse        n
    5  635812953 elephant        n
    6  636004739    goose        y
    7  636101211      rat        y
    8  636157799     mice        n
    9  636263106    kitty        n
    10 636752420   kitten        y
    
    df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                      text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                      volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))
    
     id           text volume
    1   632592651       asdf_xyz   1234
    2   633322173            cat    432
    3   634703802            dog    324
    4   634927873          mouse    333
    5   635812953       elephant   2223
    6   636004739          goose 412346
    7   636101211            rat   7456
    8   636157799           mice   3456
    9   636263106          kitty   2345
    10  636752420         kitten   2345
    11  636809222      tiger_xyz      6
    12 2004722036           lion    345
    13 2004894388        leopard     23
    14 2005045755        ostrich      2
    15 2005535472       kangaroo   4778
    16 2005630542       platypus    234
    17 2005788781           fish   8675
    18 2005809679        reptile   3459
    19 2005838317        mammals      8
    20 2005866692 amphibians_xyz      9
    

    如何将不匹配项从df2的id1:20行更改为NA(即所有项都与df1不匹配),并将id1的“text”(即asdf\u xyz)列更改为NA?

    我试过了

    library(dplyr)
    
    df3 <- df2 %>%
      anti_join(df1, by=c("id"))
    
    id           text volume
    1   636809222      tiger_xyz      6
    2  2004722036           lion    345
    3  2004894388        leopard     23
    4  2005045755        ostrich      2
    5  2005535472       kangaroo   4778
    6  2005630542       platypus    234
    7  2005788781           fish   8675
    8  2005809679        reptile   3459
    9  2005838317        mammals      8
    10 2005866692 amphibians_xyz      9
    
    df3$id[df3$id != 0] <- NA
    df3$text[df3$text != 0] <- NA
    df3$volume[df3$volume != 0] <- NA
    

    (一个接一个地这样做,因为我找不到如何将数据帧的整个值更改为NA的解决方案)

    id text volume
    1  <NA> <NA>   <NA>
    2  <NA> <NA>   <NA>
    3  <NA> <NA>   <NA>
    4  <NA> <NA>   <NA>
    5  <NA> <NA>   <NA>
    6  <NA> <NA>   <NA>
    7  <NA> <NA>   <NA>
    8  <NA> <NA>   <NA>
    9  <NA> <NA>   <NA>
    10 <NA> <NA>   <NA>
    

    和df4(来自 How to return row values that match column 'id' in both df1 and df2 but not column 'text' and return NA to the mismatch in column 'text'? )

    inner_join(x = df1, 
               y = df2, 
               by = "id") %>%
      mutate_if(is.factor, as.character) %>%
      mutate(text = ifelse(test = text.x != text.y, 
                           yes = NA, 
                           no = text.x)) %>%
      select(id, text, response, volume)
    
    id     text response volume
    1  632592651     <NA>        y   1234
    2  633322173      cat        y    432
    3  634703802      dog        y    324
    4  634927873    mouse        n    333
    5  635812953 elephant        n   2223
    6  636004739    goose        y 412346
    7  636101211      rat        y   7456
    8  636157799     mice        n   3456
    9  636263106    kitty        n   2345
    10 636752420   kitten        y   2345
    

    但不确定如何用df3和df4替换df2。所需输出如下所示:

    id           text volume
    1   632592651       NA   1234
    2   633322173            cat    432
    3   634703802            dog    324
    4   634927873          mouse    333
    5   635812953       elephant   2223
    6   636004739          goose 412346
    7   636101211            rat   7456
    8   636157799           mice   3456
    9   636263106          kitty   2345
    10  636752420         kitten   2345
    11  NA               NA      NA
    12  NA               NA      NA
    13  NA               NA      NA
    14  NA               NA      NA
    15  NA               NA      NA
    16  NA               NA      NA
    17  NA               NA      NA
    18  NA               NA      NA
    19  NA               NA      NA
    20  NA               NA      NA
    

    有人能帮忙吗? 如果可能的话,我还可以知道是否有手动方法根据df3$id选择df2的子集并将所有值更改为NA吗?

    2 回复  |  直到 2 年前
        1
  •  2
  •   jared_mamrot    2 年前

    编辑

    我重读了你的问题,意识到我错过了关于冲突的那部分;处理冲突的一个潜在解决方案是使用 powerjoin package ,例如。

    library(tidyverse)
    
    df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                      text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                      response=c("y","y","y","n","n","y","y","n","n","y"))
    
    df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                      text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                      volume=c(1234,432,324,333,2223,412346,7456,3456,2345,2345,6,345,23,2,4778,234,8675,3459,8,9))
    
    expected_outcome <- data.frame(id = c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420",
                                NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                        text = c(NA, "cat", "dog", "mouse", "elephant", "goose", 
                                 "rat", "mice", "kitty", "kitten", 
                                 NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), 
                        volume = c(1234, 432, 324, 333, 2223, 412346, 7456, 
                                   3456, 2345, 2345, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA))
    
    library(powerjoin)
    joined_df <- power_full_join(df1, df2, by = c("id"),
                                 conflict = rw ~ ifelse(.x != .y,
                                                        NA_integer_, 
                                                        .x))
    
    final_df <- joined_df %>%
      mutate(across(everything(), ~ifelse(is.na(response), NA, .x))) %>%
      select(id, text, volume)
    final_df
    #>           id     text volume
    #> 1  632592651     <NA>   1234
    #> 2  633322173      cat    432
    #> 3  634703802      dog    324
    #> 4  634927873    mouse    333
    #> 5  635812953 elephant   2223
    #> 6  636004739    goose 412346
    #> 7  636101211      rat   7456
    #> 8  636157799     mice   3456
    #> 9  636263106    kitty   2345
    #> 10 636752420   kitten   2345
    #> 11      <NA>     <NA>     NA
    #> 12      <NA>     <NA>     NA
    #> 13      <NA>     <NA>     NA
    #> 14      <NA>     <NA>     NA
    #> 15      <NA>     <NA>     NA
    #> 16      <NA>     <NA>     NA
    #> 17      <NA>     <NA>     NA
    #> 18      <NA>     <NA>     NA
    #> 19      <NA>     <NA>     NA
    #> 20      <NA>     <NA>     NA
    
    all_equal(final_df, expected_outcome)
    #> [1] TRUE
    

    于2022-07-01由 reprex package (v2.0.1)

    原始答案

    这种方法能解决您的问题吗?

    library(tidyverse)
    
    df1 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420"),
                      text=c("asdf","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten"),
                      response=c("y","y","y","n","n","y","y","n","n","y"))
    
    df2 <- data.frame(id=c("632592651","633322173","634703802","634927873","635812953","636004739","636101211","636157799","636263106","636752420","636809222","2004722036","2004894388","2005045755","2005535472","2005630542","2005788781","2005809679","2005838317","2005866692"),
                      text=c("asdf_xyz","cat","dog","mouse","elephant","goose","rat","mice","kitty","kitten","tiger_xyz","lion","leopard","ostrich","kangaroo","platypus","fish","reptile","mammals","amphibians_xyz"),
                      volume=c("1234","432","324","333","2223","412346","7456","3456","2345","2345","6","345","23","2","4778","234","8675","3459","8","9"))
    
    final <- read.table(text = "id           text volume
    1   632592651       NA   1234
    2   633322173            cat    432
    3   634703802            dog    324
    4   634927873          mouse    333
    5   635812953       elephant   2223
    6   636004739          goose 412346
    7   636101211            rat   7456
    8   636157799           mice   3456
    9   636263106          kitty   2345
    10  636752420         kitten   2345
    11  NA               NA      NA
    12  NA               NA      NA
    13  NA               NA      NA
    14  NA               NA      NA
    15  NA               NA      NA
    16  NA               NA      NA
    17  NA               NA      NA
    18  NA               NA      NA
    19  NA               NA      NA
    20  NA               NA      NA", header = TRUE)
    
    
    df1 %>%
      full_join(df2) %>%
      mutate(across(c(1,2,4), ~ifelse(is.na(response), NA, .x)))
    #> Joining, by = c("id", "text")
    #>           id     text response volume
    #> 1  632592651     asdf        y   <NA>
    #> 2  633322173      cat        y    432
    #> 3  634703802      dog        y    324
    #> 4  634927873    mouse        n    333
    #> 5  635812953 elephant        n   2223
    #> 6  636004739    goose        y 412346
    #> 7  636101211      rat        y   7456
    #> 8  636157799     mice        n   3456
    #> 9  636263106    kitty        n   2345
    #> 10 636752420   kitten        y   2345
    #> 11      <NA>     <NA>     <NA>   <NA>
    #> 12      <NA>     <NA>     <NA>   <NA>
    #> 13      <NA>     <NA>     <NA>   <NA>
    #> 14      <NA>     <NA>     <NA>   <NA>
    #> 15      <NA>     <NA>     <NA>   <NA>
    #> 16      <NA>     <NA>     <NA>   <NA>
    #> 17      <NA>     <NA>     <NA>   <NA>
    #> 18      <NA>     <NA>     <NA>   <NA>
    #> 19      <NA>     <NA>     <NA>   <NA>
    #> 20      <NA>     <NA>     <NA>   <NA>
    #> 21      <NA>     <NA>     <NA>   <NA>
    

    于2022-07-01由 reprex包 (v2.0.1)

        2
  •  1
  •   thelatemail    2 年前

    数据桌子 使用 !antijoin ,并覆盖 := 返回的所有列/行 df2 带有 NA (回收清单 .(NA) 到所有列)。
    然后遍历所有公共变量并覆盖任何不匹配的值 id :

    library(data.table)
    setDT(df1)
    setDT(df2)
    
    df2[!df1, on=.(id), names(df2) := .(NA)]
    idvars <- "id"
    compvars <- setdiff(intersect(names(df1), names(df2)), idvars)
    for (i in compvars) {
        df2[!df1, on=c(idvars,i), (i) := NA]
    }
    
    #           id     text volume
    # 1: 632592651     <NA>   1234
    # 2: 633322173      cat    432
    # 3: 634703802      dog    324
    # 4: 634927873    mouse    333
    # 5: 635812953 elephant   2223
    # 6: 636004739    goose 412346
    # 7: 636101211      rat   7456
    # 8: 636157799     mice   3456
    # 9: 636263106    kitty   2345
    #10: 636752420   kitten   2345
    #11:      <NA>     <NA>   <NA>
    #12:      <NA>     <NA>   <NA>
    #13:      <NA>     <NA>   <NA>
    #14:      <NA>     <NA>   <NA>
    #15:      <NA>     <NA>   <NA>
    #16:      <NA>     <NA>   <NA>
    #17:      <NA>     <NA>   <NA>
    #18:      <NA>     <NA>   <NA>
    #19:      <NA>     <NA>   <NA>
    #20:      <NA>     <NA>   <NA>