代码之家  ›  专栏  ›  技术社区  ›  user113156

提取列中包含NA值的单元格,并根据结果创建新列

  •  0
  • user113156  · 技术社区  · 2 年前

    我有如下数据:

         grp REGIONNAME RegionName `Año 2004_1` `Año 2004_2` `Año 2004_3`
       <int> <chr>      <chr>             <dbl>        <dbl>        <dbl>
     1     1 ANDALUCÍA  ANDALUCÍA         32143        37962        32374
     2     1 ANDALUCÍA  Almería              NA           NA           NA
     3     1 ANDALUCÍA  Abla                 58           61           54
     4     1 ANDALUCÍA  Abrucena              6            2            1
     5     1 ANDALUCÍA  Adra                146          211          101
     6     1 ANDALUCÍA  Albánchez            12            3            3
     7     1 ANDALUCÍA  Alboloduy             2            2            2
     8     1 ANDALUCÍA  Albox                33           66           35
     9     1 ANDALUCÍA  Alcolea               0            1            1
    10     1 ANDALUCÍA  Alcóntar              1            1            2
    

    在这个样本中,它包含2 NA 行,一个用于 Almeria 另一个用于 Balanegra

    我想创建一个新列 RegionName 比方说。这两个单元格将在何处填充。即预期输出将是:

         grp REGIONNAME RegionName    RegionName
       <int> <chr>      <chr>            <chr>
     1     1 ANDALUCÍA  ANDALUCÍA        ANDALUCIA/NA
     2     1 ANDALUCÍA  Almería            Almeria
     3     1 ANDALUCÍA  Abla               Almeria
     4     1 ANDALUCÍA  Abrucena           Almeria
     5     1 ANDALUCÍA  Adra               Almeria
     6     1 ANDALUCÍA  Albánchez          Almeria
     7     1 ANDALUCÍA  Alboloduy          Almeria
     8     1 ANDALUCÍA  Albox                ...
     9     1 ANDALUCÍA  Alcolea              ...
    10     1 ANDALUCÍA  Alcóntar             ...
                   ...............
      
     1     1 ANDALUCÍA  Bacares              ...
     2     1 ANDALUCÍA  Balanegra          Balanegra
     3     1 ANDALUCÍA  Bayárcal           Balanegra
     4     1 ANDALUCÍA  Bayarque           Balanegra
     5     1 ANDALUCÍA  Bédar              Balanegra
     6     1 ANDALUCÍA  Beires    
     7     1 ANDALUCÍA  Benahadux           ....
     8     1 ANDALUCÍA  Benitagla           ....
     9     1 ANDALUCÍA  Benizalón 
    10     1 ANDALUCÍA  Bentarique         Balanegra
    

    所以当它看到 NA 值,则表示一个新的“区域”。

    最后,我想 group_by 这个新创建的区域并计算 cumsum 以便填写 NA 价值观

    我做了一些与 REGIONNAME 当我想填写的NA值 ANDALUCIA

    ... %>%
      group_by(grp = cumsum(RegionName == toupper(RegionName))) %>%
      mutate(REGIONNAME = first(RegionName)) %>% 
      relocate(REGIONNAME, .before = RegionName) %>% 
      mutate(across(starts_with("Año"), 
                    ~ ifelse(REGIONNAME == RegionName, sum(.x[REGIONNAME != RegionName], na.rm = T), .x)))
    

    数据:

    df = structure(list(grp = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), REGIONNAME = c("ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", 
    "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", 
    "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", 
    "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", 
    "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", 
    "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", "ANDALUCÍA", 
    "ANDALUCÍA", "ANDALUCÍA"), RegionName = c("ANDALUCÍA", "Almería", 
    "Abla", "Abrucena", "Adra", "Albánchez", "Alboloduy", "Albox", 
    "Alcolea", "Alcóntar", "Alcudia de Monteagud", "Alhabia", "Alhama de Almería", 
    "Alicún", "Almería", "Almócita", "Alsodux", "Antas", "Arboleas", 
    "Armuña de Almanzora", "Bacares", "Balanegra", "Bayárcal", 
    "Bayarque", "Bédar", "Beires", "Benahadux", "Benitagla", "Benizalón", 
    "Bentarique"), `Año 2004_1` = c(32143, NA, 58, 6, 146, 12, 2, 
    33, 0, 1, 1, 1, 13, 0, 748, 0, 1, 6, 16, 0, 2, NA, 0, 0, 8, 0, 
    18, 1, 2, 0), `Año 2004_2` = c(37962, NA, 61, 2, 211, 3, 2, 
    66, 1, 1, 1, 0, 15, 1, 770, 0, 10, 12, 16, 0, 1, NA, 1, 0, 2, 
    0, 21, 0, 0, 0), `Año 2004_3` = c(32374, NA, 54, 1, 101, 3, 
    2, 35, 1, 2, 0, 0, 14, 0, 701, 0, 3, 26, 14, 0, 0, NA, 0, 3, 
    8, 0, 25, 0, 2, 0)), class = c("grouped_df", "tbl_df", "tbl", 
    "data.frame"), row.names = c(NA, -30L), groups = structure(list(
        grp = 1L, .rows = structure(list(1:30), ptype = integer(0), class = c("vctrs_list_of", 
        "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
    ), row.names = c(NA, -1L), .drop = TRUE))
    
    0 回复  |  直到 2 年前
        1
  •  2
  •   Maël    2 年前

    您可以使用 c_across fill :

    library(tidyverse)
    
    df %>% 
      rowwise() %>% 
      mutate(Region = case_when(all(is.na(c_across(starts_with("Año")))) ~ RegionName)) %>% 
      ungroup() %>% 
      fill(Region)
    
    # A tibble: 30 × 7
         grp REGIONNAME RegionName           `Año 2004_1` `Año 2004_2` `Año 2004_3` Region   
       <int> <chr>      <chr>                       <dbl>        <dbl>        <dbl> <chr>    
     1     1 ANDALUCÍA  ANDALUCÍA                   32143        37962        32374 NA       
     2     1 ANDALUCÍA  Almería                        NA           NA           NA Almería  
     3     1 ANDALUCÍA  Abla                           58           61           54 Almería  
     4     1 ANDALUCÍA  Abrucena                        6            2            1 Almería  
     5     1 ANDALUCÍA  Adra                          146          211          101 Almería  
     6     1 ANDALUCÍA  Albánchez                      12            3            3 Almería  
     7     1 ANDALUCÍA  Alboloduy                       2            2            2 Almería  
     8     1 ANDALUCÍA  Albox                          33           66           35 Almería  
     9     1 ANDALUCÍA  Alcolea                         0            1            1 Almería  
    10     1 ANDALUCÍA  Alcóntar                        1            1            2 Almería  
    11     1 ANDALUCÍA  Alcudia de Monteagud            1            1            0 Almería  
    12     1 ANDALUCÍA  Alhabia                         1            0            0 Almería  
    13     1 ANDALUCÍA  Alhama de Almería              13           15           14 Almería  
    14     1 ANDALUCÍA  Alicún                          0            1            0 Almería  
    15     1 ANDALUCÍA  Almería                       748          770          701 Almería  
    16     1 ANDALUCÍA  Almócita                        0            0            0 Almería  
    17     1 ANDALUCÍA  Alsodux                         1           10            3 Almería  
    18     1 ANDALUCÍA  Antas                           6           12           26 Almería  
    19     1 ANDALUCÍA  Arboleas                       16           16           14 Almería  
    20     1 ANDALUCÍA  Armuña de Almanzora             0            0            0 Almería  
    21     1 ANDALUCÍA  Bacares                         2            1            0 Almería  
    22     1 ANDALUCÍA  Balanegra                      NA           NA           NA Balanegra
    23     1 ANDALUCÍA  Bayárcal                        0            1            0 Balanegra
    24     1 ANDALUCÍA  Bayarque                        0            0            3 Balanegra
    25     1 ANDALUCÍA  Bédar                           8            2            8 Balanegra
    26     1 ANDALUCÍA  Beires                          0            0            0 Balanegra
    27     1 ANDALUCÍA  Benahadux                      18           21           25 Balanegra
    28     1 ANDALUCÍA  Benitagla                       1            0            0 Balanegra
    29     1 ANDALUCÍA  Benizalón                       2            0            2 Balanegra
    30     1 ANDALUCÍA  Bentarique                      0            0            0 Balanegra
    
        2
  •  1
  •   akrun    2 年前

    我们还可以使用 if_all

    library(dplyr)
    library(tidyr)
    df %>%
       ungroup %>%
       mutate(Region = case_when(if_all(starts_with("Año"), is.na) ~ RegionName)) %>% 
       group_by(grp) %>%
       fill(Region) %>%
       ungroup
    

    -输出

    # A tibble: 30 × 7
         grp REGIONNAME RegionName `Año 2004_1` `Año 2004_2` `Año 2004_3` Region 
       <int> <chr>      <chr>             <dbl>        <dbl>        <dbl> <chr>  
     1     1 ANDALUCÍA  ANDALUCÍA         32143        37962        32374 <NA>   
     2     1 ANDALUCÍA  Almería              NA           NA           NA Almería
     3     1 ANDALUCÍA  Abla                 58           61           54 Almería
     4     1 ANDALUCÍA  Abrucena              6            2            1 Almería
     5     1 ANDALUCÍA  Adra                146          211          101 Almería
     6     1 ANDALUCÍA  Albánchez            12            3            3 Almería
     7     1 ANDALUCÍA  Alboloduy             2            2            2 Almería
     8     1 ANDALUCÍA  Albox                33           66           35 Almería
     9     1 ANDALUCÍA  Alcolea               0            1            1 Almería
    10     1 ANDALUCÍA  Alcóntar              1            1            2 Almería
    # … with 20 more rows