我有如下数据:
grp REGIONNAME RegionName `Año 2004_1` `Año 2004_2` `Año 2004_3`
<int> <chr> <chr> <dbl> <dbl> <dbl>
1 1 ANDALUCÃA ANDALUCÃA 32143 37962 32374
2 1 ANDALUCÃA AlmerÃa NA NA NA
3 1 ANDALUCÃA Abla 58 61 54
4 1 ANDALUCÃA Abrucena 6 2 1
5 1 ANDALUCÃA Adra 146 211 101
6 1 ANDALUCÃA Albánchez 12 3 3
7 1 ANDALUCÃA Alboloduy 2 2 2
8 1 ANDALUCÃA Albox 33 66 35
9 1 ANDALUCÃA Alcolea 0 1 1
10 1 ANDALUCÃA Alcóntar 1 1 2
在这份数据中，有 2 行的数值为 `NA`：一行是 `Almeria`，另一行是 `Balanegra`。
我想创建一个新列，比方说叫 `RegionName`，把这两个单元格的名称向下填充到其后的各行。即预期输出为：
grp REGIONNAME RegionName RegionName
<int> <chr> <chr> <chr>
1 1 ANDALUCÃA ANDALUCÃA ANDALUCIA/NA
2 1 ANDALUCÃA AlmerÃa Almeria
3 1 ANDALUCÃA Abla Almeria
4 1 ANDALUCÃA Abrucena Almeria
5 1 ANDALUCÃA Adra Almeria
6 1 ANDALUCÃA Albánchez Almeria
7 1 ANDALUCÃA Alboloduy Almeria
8 1 ANDALUCÃA Albox ...
9 1 ANDALUCÃA Alcolea ...
10 1 ANDALUCÃA Alcóntar ...
...............
1 1 ANDALUCÃA Bacares ...
2 1 ANDALUCÃA Balanegra Balanegra
3 1 ANDALUCÃA Bayárcal Balanegra
4 1 ANDALUCÃA Bayarque Balanegra
5 1 ANDALUCÃA Bédar Balanegra
6 1 ANDALUCÃA Beires
7 1 ANDALUCÃA Benahadux ....
8 1 ANDALUCÃA Benitagla ....
9 1 ANDALUCÃA Benizalón
10 1 ANDALUCÃA Bentarique Balanegra
也就是说，每当遇到 `NA` 值，就表示一个新“区域”的开始。
最后，我想按这个新创建的区域列进行 `group_by`，并计算 `cumsum`，以便填充 `NA` 值。
之前在填充 `ANDALUCIA` 的 `NA` 值时，我对 `REGIONNAME` 做过类似的处理：
... %>%
  # Start a new group at every row whose name equals its own upper-cased
  # form, i.e. a name written entirely in capitals.
  group_by(grp = cumsum(RegionName == toupper(RegionName))) %>%
  # Label every row of a group with the first name found in that group.
  mutate(REGIONNAME = first(RegionName)) %>%
  relocate(REGIONNAME, .before = RegionName) %>%
  # On the row where REGIONNAME equals RegionName, replace each `Año` value
  # with the sum of the group's other rows (NA ignored); all other rows keep
  # their own value.
  mutate(across(
    starts_with("Año"),
    ~ ifelse(
      REGIONNAME == RegionName,
      # TRUE rather than T: T is an ordinary variable and can be reassigned.
      sum(.x[REGIONNAME != RegionName], na.rm = TRUE),
      .x
    )
  ))
数据:
# Sample data in dput style: 30 rows and 6 columns. Rows 2 and 22 hold NA in
# all three `Año` columns. The object carries the "grouped_df" class and a
# `groups` attribute describing a single group (grp == 1) covering rows 1:30.
df <- structure(
  list(
    grp = rep(1L, 30L),
    REGIONNAME = rep("ANDALUCÃA", 30L),
    RegionName = c(
      "ANDALUCÃA", "AlmerÃa", "Abla", "Abrucena", "Adra",
      "Albánchez", "Alboloduy", "Albox", "Alcolea", "Alcóntar",
      "Alcudia de Monteagud", "Alhabia", "Alhama de AlmerÃa", "Alicún",
      "AlmerÃa", "Almócita", "Alsodux", "Antas", "Arboleas",
      "Armuña de Almanzora", "Bacares", "Balanegra", "Bayárcal",
      "Bayarque", "Bédar", "Beires", "Benahadux", "Benitagla",
      "Benizalón", "Bentarique"
    ),
    `Año 2004_1` = c(
      32143, NA, 58, 6, 146, 12, 2, 33, 0, 1,
      1, 1, 13, 0, 748, 0, 1, 6, 16, 0,
      2, NA, 0, 0, 8, 0, 18, 1, 2, 0
    ),
    `Año 2004_2` = c(
      37962, NA, 61, 2, 211, 3, 2, 66, 1, 1,
      1, 0, 15, 1, 770, 0, 10, 12, 16, 0,
      1, NA, 1, 0, 2, 0, 21, 0, 0, 0
    ),
    `Año 2004_3` = c(
      32374, NA, 54, 1, 101, 3, 2, 35, 1, 2,
      0, 0, 14, 0, 701, 0, 3, 26, 14, 0,
      0, NA, 0, 3, 8, 0, 25, 0, 2, 0
    )
  ),
  class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -30L),
  # Grouping metadata: one row per group key plus the row indices it covers.
  groups = structure(
    list(
      grp = 1L,
      .rows = structure(
        list(1:30),
        ptype = integer(0),
        class = c("vctrs_list_of", "vctrs_vctr", "list")
      )
    ),
    class = c("tbl_df", "tbl", "data.frame"),
    row.names = c(NA, -1L),
    .drop = TRUE
  )
)