代码之家  ›  专栏  ›  技术社区  ›  mckisa

应用group_by和SUMMARE(sum),但保留包含非相关冲突数据的列?

  •  10
  • mckisa  · 技术社区  · 7 年前

    我的问题与 Applying group_by and summarise on data while keeping all the columns' info

    Label <- c("203c","203c","204a","204a","204a","204a","204a","204a","204a","204a")
    Type <- c("wholefish","flesh","flesh","fleshdelip","formula","formuladelip",
              "formula","formuladelip","wholefish", "wholefishdelip")
    Proportion <- c(1,1,0.67714,0.67714,0.32285,0.32285,0.32285, 
                    0.32285, 0.67714,0.67714)
    N <- (1:10)
    C <- (1:10)
    Code <- c("c","a","a","b","a","b","c","d","c","d")
    
    df <- data.frame(Label,Type, Proportion, N, C, Code)
    df
    
       Label           Type Proportion  N  C Code
    1   203c      wholefish     1.0000  1  1    c
    2   203c          flesh     1.0000  2  2    a
    3   204a          flesh     0.6771  3  3    a
    4   204a     fleshdelip     0.6771  4  4    b
    5   204a        formula     0.3228  5  5    a
    6   204a   formuladelip     0.3228  6  6    b
    7   204a        formula     0.3228  7  7    c
    8   204a   formuladelip     0.3228  8  8    d
    9   204a      wholefish     0.6771  9  9    c
    10  204a wholefishdelip     0.6771 10 10    d
    
    total <- df %>% 
      #where the Label and Code are the same the Proportion, N and C 
      #should be added together respectively
      group_by(Label, Code) %>% 
      #total proportion should add up to 1 
      #my way of checking that the correct task has been completed
      summarise_if(is.numeric, sum)
    
    # A tibble: 6 x 5
    # Groups:   Label [?]
       Label   Code Proportion     N     C
      <fctr> <fctr>      <dbl> <int> <int>
    1   203c      a    1.00000     2     2
    2   203c      c    1.00000     1     1
    3   204a      a    0.99999     8     8
    4   204a      b    0.99999    10    10
    5   204a      c    0.99999    16    16
    6   204a      d    0.99999    18    18
    

    直到这里,我得到了我想要的。现在我想包括列类型,尽管它被排除在外,因为值是冲突的。这是我想要的结果

    # A tibble: 6 x 5
    # Groups:   Label [?]
       Label   Code Proportion     N     C    Type
      <fctr> <fctr>      <dbl> <int> <int>  <fctr>
    1   203c      a    1.00000     2     2    wholefish
    2   203c      c    1.00000     1     1    flesh
    3   204a      a    0.99999     8     8    flesh_formula
    4   204a      b    0.99999    10    10    fleshdelip_formuladelip
    5   204a      c    0.99999    16    16    wholefish_formula
    6   204a      d    0.99999    18    18    wholefishdelip_formuladelip
    

    我试过了 ungroup() 以及 mutate unite 但如果没有任何效果,我们将不胜感激

    3 回复  |  直到 7 年前
        1
  •  8
  •   akuiper    7 年前

    以下是另外两个选项:

    1) 将列嵌套到一列中,然后通过检查数据类型自定义摘要:

    df %>% 
        group_by(Label, Code) %>% nest() %>% 
        mutate(data = map(data, 
            ~ as.tibble(map(.x, ~ if(is.numeric(.x)) sum(.x) else paste(.x, collapse="_")))
              )
        ) %>% unnest()
    
    # A tibble: 6 x 6
    #   Label   Code                        Type Proportion     N     C
    #  <fctr> <fctr>                       <chr>      <dbl> <int> <int>
    #1   203c      c                   wholefish    1.00000     1     1
    #2   203c      a                       flesh    1.00000     2     2
    #3   204a      a               flesh_formula    0.99999     8     8
    #4   204a      b     fleshdelip_formuladelip    0.99999    10    10
    #5   204a      c           formula_wholefish    0.99999    16    16
    #6   204a      d formuladelip_wholefishdelip    0.99999    18    18
    

    2) 单独总结,然后加入结果:

    numeric <- df %>% 
        group_by(Label, Code) %>% 
        summarise_if(is.numeric, sum)
    
    character <- df %>% 
        group_by(Label, Code) %>% 
        summarise_if(~ is.character(.) || is.factor(.), ~ paste(., collapse="_"))
    
    inner_join(numeric, character, by = c("Label", "Code"))
    # A tibble: 6 x 6
    # Groups:   Label [?]
    #   Label   Code Proportion     N     C                        Type
    #  <fctr> <fctr>      <dbl> <int> <int>                       <chr>
    #1   203c      a    1.00000     2     2                       flesh
    #2   203c      c    1.00000     1     1                   wholefish
    #3   204a      a    0.99999     8     8               flesh_formula
    #4   204a      b    0.99999    10    10     fleshdelip_formuladelip
    #5   204a      c    0.99999    16    16           formula_wholefish
    #6   204a      d    0.99999    18    18 formuladelip_wholefishdelip
    
        2
  •  6
  •   Jake Thompson    7 年前

    tidyverse公司 解决方案 group_by 声明相同。关键是使用 mutate_if 对于每个变量类型,首先(即数字、字符),然后获得不同的行。


    library(tidyverse)
    #> Loading tidyverse: ggplot2
    #> Loading tidyverse: tibble
    #> Loading tidyverse: tidyr
    #> Loading tidyverse: readr
    #> Loading tidyverse: purrr
    #> Loading tidyverse: dplyr
    #> Conflicts with tidy packages ----------------------------------------------
    #> filter(): dplyr, stats
    #> lag():    dplyr, stats
    
    Label <- c("203c", "203c", "204a", "204a", "204a", "204a", "204a", "204a",
      "204a", "204a")
    Type <- c("wholefish", "flesh", "flesh", "fleshdelip", "formula", "formuladelip",
      "formula", "formuladelip", "wholefish", "wholefishdelip")
    Proportion <- c(1, 1, 0.67714, 0.67714, 0.32285, 0.32285, 0.32285, 0.32285,
      0.67714, 0.67714)
    N <- (1:10)
    C <- (1:10)
    Code <- c("c", "a", "a", "b", "a", "b", "c", "d", "c", "d")
    
    df <- data_frame(Label, Type, Proportion, N, C, Code)
    df
    #> # A tibble: 10 x 6
    #>    Label           Type Proportion     N     C  Code
    #>    <chr>          <chr>      <dbl> <int> <int> <chr>
    #>  1  203c      wholefish    1.00000     1     1     c
    #>  2  203c          flesh    1.00000     2     2     a
    #>  3  204a          flesh    0.67714     3     3     a
    #>  4  204a     fleshdelip    0.67714     4     4     b
    #>  5  204a        formula    0.32285     5     5     a
    #>  6  204a   formuladelip    0.32285     6     6     b
    #>  7  204a        formula    0.32285     7     7     c
    #>  8  204a   formuladelip    0.32285     8     8     d
    #>  9  204a      wholefish    0.67714     9     9     c
    #> 10  204a wholefishdelip    0.67714    10    10     d
    
    df %>%
      group_by(Label, Code) %>%
      mutate_if(is.numeric, sum) %>%
      mutate_if(is.character, funs(paste(unique(.), collapse = "_"))) %>%
      distinct()
    #> # A tibble: 6 x 6
    #> # Groups:   Label, Code [6]
    #>   Label                        Type Proportion     N     C  Code
    #>   <chr>                       <chr>      <dbl> <int> <int> <chr>
    #> 1  203c                   wholefish    1.00000     1     1     c
    #> 2  203c                       flesh    1.00000     2     2     a
    #> 3  204a               flesh_formula    0.99999     8     8     a
    #> 4  204a     fleshdelip_formuladelip    0.99999    10    10     b
    #> 5  204a           formula_wholefish    0.99999    16    16     c
    #> 6  204a formuladelip_wholefishdelip    0.99999    18    18     d
    
        3
  •  2
  •   Mako212    7 年前

    这是 data.table 解决方案,我假设你想要 mean() 比例,因为这些分组比例可能不是相加的。

    setDT(df)
    
    df[, .(Type =paste(Type,collapse="_"), 
      Proportion=mean(Proportion),N= sum(N),C=sum(C)), by=.(Label,Code)]
      [order(Label)]
    
       Label Code                        Type Proportion  N  C
    1:  203c    c                   wholefish   1.000000  1  1
    2:  203c    a                       flesh   1.000000  2  2
    3:  204a    a               flesh_formula   0.499995  8  8
    4:  204a    b     fleshdelip_formuladelip   0.499995 10 10
    5:  204a    c           formula_wholefish   0.499995 16 16
    6:  204a    d formuladelip_wholefishdelip   0.499995 18 18
    

    dplyr 解决方案,但它有效:

    df %>% group_by(Label, Code) %>% 
      mutate(Type = paste(Type,collapse="_")) %>% 
      group_by(Label,Type,Code) %>% 
      summarise(N=sum(N),C=sum(C),Proportion=mean(Proportion))
    

    注意,这里的关键是在创建组合后重新分组 Type

       Label                        Type   Code     N     C Proportion
      <fctr>                       <chr> <fctr> <int> <int>      <dbl>
    1   203c                       flesh      a     2     2   1.000000
    2   203c                   wholefish      c     1     1   1.000000
    3   204a               flesh_formula      a     8     8   0.499995
    4   204a     fleshdelip_formuladelip      b    10    10   0.499995
    5   204a           formula_wholefish      c    16    16   0.499995
    6   204a formuladelip_wholefishdelip      d    18    18   0.499995