代码之家  ›  专栏  ›  技术社区  ›  alleyway

基于另一列的每个因素级别的比例数据框

  •  1
  • alleyway  · 技术社区  · 6 年前

    我想按月汇总一个数据框,使结果中的每一列都是基于下面数据框中 Records 列计算的各因子水平所占比例。我一直在尝试使用 dplyr,但还没有完全弄明白。

    library(dplyr)
    # Reproducible toy data: the 12 month labels repeated 10 times (120 rows),
    # a numeric Records column and four categorical columns V1..V4.
    set.seed(100)
    df <- data.frame(
      Month   = rep(paste0(1:12, "/1/2017"), times = 10),
      # Arguments are evaluated in the order written, so the random draws
      # happen in the sequence Records, V1, V2, V3, V4.
      Records = round(runif(120, min = 6000, max = 10000)),
      V1      = factor(sample(c("T", "F"), size = 120, replace = TRUE)),
      V2      = factor(sample(c("A", "B", "C"), size = 120, replace = TRUE)),
      V3      = factor(sample(c("X", "Y", "Z", "W"), size = 120, replace = TRUE)),
      V4      = factor(sample(c("YES", "NO", "Maybe"), size = 120, replace = TRUE))
    )
    

    > dput((resultsdf))
    structure(list(Month = c("1/1/2017", "2/1/2017", "3/1/2017", 
    "4/1/2017", "5/1/2017", "6/1/2017", "7/1/2017", "8/1/2017", "9/1/2017", 
    "10/1/2017", "11/1/2017", "12/1/2017"), V1.F = c(0.4, 0.71, 0.63, 
    0.35, 0.37, 0.41, 0.37, 0.61, 0.29, 0.5, 0.38, 0.82), V2.T = c(0.6, 
    0.29, 0.37, 0.65, 0.63, 0.59, 0.63, 0.39, 0.71, 0.5, 0.62, 0.18
    ), V2.A = c(0.2, 0.28, 0.3, 0.31, 0.29, 0.3, 0.32, 0.45, 0.1, 
    0.41, 0.3, 0.11), V2.B = c(0.59, 0.33, 0.19, 0.5, 0.51, 0.19, 
    0.59, 0.22, 0.77, 0.2, 0.41, 0.16), V2.C = c(0.22, 0.38, 0.51, 
    0.19, 0.21, 0.51, 0.09, 0.32, 0.12, 0.39, 0.29, 0.73), V3.W = c(0.42, 
    0.11, 0, 0.21, 0.23, 0.3, 0.12, 0.45, 0.32, 0.28, 0.19, 0.19), 
        V3.X = c(0.19, 0.32, 0.18, 0.19, 0.19, 0.11, 0.19, 0, 0.27, 
        0.11, 0.23, 0.19), V3.Y = c(0.3, 0.29, 0.39, 0.4, 0.18, 0.4, 
        0.62, 0.34, 0.21, 0.33, 0.21, 0.1), V3.Z = c(0.09, 0.28, 
        0.43, 0.2, 0.4, 0.19, 0.07, 0.2, 0.2, 0.29, 0.38, 0.53), 
        V4.Maybe = c(0.4, 0.23, 0.39, 0.38, 0.62, 0.5, 0.2, 0.4, 
        0.4, 0.32, 0.3, 0.49), V4.NO = c(0.32, 0.5, 0.39, 0.31, 0.18, 
        0.29, 0.22, 0.42, 0.29, 0.3, 0.44, 0.3), V4.YES = c(0.28, 
        0.27, 0.22, 0.31, 0.2, 0.21, 0.58, 0.18, 0.3, 0.39, 0.26, 
        0.22)), row.names = c(NA, -12L), class = c("tbl_df", "tbl", 
    "data.frame"), spec = structure(list(cols = list(Month = structure(list(), class = c("collector_character", 
    "collector")), V1.F = structure(list(), class = c("collector_double", 
    "collector")), V2.T = structure(list(), class = c("collector_double", 
    "collector")), V2.A = structure(list(), class = c("collector_double", 
    "collector")), V2.B = structure(list(), class = c("collector_double", 
    "collector")), V2.C = structure(list(), class = c("collector_double", 
    "collector")), V3.W = structure(list(), class = c("collector_double", 
    "collector")), V3.X = structure(list(), class = c("collector_double", 
    "collector")), V3.Y = structure(list(), class = c("collector_double", 
    "collector")), V3.Z = structure(list(), class = c("collector_double", 
    "collector")), V4.Maybe = structure(list(), class = c("collector_double", 
    "collector")), V4.NO = structure(list(), class = c("collector_double", 
    "collector")), V4.YES = structure(list(), class = c("collector_double", 
    "collector"))), default = structure(list(), class = c("collector_guess", 
    "collector"))), class = "col_spec"))
    
    2 回复  |  直到 6 年前
        1
  •  0
  •   Maurits Evers    6 年前

    请检查您的预期输出。我相信有些错误。

    tidyverse 选项:

    library(tidyverse)
    # Share of each factor level per month, one column per <variable>.<level>.
    # Steps: reshape V1..V4 to long form, count rows per (Month, key, value),
    # convert counts to within-(Month, key) proportions, then reshape to wide.
    df %>%
        # NOTE(review): gather() may warn that the factor attributes differ
        # across V1..V4 and are dropped; the values are then treated as text.
        gather(key, value, -Month, -Records) %>%
        group_by(Month, key, value) %>%
        # summarise() peels off the last grouping level, so the following
        # sum(freq) runs within each (Month, key) pair.
        summarise(freq = n()) %>%
        mutate(freq = freq / sum(freq)) %>%
        # Drop the leftover grouping so the result is a plain tibble.
        ungroup() %>%
        unite(col, key, value, sep = ".") %>%
        # A level that never occurs in a month has no row in the long data;
        # fill = 0 reports it as a proportion of 0 instead of NA.
        spread(col, freq, fill = 0)
    ## A tibble: 12 x 13
    #   Month  V1.F  V1.T  V2.A  V2.B  V2.C  V3.W  V3.X  V3.Y  V3.Z V4.Maybe V4.NO
    #   <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl> <dbl>
    # 1 1/1/…   0.4   0.6   0.2   0.6   0.2   0.4   0.2   0.3   0.1      0.4   0.3
    # 2 10/1…   0.5   0.5   0.4   0.2   0.4   0.3   0.1   0.3   0.3      0.3   0.3
    # 3 11/1…   0.4   0.6   0.3   0.4   0.3   0.2   0.2   0.2   0.4      0.3   0.4
    # 4 12/1…   0.8   0.2   0.1   0.2   0.7   0.2   0.2   0.1   0.5      0.5   0.3
    # 5 2/1/…   0.7   0.3   0.3   0.3   0.4   0.1   0.3   0.3   0.3      0.2   0.5
    # 6 3/1/…   0.6   0.4   0.3   0.2   0.5   0     0.2   0.4   0.4      0.4   0.4
    # 7 4/1/…   0.4   0.6   0.3   0.5   0.2   0.2   0.2   0.4   0.2      0.4   0.3
    # 8 5/1/…   0.4   0.6   0.3   0.5   0.2   0.2   0.2   0.2   0.4      0.6   0.2
    # 9 6/1/…   0.4   0.6   0.3   0.2   0.5   0.3   0.1   0.4   0.2      0.5   0.3
    #10 7/1/…   0.4   0.6   0.3   0.6   0.1   0.1   0.2   0.6   0.1      0.2   0.2
    #11 8/1/…   0.6   0.4   0.5   0.2   0.3   0.5   0     0.3   0.2      0.4   0.4
    #12 9/1/…   0.3   0.7   0.1   0.8   0.1   0.3   0.3   0.2   0.2      0.4   0.3
    ## ... with 1 more variable: V4.YES <dbl>
    
        2
  •  0
  •   Uwe    6 年前

    可以使用 R 的基础函数 table() 和 prop.table() 计算比例,再用 dcast() 重塑为宽格式。遗憾的是,我对 dplyr 还不够熟练,所以改用 data.table

    library(data.table)
    library(magrittr)
    # Per month, tabulate each of V1..V4, turn the counts into proportions and
    # stack the per-column tables: `.id` holds the source column name, `.x` the
    # factor level and `N` the proportion. Note that setDT() converts `df` to a
    # data.table by reference, so `df` itself is modified.
    setDT(df)[, lapply(.SD, function(.x) table(.x) %>% prop.table %>% as.data.table) %>% 
        rbindlist(idcol = TRUE), .SDcols = V1:V4, by = Month] %>% 
      # Name the value column explicitly instead of letting dcast() guess it
      # (guessing works here but emits a message).
      dcast(Month ~ .id + .x, value.var = "N")
    
            Month V1_F V1_T V2_A V2_B V2_C V3_W V3_X V3_Y V3_Z V4_Maybe V4_NO V4_YES
     1:  1/1/2017  0.4  0.6  0.2  0.6  0.2  0.4  0.2  0.3  0.1      0.4   0.3    0.3
     2: 10/1/2017  0.5  0.5  0.4  0.2  0.4  0.3  0.1  0.3  0.3      0.3   0.3    0.4
     3: 11/1/2017  0.4  0.6  0.3  0.4  0.3  0.2  0.2  0.2  0.4      0.3   0.4    0.3
     4: 12/1/2017  0.8  0.2  0.1  0.2  0.7  0.2  0.2  0.1  0.5      0.5   0.3    0.2
     5:  2/1/2017  0.7  0.3  0.3  0.3  0.4  0.1  0.3  0.3  0.3      0.2   0.5    0.3
     6:  3/1/2017  0.6  0.4  0.3  0.2  0.5  0.0  0.2  0.4  0.4      0.4   0.4    0.2
     7:  4/1/2017  0.4  0.6  0.3  0.5  0.2  0.2  0.2  0.4  0.2      0.4   0.3    0.3
     8:  5/1/2017  0.4  0.6  0.3  0.5  0.2  0.2  0.2  0.2  0.4      0.6   0.2    0.2
     9:  6/1/2017  0.4  0.6  0.3  0.2  0.5  0.3  0.1  0.4  0.2      0.5   0.3    0.2
    10:  7/1/2017  0.4  0.6  0.3  0.6  0.1  0.1  0.2  0.6  0.1      0.2   0.2    0.6
    11:  8/1/2017  0.6  0.4  0.5  0.2  0.3  0.5  0.0  0.3  0.2      0.4   0.4    0.2
    12:  9/1/2017  0.3  0.7  0.1  0.8  0.1  0.3  0.3  0.2  0.2      0.4   0.3    0.3