代码之家  ›  专栏  ›  技术社区  ›  Helen

R:加权加总

  •  0
  • Helen  · 技术社区  · 6 年前

    我在这张表上有一个数据集:

    set.seed(4561)    # Make the results reproducible
    
    df=data.frame(
    colour=rep(c("green","red","blue"),each=3),
    year=rep("2017",9),
    month=rep(c(1,2,3),3),
    price=c(200,254,188,450,434,490,100,99,97),
    work=ceiling(runif(9,30,60)),
    gain=ceiling(runif(9,1,10)),
    work_weighed_price=NA,
    gain_weighed_price=NA
    )
    

    对于每种颜色,年,月,我有一个价格(输出变量)和两个输入变量,称为收益和工作。实际上,我有更多的输入变量,但这足以表明我想用我的数据帧做什么。

    > df
      colour year month price work gain work_weighed_price gain_weighed_price
    1  green 2017     1   200   33    9                 NA                 NA
    2  green 2017     2   254   56    5                 NA                 NA
    3  green 2017     3   188   42    8                 NA                 NA
    4    red 2017     1   450   39    3                 NA                 NA
    5    red 2017     2   434   45    2                 NA                 NA
    6    red 2017     3   490   36    8                 NA                 NA
    7   blue 2017     1   100   50    8                 NA                 NA
    8   blue 2017     2    99   45    8                 NA                 NA
    9   blue 2017     3    97   56    4                 NA                 NA
    

    我想计算加权增益和功(以及加权价格),其中权重是该月和该年的价格除以各颜色的价格之和:

     desired_output=data.frame(
      year=rep("2017",3),
      month=rep(c(1,2,3),1),
      price=c(200*(200/(200+450+100))+450*(450/(200+450+100))+100*(100/(200+450+100)),
              254*(254/(254+434+99))+434*(434/(254+434+99))+99*(99/(254+434+99)),
              188*(188/(188+490+97))+490*(490/(188+490+97))+97*(97/(188+490+97))),
      work_weighed_price=c(47*(200/(200+450+100))+44*(450/(200+450+100))+52*(100/(200+450+100)),
                           44*(254/(254+434+99))+42*(434/(254+434+99))+32*(99/(254+434+99)),
                           38*(188/(188+490+97))+52*(490/(188+490+97))+52*(97/(188+490+97))) ,                                      
      gain_weighed_price=c(5*(200/(200+450+100))+8*(450/(200+450+100))+10*(100/(200+450+100)),
                           3*(254/(254+434+99))+7*(434/(254+434+99))+9*(99/(254+434+99)),
                           2*(188/(188+490+97))+4*(490/(188+490+97))+9*(97/(188+490+97)))
    )
    
    > desired_output
      year month    price work_weighed_price gain_weighed_price
    1 2017     1 336.6667           45.86667           7.466667
    2 2017     2 333.7649           41.38755           5.960610
    3 2017     3 367.5523           48.60387           4.140645
    

    我该如何在R中攻击它?

    3 回复  |  直到 6 年前
        1
  •  2
  •   IceCreamToucan    6 年前

    你可以使用 weighted.mean 功能

    df %>% 
      group_by(year, month) %>% 
      summarise_at(vars(price, work, gain), 
                   funs(price_weighted = weighted.mean(., price)))
    # # A tibble: 3 x 5
    # # Groups:   year [?]
    #    year month price_price_weighted work_price_weighted gain_price_weighted
    #    <int> <int>                <dbl>               <dbl>               <dbl>
    # 1  2017     1                  337                45.9                7.47
    # 2  2017     2                  334                41.4                5.96
    # 3  2017     3                  368                48.6                4.14
    

    或者,在 data.table

    library(data.table)
    setDT(df)
    
    df[, lapply(.SD, weighted.mean, price)
       , .SDcols = c('price', 'work', 'gain')
       , by = .(year, month)]
    
    #    year month    price     work     gain
    # 1: 2017     1 336.6667 45.86667 7.466667
    # 2: 2017     2 333.7649 41.38755 5.960610
    # 3: 2017     3 367.5523 48.60387 4.140645
    
        2
  •  2
  •   Weihuang Wong    6 年前

    使用 dplyr . 你的使用 runif 在你的例子中 df 如果没有设定种子,而且它与您想要的输出不一致,就会造成一些混乱。在下面的代码中,我使用 东风 这和你想要的结果是一致的。

    library(dplyr)
    df %>%
      group_by(year, month) %>%
      mutate(weight = price / sum(price)) %>%
      mutate_at(vars(price, work, gain), funs(weighed_price = . * weight)) %>%
      summarise_at(vars(ends_with("weighed_price")), sum)
    # # A tibble: 3 x 5
    # # Groups:   year [?]
    #    year month work_weighed_price gain_weighed_price price_weighed_price
    #   <int> <int>              <dbl>              <dbl>               <dbl>
    # 1  2017     1               45.9               7.47                337.
    # 2  2017     2               41.4               5.96                334.
    # 3  2017     3               48.6               4.14                368.
    

    东风 :

    structure(list(colour = c("green", "green", "green", "red", "red", 
    "red", "blue", "blue", "blue"), year = c(2017L, 2017L, 2017L, 
    2017L, 2017L, 2017L, 2017L, 2017L, 2017L), month = c(1L, 2L, 
    3L, 1L, 2L, 3L, 1L, 2L, 3L), price = c(200L, 254L, 188L, 450L, 
    434L, 490L, 100L, 99L, 97L), work = c(47L, 44L, 38L, 44L, 42L, 
    52L, 52L, 32L, 52L), gain = c(5L, 3L, 2L, 8L, 7L, 4L, 10L, 9L, 
    9L), work_weighed_price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA
    ), gain_weighed_price = c(NA, NA, NA, NA, NA, NA, NA, NA, NA)), .Names = c("colour", 
    "year", "month", "price", "work", "gain", "work_weighed_price", 
    "gain_weighed_price"), class = "data.frame", row.names = c(NA, 
    -9L))
    
        3
  •  2
  •   Rui Barradas    6 年前

    碱基R溶液可以是 tapply 说明。

    fun_price <- function(x){
        s <- sum(x)
        sum(x*(x/s))
    }
    
    fun_weighted <- function(x, w){
        s <- sum(w)
        sum(x*(w/s))
    }
    
    desired <- data.frame(year = unique(df$year), month = sort(unique(df$month)))
    
    desired$price <- with(df, tapply(price, month, FUN = fun_price))
    desired$work_weighed_price <- with(df, tapply(work, month, FUN = fun_weighted, w = price))
    desired$gain_weighed_price <- with(df, tapply(gain, month, FUN = fun_weighted, w = price))
    
    desired
    #  year month    price work_weighed_price gain_weighed_price
    #1 2017     1 336.6667           40.74092           6.622405
    #2 2017     2 333.7649           48.56834           4.984429
    #3 2017     3 367.5523           44.65052           6.659170