代码之家  ›  专栏  ›  技术社区  ›  Prevost

正确使用不适用近似值在数据表和管道

  •  0
  • Prevost  · 技术社区  · 6 年前

    我是新来的 data.table 我想试试看是否能让我的分析速度更快。我主要使用 knitr 编写 .rnw

    我在下面贴了一个样本,这绝不是一个比较的问题 data.table data.frame . 我想知道,如果我我下面的代码是什么,它应该是。

    data.tables 然后需要使用 na.approx 丢失的 NA Introduction to data.table vignette 从克兰和 JOINing data in R using data.table 从R酒吧。

    下面我使用的代码是我在 数据表

    还有,如果有人知道有没有办法 na.approx() 数据框 我们将不胜感激。注意 df_merged = as.data.frame(df_merged) 如果可能的话,我想去掉的线!

    library(data.table)
    library(zoo)
    library(dplyr)
    
    dt_function_test = function() {
        set.seed(123)
        # data.table
        dt_random = data.table(vals = runif(1E5, 0, 500))
        dt_na = data.table(vals = c(0, 250, 500),
                           ref1 = c(0.33, 0.45, 0.78),
                           ref2 = c(0.12, 0.79, 1))
    
        dt_merged = merge(dt_random[],
                          dt_na[],
                          all = TRUE)
    
        dt_merged = dt_merged[, lapply(.SD,
                                       na.approx),
                              by = vals]
    }
    
    
    df_function_test = function() {
        set.seed(123)
        # data.frame
        df_random = data.frame(vals = runif(1E5, 0, 500))
        df_na = data.frame(vals = c(0, 250, 500),
                           ref1 = c(0.33, 0.45, 0.78),
                           ref2 = c(0.12, 0.79, 1))
    
        df_merged = full_join(df_random,
                              df_na) %>% 
            na.approx
    
        df_merged = as.data.frame(df_merged)
    }
    
    print(system.time(dt_function_test()))
    #  user  system elapsed 
    # 11.42    0.00   11.46 
    
    print(system.time(df_function_test()))
    # Joining, by = "vals"
    #    user  system elapsed  
    #    0.05    0.05    0.10 
    
    1 回复  |  直到 6 年前
        1
  •  3
  •   chinsoon12    6 年前

    下面是使用 data.table zoo::na.approx ref* 列(请注意,还使用了较大的数据集):

    library(data.table)
    library(zoo)
    
    dt_function_test_0 = function() {
        set.seed(123)
        # data.table
        dt_random = data.table(vals = runif(1e7, 0, 500))
        dt_na = data.table(vals = c(0, 250, 500),
            ref1 = c(0.33, 0.45, 0.78),
            ref2 = c(0.12, 0.79, 1))
    
        cols <- c("ref1", "ref2")
    
        ##Version 0
        merge(dt_random, dt_na, all=TRUE)[, lapply(.SD, na.approx)]
    }
    
    
    dt_function_test_1 = function() {
        set.seed(123)
        # data.table
        dt_random = data.table(vals = runif(1e7, 0, 500))
        dt_na = data.table(vals = c(0, 250, 500),
            ref1 = c(0.33, 0.45, 0.78),
            ref2 = c(0.12, 0.79, 1))
    
        cols <- c("ref1", "ref2")
    
        ##Version 1: using update by reference
        merge(dt_random, dt_na, all = TRUE)[, 
            (cols) := lapply(.SD, na.approx), .SDcols=cols]
    }
    
    
    dt_function_test_2 = function() {
        set.seed(123)
        # data.table
        dt_random = data.table(vals = runif(1e7, 0, 500))
        dt_na = data.table(vals = c(0, 250, 500),
            ref1 = c(0.33, 0.45, 0.78),
            ref2 = c(0.12, 0.79, 1))
    
        cols <- c("ref1", "ref2")
        ##Version 2: using set
        dt_merged <- merge(dt_random, dt_na, all = TRUE)
        for (x in cols)
            set(dt_merged, j=x, value=na.approx(dt_merged[[x]]))
        dt_merged
    }
    

    定时输出:

    > system.time(dt_function_test_0())
       user  system elapsed 
       5.44    1.90    6.96 
    
    > system.time(dt_function_test_1())
       user  system elapsed 
       3.55    1.30    4.41 
    
    > system.time(dt_function_test_2())
       user  system elapsed 
       3.78    1.19    4.52