代码之家  ›  专栏  ›  技术社区  ›  SteveS

如何在 R 中清除列表列中值为 NULL/空/NA 的行?

  •  0
  • SteveS  · 技术社区  · 6 年前

    给定以下以列表作为值的数据帧:

    # Example data frame (5 rows) with two list-columns:
    #   keys.userId        - character user ids
    #   user_data.SSIDs    - alternates a 47-element character vector and NULL
    #   user_data.contacts - alternates an empty list() and NULL
    df <- structure(list(keys.userId = c("9875", "5465", 
    "1234", "4567", "8910"), user_data.SSIDs = list(
        c("qjJf5iZtYboSPvqe1oa/xg==", "ul7kroLEB2cZx6AMGhjnrA==", 
        "OYRT/hYu1Dl3/S5WIWyLHA==", "HFiSH/Tu0RSaQgIbDEZfeA==", "gUBxBfxjGdyPNzqYX7t6nA==", 
        "m7UqzqaXUm1GkXMbxf+SJw==", "KjxvZwsVCNSTtXXKiidmjw==", "3UOqe+4qPVZYXvja8GBEqg==", 
        "a9Ba8b19tY/bprM7WA326A==", "uStr9Fg+JlU9B+hdBCafZg==", "i9J11W00HFmoeCDObOfSdA==", 
        "A9vOz8zSrwDiQcKv8hk64Q==", "/8QeMoqFwd/eJ+/6NKk1iQ==", "HbyJqQxUfH6oiW3skqPzGQ==", 
        "51H9RcZmdRgkgg4X6U/mhQ==", "dwat86ppe1b/WXSaGi8r3w==", "yBCbMedxtZdiGFXmTfk2eQ==", 
        "wKSIsw3sDPRQhLIhdQkBJw==", "3LkFUenHOXWL4Be5T4XmaQ==", "Krni6eGQUnZuL/jU0MzKNA==", 
        "Wt9BCH4guyC4oSIHwE8XGA==", "rbgxp/3YPdHiownOdZHf+A==", "34rmNRgT/xFDXIDwHKIY6Q==", 
        "pT3zFcGdlJKmR+khJLMoVw==", "eni3X9I2B4KRK+sho2MbjA==", "sxuba/1Brg4CrYL8AFv8ZQ==", 
        "EYIxPOXPVvop99YD0vjXPA==", "JC1xqrtmQEaohwzviYDFYA==", "qv+cfEEqsIGrDFuEqpkQuw==", 
        "d3xXMR1RDKZdrDwQd97kNQ==", "qU1JULumBTqw+m/rLr4E7A==", "teDCJvNdyjktWD6leDpCmw==", 
        "ytSBHvzbEACq56aEHZlXEw==", "eV7WGimPD01weRI19ojO3g==", "vNkJyD9KOzOprGkYyfViMA==", 
        "r8jjZXWyax7JPfJUPFwRTQ==", "rJ1N3ONwDBK+jwFf+7xeHg==", "2xPbTqIww1KI/tVL2UH1cw==", 
        "1hk1AOU4DZXV52Auyr2FHA==", "aNH8uS5nrlwcHb8rLdZeXQ==", "5JPQs2z4N1Dru0dGI9ImBQ==", 
        "nbQIn5G4uyl8b1+A6aVkQg==", "A/UcwEccakKDuiATgoP1NA==", "JC57Ib2V7fOU/CgBk2R41g==", 
        "PsI8Ys++JveA+SuafbB8pg==", "eXiuBymYN+tcbjtpM9Vxmg==", "jEdP3Rs02d/4UE8G1GeE3A=="
        ), NULL, c("qjJf5iZtYboSPvqe1oa/xg==", "ul7kroLEB2cZx6AMGhjnrA==", 
        "OYRT/hYu1Dl3/S5WIWyLHA==", "HFiSH/Tu0RSaQgIbDEZfeA==", "gUBxBfxjGdyPNzqYX7t6nA==", 
        "m7UqzqaXUm1GkXMbxf+SJw==", "KjxvZwsVCNSTtXXKiidmjw==", "3UOqe+4qPVZYXvja8GBEqg==", 
        "a9Ba8b19tY/bprM7WA326A==", "uStr9Fg+JlU9B+hdBCafZg==", "i9J11W00HFmoeCDObOfSdA==", 
        "A9vOz8zSrwDiQcKv8hk64Q==", "/8QeMoqFwd/eJ+/6NKk1iQ==", "HbyJqQxUfH6oiW3skqPzGQ==", 
        "51H9RcZmdRgkgg4X6U/mhQ==", "dwat86ppe1b/WXSaGi8r3w==", "yBCbMedxtZdiGFXmTfk2eQ==", 
        "wKSIsw3sDPRQhLIhdQkBJw==", "3LkFUenHOXWL4Be5T4XmaQ==", "Krni6eGQUnZuL/jU0MzKNA==", 
        "Wt9BCH4guyC4oSIHwE8XGA==", "rbgxp/3YPdHiownOdZHf+A==", "34rmNRgT/xFDXIDwHKIY6Q==", 
        "pT3zFcGdlJKmR+khJLMoVw==", "eni3X9I2B4KRK+sho2MbjA==", "sxuba/1Brg4CrYL8AFv8ZQ==", 
        "EYIxPOXPVvop99YD0vjXPA==", "JC1xqrtmQEaohwzviYDFYA==", "qv+cfEEqsIGrDFuEqpkQuw==", 
        "d3xXMR1RDKZdrDwQd97kNQ==", "qU1JULumBTqw+m/rLr4E7A==", "teDCJvNdyjktWD6leDpCmw==", 
        "ytSBHvzbEACq56aEHZlXEw==", "eV7WGimPD01weRI19ojO3g==", "vNkJyD9KOzOprGkYyfViMA==", 
        "r8jjZXWyax7JPfJUPFwRTQ==", "rJ1N3ONwDBK+jwFf+7xeHg==", "2xPbTqIww1KI/tVL2UH1cw==", 
        "1hk1AOU4DZXV52Auyr2FHA==", "aNH8uS5nrlwcHb8rLdZeXQ==", "5JPQs2z4N1Dru0dGI9ImBQ==", 
        "nbQIn5G4uyl8b1+A6aVkQg==", "A/UcwEccakKDuiATgoP1NA==", "JC57Ib2V7fOU/CgBk2R41g==", 
        "PsI8Ys++JveA+SuafbB8pg==", "eXiuBymYN+tcbjtpM9Vxmg==", "jEdP3Rs02d/4UE8G1GeE3A=="
        ), NULL, c("qjJf5iZtYboSPvqe1oa/xg==", "ul7kroLEB2cZx6AMGhjnrA==", 
        "OYRT/hYu1Dl3/S5WIWyLHA==", "HFiSH/Tu0RSaQgIbDEZfeA==", "gUBxBfxjGdyPNzqYX7t6nA==", 
        "m7UqzqaXUm1GkXMbxf+SJw==", "KjxvZwsVCNSTtXXKiidmjw==", "3UOqe+4qPVZYXvja8GBEqg==", 
        "a9Ba8b19tY/bprM7WA326A==", "uStr9Fg+JlU9B+hdBCafZg==", "i9J11W00HFmoeCDObOfSdA==", 
        "A9vOz8zSrwDiQcKv8hk64Q==", "/8QeMoqFwd/eJ+/6NKk1iQ==", "HbyJqQxUfH6oiW3skqPzGQ==", 
        "51H9RcZmdRgkgg4X6U/mhQ==", "dwat86ppe1b/WXSaGi8r3w==", "yBCbMedxtZdiGFXmTfk2eQ==", 
        "wKSIsw3sDPRQhLIhdQkBJw==", "3LkFUenHOXWL4Be5T4XmaQ==", "Krni6eGQUnZuL/jU0MzKNA==", 
        "Wt9BCH4guyC4oSIHwE8XGA==", "rbgxp/3YPdHiownOdZHf+A==", "34rmNRgT/xFDXIDwHKIY6Q==", 
        "pT3zFcGdlJKmR+khJLMoVw==", "eni3X9I2B4KRK+sho2MbjA==", "sxuba/1Brg4CrYL8AFv8ZQ==", 
        "EYIxPOXPVvop99YD0vjXPA==", "JC1xqrtmQEaohwzviYDFYA==", "qv+cfEEqsIGrDFuEqpkQuw==", 
        "d3xXMR1RDKZdrDwQd97kNQ==", "qU1JULumBTqw+m/rLr4E7A==", "teDCJvNdyjktWD6leDpCmw==", 
        "ytSBHvzbEACq56aEHZlXEw==", "eV7WGimPD01weRI19ojO3g==", "vNkJyD9KOzOprGkYyfViMA==", 
        "r8jjZXWyax7JPfJUPFwRTQ==", "rJ1N3ONwDBK+jwFf+7xeHg==", "2xPbTqIww1KI/tVL2UH1cw==", 
        "1hk1AOU4DZXV52Auyr2FHA==", "aNH8uS5nrlwcHb8rLdZeXQ==", "5JPQs2z4N1Dru0dGI9ImBQ==", 
        "nbQIn5G4uyl8b1+A6aVkQg==", "A/UcwEccakKDuiATgoP1NA==", "JC57Ib2V7fOU/CgBk2R41g==", 
        "PsI8Ys++JveA+SuafbB8pg==", "eXiuBymYN+tcbjtpM9Vxmg==", "jEdP3Rs02d/4UE8G1GeE3A=="
        )), user_data.contacts = list(list(), NULL, list(), NULL, 
        list())), row.names = c(NA, 5L), class = "data.frame")
    

    我想使用以下函数计算 Jaccard 相似度:

    # Jaccard similarity of two vectors treated as sets:
    # size of the intersection divided by size of the union.
    # Two empty inputs give 0 / 0, i.e. NaN.
    jaccard <- function(vector1, vector2) {
      shared_count <- length(intersect(vector1, vector2))
      total_count <- length(union(vector1, vector2))
      shared_count / total_count
    }
    
    # Element-wise wrapper: calls jaccard() on corresponding elements of its
    # two arguments, so it can be used on whole (list-)columns.
    jaccardV <- Vectorize(FUN = jaccard)
    

    我想清理一下数据。

    我想删除列表列中值为 NULL、NA 或长度为 0 的行。

    请告诉我正确的方法是什么,我首先想到的是:

    # NOTE: mutate() hands the whole user_data.contacts column to this
    # expression, so is.null(unlist(...)) is computed once for the entire
    # list-column and that single value is reused for every row; it is not a
    # per-row test.
    df %>% 
        dplyr::mutate(isNull = ifelse(is.null(unlist(user_data.contacts)), TRUE, FALSE))
    

    但它在每一行都返回 FALSE。

    有没有办法做到正确有效?请告知。

    最后,在清洁之后,我将运行:

    # Intended final pipeline. `cleaning` is not defined in this snippet; it
    # appears to stand in for the row-cleaning step being asked about.
    # The last step compares each row's contacts with the previous row's.
    df %>% 
      cleaning %>%
    dplyr::mutate(contacts_jaccard = jaccardV(user_data.contacts, lag(user_data.contacts)))
    
    3 回复  |  直到 6 年前
        1
  •  1
  •   JBGruber    6 年前

    这是一种方法:

    library(dplyr)
    library(purrr)
    # The filters are applied in sequence, so each later predicate only sees
    # the rows that passed the earlier ones.
    df %>% 
      # drop rows whose list-column entry is NULL
      filter(!map_lgl(user_data.contacts, is.null)) %>% 
      # drop rows whose entry has length 0 (e.g. list())
      filter(!map_lgl(user_data.contacts, function(x) length(x) == 0)) %>% 
      # drop rows whose entry is NA; map_lgl() needs exactly one TRUE/FALSE per
      # entry, so this step assumes every remaining entry has length 1
      filter(!map_lgl(user_data.contacts, is.na)) %>% 
      # score each row's contacts against the previous row's contacts
      mutate(contacts_jaccard = jaccardV(user_data.contacts, lag(user_data.contacts)))
    

    这不会产生任何输出,因为在您提供的示例数据中,所有行都会被删除。如果唯一的目的是稍后删除这些行,就不需要创建新列 isNull。比起 sapply,我更喜欢使用 map 系列函数,因为它可以很容易地强制指定结果类型。在这种情况下, map_lgl 只会返回 TRUE/FALSE 结果。

    请注意,如果列表列中某个元素的长度大于 1,则应改为使用:

    # Keep only rows whose contacts entry is non-empty (NULL has length 0 as
    # well) and whose first element is not NA, then score each remaining row
    # against the previous one.
    df %>%
      filter(
        !map_lgl(
          user_data.contacts,
          # `||` short-circuits, so is.na() is never reached for NULL or
          # zero-length entries
          function(entry) length(entry) == 0 || is.na(entry)[1]
        )
      ) %>%
      mutate(
        contacts_jaccard = jaccardV(user_data.contacts, lag(user_data.contacts))
      )
    
        2
  •  1
  •   Henry Cyranka    6 年前

    我认为这对你有用。遍历 user_data.contacts 列,创建一个新列来标记每个元素是否为 NULL,然后根据这个新列进行筛选。

    library(tidyverse)
    
    # Flag rows whose user_data.contacts entry is NULL, then keep the others.
    # vapply() with a logical(1) template always returns a logical vector (also
    # for zero rows), whereas sapply()'s return type depends on its input.
    # Base is.null() performs the same per-element test as is_null().
    new_df <- df %>%
      as_tibble() %>%
      mutate(is_Null = vapply(user_data.contacts, is.null, logical(1))) %>%
      filter(!is_Null)
    
    new_df
    
        3
  •  1
  •   parkerchad81    6 年前

    列 user_data.contacts 是一个列表列,当您对该列执行 mutate 时,它会被当作一个完整的列表来处理。请尝试在 mutate 之前先用 rowwise 将数据按行分组。

    library(dplyr)
    
    # Evaluate the NULL/empty test once per row: under rowwise() the expression
    # sees only the current row's list-column entry, so both NULL and list()
    # yield TRUE.
    df %>%
      rowwise() %>%
      mutate(isNull = is.null(unlist(user_data.contacts))) %>%
      # drop the row-wise grouping so later verbs such as lag() operate on the
      # whole column again instead of one row at a time
      ungroup()
    
    
    # A tibble: 5 x 4 # added data to 1st observation for test
      keys.userId user_data.SSIDs user_data.contacts isNull
      <chr>       <list>          <list>             <lgl> 
    1 9875        <chr [47]>      <list [3]>         FALSE 
    2 5465        <NULL>          <NULL>             TRUE  
    3 1234        <chr [47]>      <list [0]>         TRUE  
    4 4567        <NULL>          <NULL>             TRUE  
    5 8910        <chr [47]>      <list [0]>         TRUE