代码之家  ›  专栏  ›  技术社区  ›  cs0815

使用ifelse将NA映射到未知

r
  •  0
  • cs0815  · 技术社区  · 6 年前

    我正在使用:

    # Sample input: two numeric strings plus the literal text "NULL".
    raw <- c("0", "13", "NULL")

    data <- data.frame(raw)
    # "NULL" cannot be parsed as a number, so as.numeric() yields NA for that
    # row (with a coercion warning).
    data$number <- as.numeric(as.character(data[["raw"]]))

    data

    # Bucket `number` into labelled ranges. A comparison against NA is itself
    # NA, and ifelse() returns NA wherever its test is NA, so the NA row gets
    # NA from the very first test and never reaches the is.na() branch at the
    # bottom of the chain.
    data$category <- with(data,
      ifelse(number == 0, "0",
      ifelse(number > 0 & number <= 7, "[1 ... 7]",
      ifelse(number > 7 & number <= 14, "[8 ... 14]",
      ifelse(number > 14 & number <= 31, "[15 ... 31]",
      ifelse(number > 31 & number <= 62, "[32 ... 62]",
      ifelse(number > 62, "63++",
      ifelse(is.na(number), "unknown",
             "unknown"))))))))

    data
    

       raw number   category
    1    0      0          0
    2   13     13 [8 ... 14]
    3 NULL     NA       <NA>
    

    而不是:

       raw number   category
    1    0      0          0
    2   13     13 [8 ... 14]
    3 NULL     NA       unknown
    

    # Patch the rows whose number is missing after the ifelse() chain has run.
    data$category[is.na(data$number)] <- "unknown"
    

    在上面的代码块之后。

    6 回复  |  直到 6 年前
        1
  •  4
  •   RLave    6 年前

    可以使用 cut() 函数。

    # Bin `number` with cut(). Intervals are closed on the right, so the breaks
    # below give (-Inf, 0], (0, 7], (7, 14], (14, 31], (31, 62], (62, Inf].
    # Adjust the break points to change the ranges.
    data$category <- cut(
      data$number,
      breaks = c(-Inf, 0, 7, 14, 31, 62, Inf),
      # one label per interval, in the same order as the intervals
      labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"),
      right = TRUE
    )
    

    不幸的是,您还需要下面这两行代码把 NA 转换为 "Unknown":

    # cut() returned a factor; "Unknown" has to be registered as a level
    # before it can be stored in the column.
    data$category <- factor(data$category,
                            levels = c(levels(data$category), "Unknown"))
    # Rows whose number is NA were left NA by cut(); label them explicitly.
    data$category[is.na(data$number)] <- "Unknown"
    data
    #     raw number category
    # 1    0      0        0
    # 2   13     13  [8..14]
    # 3 NULL     NA  Unknown
    

    数据:

    # Rebuild the example data: two numeric strings and the text "NULL".
    raw <- c("0", "13", "NULL")

    data <- data.frame(raw)
    # Convert via as.character() so the values (not factor codes) are parsed
    # if `raw` was stored as a factor; the unparseable "NULL" becomes NA.
    data$number <- as.numeric(as.character(data[["raw"]]))
    

    # Time four ways of turning data$number into a labelled category, each
    # passed to microbenchmark() as a named expression. The expressions are
    # copies of the approaches shown in the answers; they are not required to
    # produce identical labels (e.g. "[1..7]" vs "[1 ... 7]", "Unknown" vs
    # "unknown").
    microbenchmark::microbenchmark(
      # cut(): bin with explicit breaks, then add an "Unknown" factor level
      # and assign it to the rows whose number is NA.
      cut = {data$category <- cut(data$number, 
                                  breaks=c(-Inf, 0, 7, 14, 31, 62, Inf), 
                                  labels = c("0", "[1..7]", "[8..14]", "[15 ... 31]", "[32 ... 62]", "63++"))
      levels(data$category) <- c(levels(data$category), "Unknown")
      data$category[is.na(data$number)] <- "Unknown"},
      # findInterval(): build the labels from the break vector, index them by
      # interval number, then replace NA results. The result is kept in `res`
      # and is not written back to `data`.
      findInt = {vec<-c(0,7,14,31,62)
      levels<-c(vec[1],sprintf("[%d ... %d]",(vec+1)[-length(vec)],vec[-1]),
                paste0(vec[length(vec)]+1,"++"))
      res<-levels[findInterval(data$number,vec,left.open=TRUE)+1]
      res[is.na(res)]<-"unknown"},
      # lapply(): classify one value at a time with a scalar if/else chain;
      # the NA check comes first. lapply() returns a list, so `category`
      # becomes a list column here.
      lapply = {data$category <- lapply(data$number,function(x) {
        if(is.na(x) || is.null(x)) "unknown"
        else if(x == 0) "0"
        else if(x > 0 & x <= 7) "[1 ... 7]"
        else if(x > 7 & x <= 14) "[8 ... 14]"
        else if(x > 14 & x <= 31) "[15 ... 31]"
        else if(x > 31 & x <= 62) "[32 ... 62]"
        else if(x > 62) "63++"
        else "unknown"
      })},
      # ifelse(): vectorised nested chain with the is.na() test first; "???"
      # is the fallback for values matched by no range.
      ifelse = {data$category <- 
        ifelse(is.na(data$number), "unknown", 
               ifelse(data$number == 0, "0",
                      ifelse(data$number > 0 & data$number <= 7, "[1 ... 7]",
                             ifelse(data$number > 7 & data$number <= 14, "[8 ... 14]",
                                    ifelse(data$number > 14 & data$number <= 31, "[15 ... 31]",
                                           ifelse(data$number > 31 & data$number <= 62, "[32 ... 62]",
                                                  ifelse(data$number > 62, "63++", "???")))))))}
                                   )
    

    给予:

    # Unit: microseconds
    #    expr     min       lq       mean   median       uq        max neval
    #     cut 132.207 139.4185  154.78149 144.9770 154.5925    283.043   100
    # findInt  18.329  21.7850   26.58004  26.2915  28.8460     60.996   100
    #  lapply  14.122  15.6250 4269.73574  17.2770  18.7800 425198.055   100
    #  ifelse  81.728  84.8835   96.09675  88.9400  96.3010    193.503   100
    
        2
  •  1
  •   jay.sf    6 年前

    也许你更愿意在 within() 中使用条件赋值,这样更清晰。

    # Build `category` inside within(), one conditional assignment per range;
    # the columns of `data` are referenced directly by name.
    data <- within(data, {
      # Start from an all-NA character vector with one entry per row, so every
      # assignment below is a same-type, same-length replacement.
      category <- rep(NA_character_, length(number))
      category[number == 0] <- "0"
      category[number > 0 & number <= 7] <- "[1 ... 7]"
      category[number > 7 & number <= 14] <- "[8 ... 14]"
      category[number > 14 & number <= 31] <- "[15 ... 31]"
      category[number > 31 & number <= 62] <- "[32 ... 62]"
      # Values above 62 get their own label (previously a copy of the
      # "[32 ... 62]" label from the line above).
      category[number > 62] <- "63++"
      # The comparisons above are NA for missing numbers and assign nothing to
      # those rows, so label them explicitly.
      category[is.na(number)] <- "unknown"
    })
    
    > data
       raw number   category
    1    0      0          0
    2   13     13 [8 ... 14]
    3 NULL     NA    unknown
    
        3
  •  1
  •   NelsonGon phoxis    6 年前

    我不确定这是否符合你当前的做法(这里把 data 重命名为 df):

    # Overwrite the NA entries of `category` with the label "Unknown",
    # then show the column.
    df$category <- replace(df$category, is.na(df$category), "Unknown")
    df$category
    
        4
  •  1
  •   s_baldur    6 年前

    如果把 is.na() 的判断移到最前面,您当前的代码就能正常工作:

    # Test for NA first: that test is never NA itself, so missing numbers get
    # "unknown" before any comparison can turn them into NA. "???" is the
    # fallback for values that match none of the ranges.
    data$category <- with(data,
      ifelse(is.na(number), "unknown",
      ifelse(number == 0, "0",
      ifelse(number > 0 & number <= 7, "[1 ... 7]",
      ifelse(number > 7 & number <= 14, "[8 ... 14]",
      ifelse(number > 14 & number <= 31, "[15 ... 31]",
      ifelse(number > 31 & number <= 62, "[32 ... 62]",
      ifelse(number > 62, "63++", "???"))))))))
    
    > data
       raw number   category
    1    0      0          0
    2   13     13 [8 ... 14]
    3 NULL     NA    unknown
    
        5
  •  1
  •   Soren    6 年前

    # Classify each number with a scalar if/else chain. vapply() with
    # character(1) returns a plain character vector (lapply() would store a
    # list column) and checks that every result is a single string.
    data$category <- vapply(data$number, function(x) {
      # Elements of an atomic vector are never NULL, so only NA needs a guard;
      # it must come first because comparisons on NA cannot be used in if().
      if (is.na(x)) {
        "unknown"
      } else if (x == 0) {
        "0"
      } else if (x > 0 && x <= 7) {
        "[1 ... 7]"
      } else if (x > 7 && x <= 14) {
        "[8 ... 14]"
      } else if (x > 14 && x <= 31) {
        "[15 ... 31]"
      } else if (x > 31 && x <= 62) {
        "[32 ... 62]"
      } else if (x > 62) {
        "63++"
      } else {
        # Anything not covered above (e.g. negative values).
        "unknown"
      }
    }, character(1), USE.NAMES = FALSE)
    
        6
  •  1
  •   nicola    6 年前

    试试这个:

    # Upper bound of each closed range; values above the last bound fall into
    # the open-ended final bucket.
    vec <- c(0, 7, 14, 31, 62)
    # Labels: "0", then one "[lo ... hi]" per consecutive pair of bounds,
    # then "<last bound + 1>++".
    levels <- c(
      vec[1],
      sprintf("[%d ... %d]", head(vec, -1) + 1, tail(vec, -1)),
      paste0(tail(vec, 1) + 1, "++")
    )
    # findInterval() numbers the intervals from 0, so shift by one to index
    # into the label vector.
    res <- levels[findInterval(data$number, vec, left.open = TRUE) + 1]
    # Missing numbers produce NA here; give them an explicit label.
    res[is.na(res)] <- "unknown"