代码之家  ›  专栏  ›  技术社区  ›  hello_friend

R、 获取数据并将excel/csv文件存储为单独的数据帧

  •  0
  • hello_friend  · 技术社区  · 6 年前

    我正在尝试以编程方式检索数据并将其存储在具有适当列名等的(命名的)数据帧中。我的for循环无法检索数据并将其存储在具有适当名称的数据帧中(请参阅:pg$Df\u names)。为你的帮助干杯。固定代码可供参考iv)-请提供您可能有任何改进。

    necessary_packages <- c("readr", "readxl", "xlsx","stringr", "dplyr", "tidyr", "rvest", "xml2","SnowballC", "httr", "Rcurl", "rvest")
    
    new_packages <- necessary_packages[!(necessary_packages %in% 
                                           installed.packages()[,"Package"])]
    
    if(length(new_packages)) install.packages(new_packages)
    
    lapply(necessary_packages, require, character.only = TRUE)
    

    i) 创建包含RBA数据的所有URL和数据帧名称的数据帧:

    Base_URL <- "http://www.rba.gov.au/statistics/tables/"
    

    ii)通过文本将html读入R parse,对于任何csv或excel文件:

    pg <- as.data.frame((html_attr(html_nodes(read_html(Base_URL), "a"), "href")), stringsAsFactors = FALSE)
    file_types <- c(".xls", ".csv", ".xlsx")
    pg <- as.data.frame(paste0(Base_URL, gsub("/statistics/tables/",
                                                "", 
                                                unique(grep(paste(file_types, collapse = "|"), 
                                                            pg[,1], value = TRUE)))), stringsAsFactors = FALSE)
    colnames(pg) <- "urls"
    

    iii)存储数据帧名称的字符向量:

    pg$Df_names <- gsub("\\-|[[:punct:]]", "_", 
                        gsub("\\..*",
                        "",
                        sapply(strsplit(pg$urls, split= "\\/"), function(x){x[length(x)]})))
    

    iv)将每个excel文件存储为一个单独的数据框,该数据框由字符向量的每个元素命名:

    df_names_list <- NULL
    files_to_remove <- NULL
    df_list <- list()
    h = 1
    i = 1
    j = 1
    for (i in 1:nrow(pg)) {
      if (length(grep(".xls", pg$urls[i])) > 0) {
        tryit <- try(GET(pg$urls[i], write_disk(tf <-
                                                  tempfile(fileext = ".xls"), overwrite = TRUE)))
        if (inherits(tryit, "try-error")) {
          i <- i + 1
        } else{
          GET(pg$urls[i], write_disk(tf <-
                                       tempfile(fileext = ".xls"), overwrite = TRUE))
          shiet <- excel_sheets(tf)[which(
            excel_sheets(tf) != "Summary" &
              excel_sheets(tf) != "Notes" &
              excel_sheets(tf) != "Series breaks"
          )]
        }
        for (j in 1:length(shiet)) {
          tryit <- try(tmp1 <-
                         as.data.frame(read_xls(tf, sheet = shiet[j], col_names = FALSE)))
          if (inherits(tryit, "try-error")) {
            j <- j + 1
          } else{
            tmp1 <-
              as.data.frame(read_xls(tf, sheet = shiet[j], col_names = FALSE))
          }
          if (ncol(tmp1) > 2) {
            tryit <- try(X <- as.data.frame(read_xls(
              gsub('[\\]', '\\/', tf),
              skip = 11,
              col_names = FALSE,
              sheet = shiet[j]
            )))
            if (inherits(tryit, "try-error")) {
              j <- j + 1
            } else{
              X <- as.data.frame(read_xls(
                gsub('[\\]', '\\/', tf),
                skip = 11,
                col_names = FALSE,
                sheet = shiet[j]
              ))
            }
            colnames(X) <- c("Date",
                             c(trimws(
                               gsub(
                                 "for series breaks.*|[[:punct:]]| |Seenot",
                                 "",
                                 tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
                                                             language = "porter"))
                               ),
                               which = c("both", "left", "right")
                             )))
            df_list[[h]] <- X
            df_names_list <-
              c(df_names_list, gsub(" ", "_", trimws(
                paste0(pg$Df_names[i], "_", shiet[j]),
                which = c("both", "left", "right")
              )))
            h <- h + 1
          } else{
            rm(tmp1)
            j <- j + 1
          }
        }
      } else if (length(grep(".xslx", pg$urls[i])) > 0) {
        tryit <- try(GET(pg$urls[i], write_disk(tf <-
                                                  tempfile(fileext = ".xlsx"), overwrite = TRUE)))
        if (inherits(tryit, "try-error")) {
          i <- i + 1
        } else{
          GET(pg$urls[i], write_disk(tf <-
                                       tempfile(fileext = ".xlsx"), overwrite = TRUE))
          shiet <- excel_sheets(tf)[which(
            excel_sheets(tf) != "Summary" &
              excel_sheets(tf) != "Notes" &
              excel_sheets(tf) != "Series breaks"
          )]
        }
        for (j in 1:length(shiet)) {
          tryit <- try(tmp1 <-
                         as.data.frame(read_xlsx(tf, sheet = shiet[j], col_names = FALSE)))
          if (inherits(tryit, "try-error")) {
            j <- j + 1
          } else{
            tmp1 <-
              as.data.frame(read_xlsx(tf, sheet = shiet[j], col_names = FALSE))
          }
          if (ncol(tmp1) > 2) {
            tryit <- try(X <- as.data.frame(read_xlsx(
              gsub('[\\]', '\\/', tf),
              skip = 11,
              col_names = FALSE,
              sheet = shiet[j]
            )))
            if (inherits(tryit, "try-error")) {
              j <- j + 1
            } else{
              X <- as.data.frame(read_xlsx(
                gsub('[\\]', '\\/', tf),
                skip = 11,
                col_names = FALSE,
                sheet = shiet[j]
              ))
            }
            colnames(X) <- c("Date",
                             c(trimws(
                               gsub(
                                 "for series breaks.*|[[:punct:]]| |Seenot",
                                 "",
                                 tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
                                                             language = "porter"))
                               ),
                               which = c("both", "left", "right")
                             )))
            df_list[[h]] <- X
            df_names_list <-
              c(df_names_list, gsub(" ", "_", trimws(
                paste0(pg$Df_names[i], "_", shiet[j]),
                which = c("both", "left", "right")
              )))
            h <- h + 1
          } else{
            rm(tmp1)
            j <- j + 1
          }
        }
      } else if (length(grep(".csv", pg$urls[i])) > 0) {
        tryit <- try(X <- as.data.frame(read.csv(pg$urls[i],
                                                 skip = 11,
                                                 header = FALSE)))
        if (inherits(tryit, "try-error")) {
          i <- i + 1
        } else {
          X <- as.data.frame(read.csv(pg$urls[i],
                                      skip = 11,
                                      header = FALSE))
        }
        tmp1 <- as.data.frame(read.csv(pg$urls[i]))
        if (ncol(tmp1) >= 2) {
          colnames(X) <- c("Date",
                           c(trimws(
                             gsub(
                               "for series breaks.*|[[:punct:]]| |Seenot",
                               "",
                               tools::toTitleCase(wordStem(tmp1[2, 2:ncol(tmp1)],
                                                           language = "porter"))
                             ),
                             which = c("both", "left", "right")
                           )))
          df_list[[h]] <- X
          df_names_list <-
            c(df_names_list, gsub(" ", "_", trimws(
              pg$Df_names[i],
              which = c("both", "left", "right")
            )))
          h <- h + 1
    
        } else{
          rm(tmp1)
          j <- j + 1
        }
      }
    }
    
    0 回复  |  直到 6 年前