
Parallel computation of create_matrix from the RTextTools package

  Prasanna Nandakumar  ·  asked 6 years ago

    I am creating a DocumentTermMatrix using create_matrix() from RTextTools, and then creating a container and a model from it.

    I do this for every category (factor level), so for each category it has to build the matrix, the container, and the model. When I run the code below on a machine with, say, 16 cores / 64 GB of RAM, it runs on only one core and uses less than 10% of the memory.

    Is there a way to speed this process up, perhaps with doParallel & foreach?

    #import the required libraries
    library("RTextTools")
    library("hash")
    library(tm)
    
    for ( n in 1:length(folderaddress)){
        #Initialize the variables
        traindata = list()
        matrix = list()
        container = list()
        models = list()
        trainingdata = list()
        results = list()
        classifiermodeldiv = 0.80
    
        #Create the directory to place the models and the output files
        pradd = paste(combinedmodelsaveaddress[n],"SelftestClassifierModels",sep="")
        if (!file.exists(pradd)){
            dir.create(file.path(pradd))
        }  
        Data$CATEGORY <- as.factor(Data$CATEGORY)
    
        #Read the training files
        X <- split(Data, Data$CATEGORY)
        data <- lapply(seq_along(X), function(x) as.data.frame(X[[x]])[,5])
        names(data) <- levels(Data$CATEGORY)
        list2env(data, envir = .GlobalEnv)
        files=as.matrix(names(data))
        fileno=length(files)
        fileno=as.integer(fileno)
        print(fileno)
    
        #For all the files in the training folder(the number of files in the training folder = Number of categories in Taxonomy)
        for(i in 1:fileno){
            filename = as.character(files[i,1])
            data1 = as.data.frame(data[i])
            data1 = as.matrix(data1)
            filenamechanged = gsub ("\\.[[:alnum:]]+","",filename)
            type = matrix(data = as.character(filenamechanged),nrow = length(data1[,1]),ncol=1 )
            data1 = cbind(data1,type)
            traindata[[i]] = data1
            print(i)
        }
    
        for(i in 1:fileno){
            #Obtain the unique classified data from the train files for one category
            trainingdata1 = as.data.frame(traindata[[i]][,1])
            uniquetraintweet = hash()
            typetrain1 = matrix(data=as.character(traindata[[i]][1,2]), ncol =1, nrow = length(trainingdata1[,1]))
    
            #If the training data has fewer than 200 records for a category, do not create a model
            #A model built on such a small set of data will not be accurate
            if (length(trainingdata1[,1])<200){
                matrix[[i]] = NULL
                next
            }
    
            #Obtain the unique classified data from the train files of all the other category except that is considered as training category
            trainingdata2=matrix(data="",nrow=0,ncol=1)
    
            for (j in 1:fileno){
                if ( j==i) next
                trainingdata2dummy = as.data.frame(traindata[[j]][,1])
                length(trainingdata1[,1])
                colnames(trainingdata2)="feedbacks"
                colnames(trainingdata2dummy)="feedbacks"
                trainingdata2 = rbind(trainingdata2,trainingdata2dummy)
    
            }
    
            #Consider one category as training set and make the remaining categories as Others
            typetrain2 = matrix(data="ZZOther",nrow=length(trainingdata2[,1]),ncol=1)
            colnames(trainingdata1)="feedbacks"
            trainingdata[[i]]=rbind(trainingdata1,trainingdata2)
            colnames(typetrain1)="type"
            colnames(typetrain2)="type"
            type=rbind(typetrain1,typetrain2)
            trainingdata[[i]] = cbind(trainingdata[[i]],type)
            trainingdata[[i]]=trainingdata[[i]][sample(nrow(trainingdata[[i]])),]
    
            #Input the training set and other set to the classifier
            mindoc = max(1,floor(min(0.001*length(trainingdata[[i]][,1]),3)))
    
            #Create Matrix        
            matrix[[i]] <- create_matrix(trainingdata[[i]][,1], language="english",
                                         removeNumbers=FALSE, stemWords=FALSE, weighting=weightTf,
                                         minWordLength=3, minDocFreq=mindoc,
                                         maxDocFreq=floor(0.5*(length(trainingdata[[i]][,1]))))
            #rowTotals <- apply(matrix[[i]] , 1, sum) #Find the sum of words in each Document
            #matrix[[i]]   <- matrix[[i]][rowTotals> 0,] 
            print(i)
    
            #Create Container             
            container[[i]] <- create_container(matrix[[i]],trainingdata[[i]][,2],trainSize=1:length(trainingdata[[i]][,1]),virgin=FALSE)
            print(i)
    
            #Create Models  
            models[[i]] <- train_models(container[[i]], algorithms=c("SVM"))
            print(i)
        }
    
        save(matrix, file = paste(pradd,"/Matrix",sep=""))
        save(models, file = paste(pradd,"/Models",sep=""))   
    }
    
    1 Answer
  niko  ·  answered 6 years ago

    Below is an example of parallel processing with RTextTools; it follows the introduction available here.

    The function myFun follows the introduction from the link above; at the end it writes a CSV file containing the analytics/summary (no directory is specified). After that, the base R package parallel is applied directly to myFun.

    library(parallel)
    library(RTextTools)
    # I. A dummy function
    # Uses RTextTools
    myFun <- function (trainMethod) {
      library(RTextTools)
      data(USCongress)
      # Create the document-term matrix
      doc_matrix <- create_matrix(USCongress$text, language="english", removeNumbers=TRUE,
                                  stemWords=TRUE, removeSparseTerms=.998)
      container <- create_container(doc_matrix, USCongress$major, trainSize=1:4000,
                                    testSize=4001:4449, virgin=FALSE)
      # Train
      model <- train_model(container,trainMethod)
      classify <- classify_model(container, model)
      # Analytics
      analytics <- create_analytics(container,
                                    cbind(classify))
      summary(analytics)
      # Saving
      nameToSave <- paste(trainMethod, 'DocumentSummary.csv', sep = '_')
      write.csv(analytics@document_summary, nameToSave)
    }
    
    # II. Parallel Processing
    # 
    # 1. Vector for parallelization & number of cores available
    trainMethods <- c('SVM','GLMNET','MAXENT','SLDA','BOOSTING')
    num_cores <- detectCores() - 1L
    # 2. Start a cluster
    cl <- makeCluster(num_cores)
    # 3. Export Variables needed to the cluster
    # specifying exactly which variables should be exported
    clusterExport(cl, varlist = c('myFun', 'trainMethods'))
    # 4. do in parallel
    parLapply(cl, seq_along(trainMethods), function (n) myFun(trainMethod = trainMethods[n]))
    # stop the cluster
    stopCluster(cl)
    

    In your case you have to turn your code into a function, myFun(n, ...), with n taking values from seq_along(folderaddress), and of course substitute seq_along(folderaddress) for seq_along(trainMethods) in the call to parLapply.
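    For illustration, here is a minimal sketch of that refactoring; processFolder is a hypothetical wrapper name, and folderaddress, Data and combinedmodelsaveaddress are assumed to already exist in your workspace exactly as in the question, which is why they have to be exported to the workers.

    library(parallel)
    
    # Hypothetical wrapper: the body of the outer `for ( n in 1:length(folderaddress))`
    # loop from the question goes inside this function
    processFolder <- function (n) {
      # load the required packages on each worker
      library(RTextTools)
      library(hash)
      library(tm)
      # --- paste the body of the original outer loop here ---
    }
    
    cl <- makeCluster(detectCores() - 1L)
    # the loop body reads these objects, so ship them to every worker
    clusterExport(cl, varlist = c('processFolder', 'folderaddress', 'Data',
                                  'combinedmodelsaveaddress'))
    parLapply(cl, seq_along(folderaddress), processFolder)
    stopCluster(cl)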

    Of course there are many ways to enhance the code beyond parallelization. The problem is that without sample data any suggested improvement is no more than a guess.
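
    Since the question mentions doParallel & foreach, here is an equivalent sketch using those packages instead of parLapply; it reuses myFun and trainMethods from the example above, and foreach's .packages argument loads RTextTools on every worker.

    library(doParallel)  # also attaches foreach and parallel
    
    trainMethods <- c('SVM','GLMNET','MAXENT','SLDA','BOOSTING')
    
    cl <- makeCluster(detectCores() - 1L)
    registerDoParallel(cl)
    
    # each training method is handled by its own worker; myFun writes its
    # own CSV file, so the value returned by the loop is not used further
    foreach(method = trainMethods, .packages = 'RTextTools') %dopar% {
      myFun(trainMethod = method)
    }
    
    stopCluster(cl)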