代码之家  ›  专栏  ›  技术社区  ›  Lin Ma

连接两个数据帧时出现奇怪的python数据帧维度问题

  •  0
  • Lin Ma  · 技术社区  · 5 年前

    这是用于复现问题的代码和错误消息。我还打印了要连接(使用 hstack)的两个数据帧的原始内容和形状,看起来都没有问题,想知道错误出在哪里?

    from sklearn.model_selection import train_test_split
    import pandas as pd
    from pandas import DataFrame
    from scipy.sparse import hstack
    from sklearn.feature_extraction.text import CountVectorizer
    
    big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})  # toy features: free-text 'Tags' plus numeric 'Age'
    big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})  # one binary label per row
    # Random 50/50 split of the four rows (no random_state, so the split differs between runs).
    X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
    result_matrix_train = X_train['Age']  # a single column is a 1-D Series of shape (2,), not a (2, 1) column
    result_matrix_test = X_test['Age']  # same 1-D shape as the train Series
    # Text columns to turn into token-count features and append to the numeric ones.
    sparse_columns = ['Tags']  
    for feature_colunm_name in sparse_columns:
      print('processing feature name: ', feature_colunm_name)
      cv = CountVectorizer(stop_words=None)
      X_train_cv = cv.fit_transform(X_train[feature_colunm_name])  # vocabulary is learned from the train rows only
      print ('X_train_cv: ', X_train_cv)
      print ('result_matrix_train: ', result_matrix_train)
    
      # Append the token-count columns to the features accumulated so far
      if result_matrix_train is not None:
        print (result_matrix_train)
        print (X_train_cv)
        result_matrix_train = hstack((result_matrix_train, X_train_cv))  # fails: the 1-D Series is taken as one row (1, 2), but X_train_cv has 2 rows
      else:  # not reached in this script: result_matrix_train was initialised from 'Age' above
        result_matrix_train = X_train_cv
    
      # Encode the test rows with the vocabulary fitted on the train rows
      X_test_cv = cv.transform(X_test[feature_colunm_name])
      if result_matrix_test is not None:
        result_matrix_test = hstack((result_matrix_test, X_test_cv))  # would hit the same row-dimension mismatch
      else:
        result_matrix_test = X_test_cv
    

    错误消息:

         24     print (result_matrix_train)
         25     print (X_train_cv)
    ---> 26     result_matrix_train = hstack((result_matrix_train, X_train_cv))
         27   else:
         28     result_matrix_train = X_train_cv
    
        584                                                     exp=brow_lengths[i],
        585                                                     got=A.shape[0]))
    --> 586                     raise ValueError(msg)
        587 
        588                 if bcol_lengths[j] == 0:
    
    ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 2, expected 1.
    
    1 回复  |  直到 5 年前
        1
  •  1
  •   AidanGawronski    5 年前

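    `result_matrix_train`(以及 `result_matrix_test`)的形状是 (2,),在被隐式转换为稀疏矩阵时变成了 (1, 2),因此行数与另一个矩阵不匹配。你需要用 `scipy.sparse.csr_matrix.reshape(spar_mat, (-1, 1))` 把它重塑为 (2, 1) 的列向量,然后再调用 hstack。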

    from sklearn.model_selection import train_test_split
    import pandas as pd
    from pandas import DataFrame
    from scipy.sparse import hstack
    from sklearn.feature_extraction.text import CountVectorizer
    import scipy
    
    # Same toy data as in the question: free-text 'Tags' plus numeric 'Age', four rows.
    big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
    big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})  
    
    # Random 50/50 split; no random_state is set, so the rows (and the shapes
    # noted in the comments below) can differ between runs.
    X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)
    result_matrix_train = X_train['Age']
    result_matrix_test = X_test['Age']
    
    feature_colunm_name = "Tags"
    cv = CountVectorizer(stop_words=None)
    # learn the vocabulary from the train rows and count tokens per row
    X_train_cv = cv.fit_transform(X_train[feature_colunm_name])
    
    # a single DataFrame column is a 1-D Series
    result_matrix_train.shape # (2,)
    
    # explicitly convert to csr matrix (your code did this implicitly when calling hstack)
    spar_mat = scipy.sparse.csr_matrix(result_matrix_train.values)
    
    # this now has the wrong shape: the 1-D array became a single row
    spar_mat.shape # (1,2)
    
    # reshape this to be (n x 1); -1 lets the row count be inferred
    spar_mat_shape = scipy.sparse.csr_matrix.reshape(spar_mat, (-1,1))
    
    # this now has the right shape for hstack: both blocks have 2 rows
    spar_mat_shape.shape # (2, 1)
    X_train_cv.shape # (2, 3) for this particular split; the width is the number of distinct tags in the train rows
    
    # hstack succeeds
    result_matrix_train = hstack((spar_mat_shape, X_train_cv))
    result_matrix_train.shape # (2, 4)
    
    # you need to do the same for the "test" portion of your code
    result_matrix_test.shape # (2,), the same 1-D Series shape as on the train side
    # transform (not fit_transform): reuse the vocabulary fitted on the train rows
    X_test_cv = cv.transform(X_test[feature_colunm_name])
    
    # result_matrix_test = hstack((result_matrix_test, X_test_cv)) ... this would fail
    # this will succeed:
    spar_mat_test = scipy.sparse.csr_matrix(result_matrix_test.values)
    spar_mat_test_shape = scipy.sparse.csr_matrix.reshape(spar_mat_test, (-1,1))
    result_matrix_test = hstack((spar_mat_test_shape, X_test_cv))
    result_matrix_test.shape # (2, 4): same width as the train matrix, because transform keeps the train vocabulary