result_matrix_test 的形状为 (2,)，在（由 hstack 隐式）转换为稀疏矩阵后会变为 (1,2)。你需要用 scipy.sparse.csr_matrix.reshape(spar_mat, (-1,1)) 将其重塑为 (n, 1) 的列。
from sklearn.model_selection import train_test_split
import pandas as pd
from pandas import DataFrame
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
import scipy
def _as_sparse_column(values):
    """Return a 1-D array as an (n, 1) CSR matrix that ``hstack`` can consume.

    ``scipy.sparse.csr_matrix`` interprets a 1-D array of length n as a
    single *row* of shape (1, n).  Reshaping to ``(-1, 1)`` turns it into a
    column so it lines up row-for-row with the vectorizer output.

    Args:
        values: 1-D array-like of numeric values (e.g. ``Series.values``).

    Returns:
        A sparse matrix of shape (n, 1).
    """
    return scipy.sparse.csr_matrix(values).reshape((-1, 1))


# Toy data: a free-text "Tags" column plus a numeric "Age" column.
big_X = pd.DataFrame({'Tags':['tag_a tag_b tag_c', 'tag_b tag_c', 'tag_b tag_c tag_d', 'tag_e tag_b tag_b tag_a'], 'Age':[20, 21, 19, 18]})
big_Y = pd.DataFrame({'Label':[0, 1, 0, 1]})

# NOTE: no random_state is passed, so which rows land in train/test (and
# therefore the size of the fitted vocabulary) can differ between runs.
X_train, X_test, y_train, y_test = train_test_split(big_X, big_Y, test_size=0.5)

# Selecting a single column yields a 1-D Series: shape (2,) for this split.
result_matrix_train = X_train['Age']
result_matrix_test = X_test['Age']

feature_colunm_name = "Tags"
cv = CountVectorizer(stop_words=None)
# Sparse bag-of-words matrix, shape (2, n_vocab).
X_train_cv = cv.fit_transform(X_train[feature_colunm_name])

# Passing the 1-D Series straight to hstack would convert it to a (1, 2)
# row and fail on the row-count mismatch, so build an explicit (2, 1)
# column first.
spar_mat_shape = _as_sparse_column(result_matrix_train.values)
# hstack now succeeds: shape (2, 1 + n_vocab).
result_matrix_train = hstack((spar_mat_shape, X_train_cv))

# The "test" portion needs the same treatment.  transform() reuses the
# vocabulary learned from the training rows, so the column count matches
# the training matrix.
X_test_cv = cv.transform(X_test[feature_colunm_name])
# result_matrix_test = hstack((result_matrix_test, X_test_cv)) ... this would fail
spar_mat_test_shape = _as_sparse_column(result_matrix_test.values)
# Same shape as the stacked training matrix: (2, 1 + n_vocab).
result_matrix_test = hstack((spar_mat_test_shape, X_test_cv))