代码之家  ›  专栏  ›  技术社区  ›  kukelia

在自定义转换器内创建新的 DataFrame 时,scikit-learn 管道无法工作

  •  0
  • kukelia  · 技术社区  · 2 年前

    我有一个管道(Pipeline),其中嵌套了子管道和 ColumnTransformer,还包含一些自定义转换器。运行网格搜索时出现下面的错误,我该如何解决这个问题:

    Input In [8], in <cell line: 21>()
         19 # Fit all (1) models defined in our model-search object
         20 print(X_train.shape)
    ---> 21 best = cv_model_search.fit(X_train,y_train)
    
    File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
        885     results = self._format_results(
        886         all_candidate_params, n_splits, all_out, all_more_results
        887     )
        889     return results
    --> 891 self._run_search(evaluate_candidates)
        893 # multimetric is determined here because in the case of a callable
        894 # self.scoring the return type is only known after calling
        895 first_test_score = all_out[0]["test_scores"]
    
    File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:1392, in GridSearchCV._run_search(self, evaluate_candidates)
       1390 def _run_search(self, evaluate_candidates):
       1391     """Search all candidates in param_grid"""
    -> 1392     evaluate_candidates(ParameterGrid(self.param_grid))
    
    File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
        830 if self.verbose > 0:
        831     print(
        832         "Fitting {0} folds for each of {1} candidates,"
        833         " totalling {2} fits".format(
        834             n_splits, n_candidates, n_candidates * n_splits
        835         )
        836     )
    --> 838 out = parallel(
        839     delayed(_fit_and_score)(
        840         clone(base_estimator),
        841         X,
        842         y,
        843         train=train,
        844         test=test,
        845         parameters=parameters,
        846         split_progress=(split_idx, n_splits),
        847         candidate_progress=(cand_idx, n_candidates),
        848         **fit_and_score_kwargs,
        849     )
        850     for (cand_idx, parameters), (split_idx, (train, test)) in product(
        851         enumerate(candidate_params), enumerate(cv.split(X, y, groups))
        852     )
        853 )
        855 if len(out) < 1:
        856     raise ValueError(
        857         "No fits were performed. "
        858         "Was the CV iterator empty? "
        859         "Were there no candidates?"
        860     )
    
    File ~\anaconda3\lib\site-packages\joblib\parallel.py:1043, in Parallel.__call__(self, iterable)
       1034 try:
       1035     # Only set self._iterating to True if at least a batch
       1036     # was dispatched. In particular this covers the edge
       (...)
       1040     # was very quick and its callback already dispatched all the
       1041     # remaining jobs.
       1042     self._iterating = False
    -> 1043     if self.dispatch_one_batch(iterator):
       1044         self._iterating = self._original_iterator is not None
       1046     while self.dispatch_one_batch(iterator):
    
    File ~\anaconda3\lib\site-packages\joblib\parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
        859     return False
        860 else:
    --> 861     self._dispatch(tasks)
        862     return True
    
    File ~\anaconda3\lib\site-packages\joblib\parallel.py:779, in Parallel._dispatch(self, batch)
        777 with self._lock:
        778     job_idx = len(self._jobs)
    --> 779     job = self._backend.apply_async(batch, callback=cb)
        780     # A job can complete so quickly than its callback is
        781     # called before we get here, causing self._jobs to
        782     # grow. To ensure correct results ordering, .insert is
        783     # used (rather than .append) in the following line
        784     self._jobs.insert(job_idx, job)
    
    File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
        206 def apply_async(self, func, callback=None):
        207     """Schedule a func to be run"""
    --> 208     result = ImmediateResult(func)
        209     if callback:
        210         callback(result)
    
    File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
        569 def __init__(self, batch):
        570     # Don't delay the application, to avoid keeping the input
        571     # arguments in memory
    --> 572     self.results = batch()
    
    File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in BatchedCalls.__call__(self)
        258 def __call__(self):
        259     # Set the default nested backend to self._backend but do not set the
        260     # change the default number of processes to -1
        261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
    --> 262         return [func(*args, **kwargs)
        263                 for func, args, kwargs in self.items]
    
    File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in <listcomp>(.0)
        258 def __call__(self):
        259     # Set the default nested backend to self._backend but do not set the
        260     # change the default number of processes to -1
        261     with parallel_backend(self._backend, n_jobs=self._n_jobs):
    --> 262         return [func(*args, **kwargs)
        263                 for func, args, kwargs in self.items]
    
    File ~\anaconda3\lib\site-packages\sklearn\utils\fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
        214 def __call__(self, *args, **kwargs):
        215     with config_context(**self.config):
    --> 216         return self.function(*args, **kwargs)
    
    File ~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:680, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
        678         estimator.fit(X_train, **fit_params)
        679     else:
    --> 680         estimator.fit(X_train, y_train, **fit_params)
        682 except Exception:
        683     # Note fit time as time until error
        684     fit_time = time.time() - start_time
    
    File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
        364 """Fit the model.
        365 
        366 Fit all the transformers one after the other and transform the
       (...)
        387     Pipeline with fitted steps.
        388 """
        389 fit_params_steps = self._check_fit_params(**fit_params)
    --> 390 Xt = self._fit(X, y, **fit_params_steps)
        391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
        392     if self._final_estimator != "passthrough":
    
    File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
        346     cloned_transformer = clone(transformer)
        347 # Fit or load from cache the current transformer
    --> 348 X, fitted_transformer = fit_transform_one_cached(
        349     cloned_transformer,
        350     X,
        351     y,
        352     None,
        353     message_clsname="Pipeline",
        354     message=self._log_message(step_idx),
        355     **fit_params_steps[name],
        356 )
        357 # Replace the transformer of the step with the fitted
        358 # transformer. This is necessary when loading the transformer
        359 # from the cache.
        360 self.steps[step_idx] = (name, fitted_transformer)
    
    File ~\anaconda3\lib\site-packages\joblib\memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
        348 def __call__(self, *args, **kwargs):
    --> 349     return self.func(*args, **kwargs)
    
    File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
        891 with _print_elapsed_time(message_clsname, message):
        892     if hasattr(transformer, "fit_transform"):
    --> 893         res = transformer.fit_transform(X, y, **fit_params)
        894     else:
        895         res = transformer.fit(X, y, **fit_params).transform(X)
    
    File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
        432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
        433 if hasattr(last_step, "fit_transform"):
    --> 434     return last_step.fit_transform(Xt, y, **fit_params_last_step)
        435 else:
        436     return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
    
    File ~\anaconda3\lib\site-packages\sklearn\base.py:855, in TransformerMixin.fit_transform(self, X, y, **fit_params)
        852     return self.fit(X, **fit_params).transform(X)
        853 else:
        854     # fit method of arity 2 (supervised transformation)
    --> 855     return self.fit(X, y, **fit_params).transform(X)
    
    Input In [5], in MakeDataFrame.transform(self, X)
        170 def transform(self, X):
    --> 171     return pd.DataFrame(data=X, index=np.arange(len(X)), columns=self.columns)
    
    File ~\anaconda3\lib\site-packages\pandas\core\frame.py:694, in DataFrame.__init__(self, data, index, columns, dtype, copy)
        684         mgr = dict_to_mgr(
        685             # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
        686             # attribute "name"
       (...)
        691             typ=manager,
        692         )
        693     else:
    --> 694         mgr = ndarray_to_mgr(
        695             data,
        696             index,
        697             columns,
        698             dtype=dtype,
        699             copy=copy,
        700             typ=manager,
        701         )
        703 # For data is list-like, or Iterable (will consume into list)
        704 elif is_list_like(data):
    
    File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:351, in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
        346 # _prep_ndarray ensures that values.ndim == 2 at this point
        347 index, columns = _get_axes(
        348     values.shape[0], values.shape[1], index=index, columns=columns
        349 )
    --> 351 _check_values_indices_shape_match(values, index, columns)
        353 if typ == "array":
        355     if issubclass(values.dtype.type, str):
    
    File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:422, in _check_values_indices_shape_match(values, index, columns)
        420 passed = values.shape
        421 implied = (len(index), len(columns))
    --> 422 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
    
    ValueError: Shape of passed values is (730, 167), indices imply (730, 163)
    
    

    Stack Overflow 提示我的帖子大部分是代码,所以我添加了以下填充文字(占位文本,无实际含义): 《美国习惯性文本》中的《同一篇文章》中的《蒂波格拉夫的民主政策》中的《世界各地的博拉多雷斯》《世界各地的准文本》中的《最后文本》中的《插入者的视觉赌注》

    1 回复  |  直到 2 年前
        1
  •  1
  •   richardec    2 年前

    在你的自定义转换器(Transformer)中设置 columns = X.columns。(报错的原因是传入数据有 167 列,而 self.columns 中只有 163 个列名,两者数量不一致。)