I have a pipeline made up of nested pipelines and ColumnTransformers, together with some custom transformers. How can I fix this error:
Input In [8], in <cell line: 21>()
19 # Fit all (1) models defined in our model-search object
20 print(X_train.shape)
---> 21 best = cv_model_search.fit(X_train,y_train)
File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
885 results = self._format_results(
886 all_candidate_params, n_splits, all_out, all_more_results
887 )
889 return results
--> 891 self._run_search(evaluate_candidates)
893 # multimetric is determined here because in the case of a callable
894 # self.scoring the return type is only known after calling
895 first_test_score = all_out[0]["test_scores"]
File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:1392, in GridSearchCV._run_search(self, evaluate_candidates)
1390 def _run_search(self, evaluate_candidates):
1391 """Search all candidates in param_grid"""
-> 1392 evaluate_candidates(ParameterGrid(self.param_grid))
File ~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
830 if self.verbose > 0:
831 print(
832 "Fitting {0} folds for each of {1} candidates,"
833 " totalling {2} fits".format(
834 n_splits, n_candidates, n_candidates * n_splits
835 )
836 )
--> 838 out = parallel(
839 delayed(_fit_and_score)(
840 clone(base_estimator),
841 X,
842 y,
843 train=train,
844 test=test,
845 parameters=parameters,
846 split_progress=(split_idx, n_splits),
847 candidate_progress=(cand_idx, n_candidates),
848 **fit_and_score_kwargs,
849 )
850 for (cand_idx, parameters), (split_idx, (train, test)) in product(
851 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
852 )
853 )
855 if len(out) < 1:
856 raise ValueError(
857 "No fits were performed. "
858 "Was the CV iterator empty? "
859 "Were there no candidates?"
860 )
File ~\anaconda3\lib\site-packages\joblib\parallel.py:1043, in Parallel.__call__(self, iterable)
1034 try:
1035 # Only set self._iterating to True if at least a batch
1036 # was dispatched. In particular this covers the edge
(...)
1040 # was very quick and its callback already dispatched all the
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1046 while self.dispatch_one_batch(iterator):
File ~\anaconda3\lib\site-packages\joblib\parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
File ~\anaconda3\lib\site-packages\joblib\parallel.py:779, in Parallel._dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
782 # grow. To ensure correct results ordering, .insert is
783 # used (rather than .append) in the following line
784 self._jobs.insert(job_idx, job)
File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~\anaconda3\lib\site-packages\joblib\_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
569 def __init__(self, batch):
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in BatchedCalls.__call__(self)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~\anaconda3\lib\site-packages\joblib\parallel.py:262, in <listcomp>(.0)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~\anaconda3\lib\site-packages\sklearn\utils\fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
File ~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py:680, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
678 estimator.fit(X_train, **fit_params)
679 else:
--> 680 estimator.fit(X_train, y_train, **fit_params)
682 except Exception:
683 # Note fit time as time until error
684 fit_time = time.time() - start_time
File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
364 """Fit the model.
365
366 Fit all the transformers one after the other and transform the
(...)
387 Pipeline with fitted steps.
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
351 y,
352 None,
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
358 # transformer. This is necessary when loading the transformer
359 # from the cache.
360 self.steps[step_idx] = (name, fitted_transformer)
File ~\anaconda3\lib\site-packages\joblib\memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File ~\anaconda3\lib\site-packages\sklearn\pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
File ~\anaconda3\lib\site-packages\sklearn\base.py:855, in TransformerMixin.fit_transform(self, X, y, **fit_params)
852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
--> 855 return self.fit(X, y, **fit_params).transform(X)
Input In [5], in MakeDataFrame.transform(self, X)
170 def transform(self, X):
--> 171 return pd.DataFrame(data=X, index=np.arange(len(X)), columns=self.columns)
File ~\anaconda3\lib\site-packages\pandas\core\frame.py:694, in DataFrame.__init__(self, data, index, columns, dtype, copy)
684 mgr = dict_to_mgr(
685 # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
686 # attribute "name"
(...)
691 typ=manager,
692 )
693 else:
--> 694 mgr = ndarray_to_mgr(
695 data,
696 index,
697 columns,
698 dtype=dtype,
699 copy=copy,
700 typ=manager,
701 )
703 # For data is list-like, or Iterable (will consume into list)
704 elif is_list_like(data):
File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:351, in ndarray_to_mgr(values, index, columns, dtype, copy, typ)
346 # _prep_ndarray ensures that values.ndim == 2 at this point
347 index, columns = _get_axes(
348 values.shape[0], values.shape[1], index=index, columns=columns
349 )
--> 351 _check_values_indices_shape_match(values, index, columns)
353 if typ == "array":
355 if issubclass(values.dtype.type, str):
File ~\anaconda3\lib\site-packages\pandas\core\internals\construction.py:422, in _check_values_indices_shape_match(values, index, columns)
420 passed = values.shape
421 implied = (len(index), len(columns))
--> 422 raise ValueError(f"Shape of passed values is {passed}, indices imply {implied}")
ValueError: Shape of passed values is (730, 167), indices imply (730, 163)
Stack is saying my post is pure code so I'm adding this: Lorem ipsum es el texto que se usa habitualmente en diseño gráfico en demostraciones de tipografías o de borradores de diseño para probar el diseño visual antes de insertar el texto final
CodePudding user response:
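Set `columns = X.columns` in your custom transformer instead of using a stored `self.columns`.
The error says the data reaching `MakeDataFrame.transform` has 167 columns, while `self.columns` only contains 163 names, so the column labels must be taken from the data actually passed in. Note that `X.columns` only exists if `X` is a DataFrame at that point; if an earlier step returns a NumPy array, the column names have to be obtained from that step instead.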