most likely bug->
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
the the fit call to GridSearchCV->
gs_mnb.fit(X_train, y_train)
here's the pipleline used in my code->
pipe_mnb = Pipeline([
('vect', TfidfVectorizer(lowercase=False)),
('mnb', MultinomialNB())
])
The GridSearchCV (gs_mnb) ->
params_mnb = {
'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3)],
'vect__max_df': [50, 70, 90],
# 'vect__min_df': [0.001, 0.002, 0.003, 0.004, 1],
'vect__max_features': [3000, 5000, 7000, 9000, 11000, None],
'vect__binary': [True, False],
'vect__sublinear_tf': [True, False],
}
gs_mnb = GridSearchCV(pipe_mnb, params_mnb, verbose=10, cv=5, n_jobs=-1, scoring='accuracy', error_score='raise')
Shapes of X, y, X_train, y_train-> (120000, 1) (120000, 1) (84000, 1) (84000, 1)
tried using transpose function and reshaping function but the error persists.
Error->
--------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 463, in _process_worker
r = call_item()
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/externals/loky/process_executor.py", line 291, in __call__
return self.fn(*self.args, **self.kwargs)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py", line 588, in __call__
return [func(*args, **kwargs)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py", line 588, in <listcomp>
return [func(*args, **kwargs)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/parallel.py", line 123, in __call__
return self.function(*args, **kwargs)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/pipeline.py", line 405, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 749, in fit
X, y = self._check_X_y(X, y)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/naive_bayes.py", line 583, in _check_X_y
return self._validate_data(X, y, accept_sparse="csr", reset=reset)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/base.py", line 565, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1124, in check_X_y
check_consistent_length(X, y)
File "/home/mist/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/validation.py", line 397, in check_consistent_length
raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [1, 67200]
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
Cell In[115], line 24
22 print(X.shape, y.shape, X_train.shape, y_train.shape)
23 gs_mnb = GridSearchCV(pipe_mnb, params_mnb, verbose=10, cv=5, n_jobs=-1, scoring='accuracy', error_score='raise')
---> 24 gs_mnb.fit(X_train, y_train)
27 print('Best accuracy: ', gs_mnb.best_score_, end='\n')
28 print('Params: ', gs_mnb.best_params_)
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/model_selection/_search.py:874, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
868 results = self._format_results(
869 all_candidate_params, n_splits, all_out, all_more_results
870 )
872 return results
--> 874 self._run_search(evaluate_candidates)
876 # multimetric is determined here because in the case of a callable
877 # self.scoring the return type is only known after calling
878 first_test_score = all_out[0]["test_scores"]
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/model_selection/_search.py:1388, in GridSearchCV._run_search(self, evaluate_candidates)
1386 def _run_search(self, evaluate_candidates):
1387 """Search all candidates in param_grid"""
-> 1388 evaluate_candidates(ParameterGrid(self.param_grid))
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/model_selection/_search.py:821, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
813 if self.verbose > 0:
814 print(
815 "Fitting {0} folds for each of {1} candidates,"
816 " totalling {2} fits".format(
817 n_splits, n_candidates, n_candidates * n_splits
818 )
819 )
--> 821 out = parallel(
822 delayed(_fit_and_score)(
823 clone(base_estimator),
824 X,
825 y,
826 train=train,
827 test=test,
828 parameters=parameters,
829 split_progress=(split_idx, n_splits),
830 candidate_progress=(cand_idx, n_candidates),
831 **fit_and_score_kwargs,
832 )
833 for (cand_idx, parameters), (split_idx, (train, test)) in product(
834 enumerate(candidate_params), enumerate(cv.split(X, y, groups))
835 )
836 )
838 if len(out) < 1:
839 raise ValueError(
840 "No fits were performed. "
841 "Was the CV iterator empty? "
842 "Were there no candidates?"
843 )
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/sklearn/utils/parallel.py:63, in Parallel.__call__(self, iterable)
58 config = get_config()
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py:1944, in Parallel.__call__(self, iterable)
1938 # The first item from the output is blank, but it makes the interpreter
1939 # progress until it enters the Try/Except block of the generator and
1940 # reach the first `yield` statement. This starts the aynchronous
1941 # dispatch of the tasks to the workers.
1942 next(output)
-> 1944 return output if self.return_generator else list(output)
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py:1587, in Parallel._get_outputs(self, iterator, pre_dispatch)
1584 yield
1586 with self._backend.retrieval_context():
-> 1587 yield from self._retrieve()
1589 except GeneratorExit:
1590 # The generator has been garbage collected before being fully
1591 # consumed. This aborts the remaining tasks if possible and warn
1592 # the user if necessary.
1593 self._exception = True
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py:1691, in Parallel._retrieve(self)
1684 while self._wait_retrieval():
1685
1686 # If the callback thread of a worker has signaled that its task
1687 # triggered an exception, or if the retrieval loop has raised an
1688 # exception (e.g. `GeneratorExit`), exit the loop and surface the
1689 # worker traceback.
1690 if self._aborting:
-> 1691 self._raise_error_fast()
1692 break
1694 # If the next job is not ready for retrieval yet, we just wait for
1695 # async callbacks to progress.
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py:1726, in Parallel._raise_error_fast(self)
1722 # If this error job exists, immediatly raise the error by
1723 # calling get_result. This job might not exists if abort has been
1724 # called directly or if the generator is gc'ed.
1725 if error_job is not None:
-> 1726 error_job.get_result(self.timeout)
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py:735, in BatchCompletionCallBack.get_result(self, timeout)
729 backend = self.parallel._backend
731 if backend.supports_retrieve_callback:
732 # We assume that the result has already been retrieved by the
733 # callback thread, and is stored internally. It's just waiting to
734 # be returned.
--> 735 return self._return_or_raise()
737 # For other backends, the main thread needs to run the retrieval step.
738 try:
File ~/anaconda3/envs/ml/lib/python3.8/site-packages/joblib/parallel.py:753, in BatchCompletionCallBack._return_or_raise(self)
751 try:
752 if self.status == TASK_ERROR:
--> 753 raise self._result
754 return self._result
755 finally:
ValueError: Found input variables with inconsistent numbers of samples: [1, 67200]
How can I fix it?