I am trying to perform nested cross-validation while also incorporating group-based splitting using the GroupShuffleSplit class. However, I'm encountering a "TypeError: cannot pickle 'generator' object" when trying to use a custom cross-validation object with GridSearchCV. As far as I know, this error occurs because GroupShuffleSplit.split(...) returns a generator, which can't be used in the cross_val_score function. Therefore, I want to ask whether there is an easy way to use GroupShuffleSplit for nested cross-validation.
Regarding my simplified sample code:
I have a dataset with features X, labels y, and group labels groups. The goal is to perform nested cross-validation, where both the inner and outer loops split the data based on the group labels. I would like to use GridSearchCV for hyperparameter tuning and cross_val_score for evaluating the performance.
import numpy as np
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
# Minimal reproduction of the "cannot pickle 'generator' object" error.
# Synthetic data: 100 samples with 10 random features, binary labels, and
# each sample assigned to one of 4 groups.
X = np.random.rand(100, 10)
y = np.random.randint(2, size=100)
groups = np.random.randint(4, size=100) # Example group labels
# Estimator to tune and the hyperparameter grid searched by the inner loop.
rf_classifier = RandomForestClassifier()
param_grid = {'n_estimators': [50, 100, 200]}
# Group-aware splitters for the inner (tuning) and outer (evaluation) loops.
inner_cv = GroupShuffleSplit(n_splits=5, test_size=0.2)
outer_cv = GroupShuffleSplit(n_splits=5, test_size=0.2)
# NOTE: .split(...) returns a generator, so GridSearchCV stores a generator as
# its `cv` parameter. These inner splits are also computed over the full X,
# although the grid search is later fitted only on an outer training subset.
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=inner_cv.split(X, y, groups=groups))
# NOTE: cross_val_score clones `grid_search` for every outer fold; cloning
# deep-copies the `cv` generator stored above, which raises
# "TypeError: cannot pickle 'generator' object".
nested_scores = cross_val_score(estimator=grid_search, X=X, y=y, cv=outer_cv.split(X, y, groups=groups))
This results in the following error traceback:
---------------------------------------------------------------------------
Empty Traceback (most recent call last)
File c:\Anaconda3_x64\lib\site-packages\joblib\parallel.py:825, in Parallel.dispatch_one_batch(self, iterator)
824 try:
--> 825 tasks = self._ready_batches.get(block=False)
826 except queue.Empty:
827 # slice the iterator n_jobs * batchsize items at a time. If the
828 # slice returns less than that, then the current batchsize puts
(...)
831 # accordingly to distribute evenly the last items between all
832 # workers.
File c:\Anaconda3_x64\lib\queue.py:168, in Queue.get(self, block, timeout)
167 if not self._qsize():
--> 168 raise Empty
169 elif timeout is None:
Empty:
During handling of the above exception, another exception occurred:
TypeError Traceback (most recent call last)
Cell In[29], line 16
13 outer_cv = GroupShuffleSplit(n_splits=5, test_size=0.2)
15 grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=inner_cv.split(X, y, groups=groups))
---> 16 nested_scores = cross_val_score(estimator=grid_search, X=X, y=y, cv=outer_cv.split(X, y, groups=groups))
18 print(nested_scores)
File c:\Anaconda3_x64\lib\site-packages\sklearn\model_selection\_validation.py:515, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
512 # To ensure multimetric format is not supported
513 scorer = check_scoring(estimator, scoring=scoring)
--> 515 cv_results = cross_validate(
516 estimator=estimator,
517 X=X,
518 y=y,
519 groups=groups,
520 scoring={"score": scorer},
521 cv=cv,
522 n_jobs=n_jobs,
523 verbose=verbose,
524 fit_params=fit_params,
525 pre_dispatch=pre_dispatch,
526 error_score=error_score,
527 )
528 return cv_results["test_score"]
File c:\Anaconda3_x64\lib\site-packages\sklearn\model_selection\_validation.py:266, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 266 results = parallel(
267 delayed(_fit_and_score)(
268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File c:\Anaconda3_x64\lib\site-packages\sklearn\utils\parallel.py:63, in Parallel.__call__(self, iterable)
58 config = get_config()
59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
---> 63 return super().__call__(iterable_with_config)
File c:\Anaconda3_x64\lib\site-packages\joblib\parallel.py:1048, in Parallel.__call__(self, iterable)
1039 try:
1040 # Only set self._iterating to True if at least a batch
1041 # was dispatched. In particular this covers the edge
(...)
1045 # was very quick and its callback already dispatched all the
1046 # remaining jobs.
1047 self._iterating = False
-> 1048 if self.dispatch_one_batch(iterator):
1049 self._iterating = self._original_iterator is not None
1051 while self.dispatch_one_batch(iterator):
File c:\Anaconda3_x64\lib\site-packages\joblib\parallel.py:836, in Parallel.dispatch_one_batch(self, iterator)
833 n_jobs = self._cached_effective_n_jobs
834 big_batch_size = batch_size * n_jobs
--> 836 islice = list(itertools.islice(iterator, big_batch_size))
837 if len(islice) == 0:
838 return False
File c:\Anaconda3_x64\lib\site-packages\sklearn\utils\parallel.py:59, in <genexpr>(.0)
54 # Capture the thread-local scikit-learn configuration at the time
55 # Parallel.__call__ is issued since the tasks can be dispatched
56 # in a different thread depending on the backend and on the value of
57 # pre_dispatch and n_jobs.
58 config = get_config()
---> 59 iterable_with_config = (
60 (_with_config(delayed_func, config), args, kwargs)
61 for delayed_func, args, kwargs in iterable
62 )
63 return super().__call__(iterable_with_config)
File c:\Anaconda3_x64\lib\site-packages\sklearn\model_selection\_validation.py:268, in <genexpr>(.0)
263 # We clone the estimator to make sure that all the folds are
264 # independent, and that it is pickle-able.
265 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
266 results = parallel(
267 delayed(_fit_and_score)(
--> 268 clone(estimator),
269 X,
270 y,
271 scorers,
272 train,
273 test,
274 verbose,
275 None,
276 fit_params,
277 return_train_score=return_train_score,
278 return_times=True,
279 return_estimator=return_estimator,
280 error_score=error_score,
281 )
282 for train, test in cv.split(X, y, groups)
283 )
285 _warn_or_raise_about_fit_failures(results, error_score)
287 # For callabe scoring, the return type is only know after calling. If the
288 # return type is a dictionary, the error scores can now be inserted with
289 # the correct key.
File c:\Anaconda3_x64\lib\site-packages\sklearn\base.py:89, in clone(estimator, safe)
87 new_object_params = estimator.get_params(deep=False)
88 for name, param in new_object_params.items():
---> 89 new_object_params[name] = clone(param, safe=False)
90 new_object = klass(**new_object_params)
91 params_set = new_object.get_params(deep=False)
File c:\Anaconda3_x64\lib\site-packages\sklearn\base.py:70, in clone(estimator, safe)
68 elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
69 if not safe:
---> 70 return copy.deepcopy(estimator)
71 else:
72 if isinstance(estimator, type):
File c:\Anaconda3_x64\lib\copy.py:161, in deepcopy(x, memo, _nil)
159 reductor = getattr(x, "__reduce_ex__", None)
160 if reductor is not None:
--> 161 rv = reductor(4)
162 else:
163 reductor = getattr(x, "__reduce__", None)
TypeError: cannot pickle 'generator' object
I'm not sure that this is possible before version 1.3 without writing a manual loop to replace
cross_val_score. Besides the generator issue, you're trying to tell the grid search object that it should split all of X, but it won't see all of X (it having already been split by the outer splitter). In 1.3, we get metadata routing, which automatically routes
groups to group splitters. Then we can do, e.g.: Just to check that this really routes to both splitters, here's a modified version of your script:
The outer splits put a single group in the test set, then the inner splits pick one of the remaining three as test and the last two in train. Here's my output: