In the GroupKFold
source, the random_state
is set to None
def __init__(self, n_splits=3):
    # Only n_splits is configurable here: shuffle and random_state are
    # hard-coded to False / None when delegating to the parent constructor.
    super(GroupKFold, self).__init__(n_splits, shuffle=False,
                                     random_state=None)
Hence, when run multiple times (code from here)
import numpy as np
from sklearn.model_selection import GroupKFold
# Repeat the same split ten times to show that every trial yields the
# same folds.
for i in range(10):
    X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    y = np.array([1, 2, 3, 4])
    groups = np.array([0, 0, 2, 2])
    group_kfold = GroupKFold(n_splits=2)
    group_kfold.get_n_splits(X, y, groups)
    print(group_kfold)
    for train_index, test_index in group_kfold.split(X, y, groups):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        print(X_train, X_test, y_train, y_test)
    # Two blank separator lines between trials. print("") emits an empty
    # line on both Python 2 and Python 3, whereas a bare `print` is a
    # no-op expression on Python 3.
    print("")
    print("")
Output:
GroupKFold(n_splits=2)
('TRAIN:', array([0, 1]), 'TEST:', array([2, 3]))
(array([[1, 2],
[3, 4]]), array([[5, 6],
[7, 8]]), array([1, 2]), array([3, 4]))
('TRAIN:', array([2, 3]), 'TEST:', array([0, 1]))
(array([[5, 6],
[7, 8]]), array([[1, 2],
[3, 4]]), array([3, 4]), array([1, 2]))
GroupKFold(n_splits=2)
('TRAIN:', array([0, 1]), 'TEST:', array([2, 3]))
(array([[1, 2],
[3, 4]]), array([[5, 6],
[7, 8]]), array([1, 2]), array([3, 4]))
('TRAIN:', array([2, 3]), 'TEST:', array([0, 1]))
(array([[5, 6],
[7, 8]]), array([[1, 2],
[3, 4]]), array([3, 4]), array([1, 2]))
etc ...
The splits are identical.
How do I set a random_state
for GroupKFold
in order to get a different (but reproducible) set of splits over a few different trials of cross-validation?
E.g., I want
GroupKFold(n_splits=2, random_state=42)
('TRAIN:', array([0, 1]),
'TEST:', array([2, 3]))
('TRAIN:', array([2, 3]),
'TEST:', array([0, 1]))
GroupKFold(n_splits=2, random_state=13)
('TRAIN:', array([0, 2]),
'TEST:', array([1, 3]))
('TRAIN:', array([1, 3]),
'TEST:', array([0, 2]))
So far, it seems a strategy might be to use sklearn.utils.shuffle
first, as suggested in this post. However, this actually just rearranges the elements of each fold --- it doesn't give us new splits.
from sklearn.utils import shuffle
from sklearn.model_selection import GroupKFold
import numpy as np
import sys
import pdb
# Seed for the shuffle, taken from the first command-line argument.
random_state = int(sys.argv[1])

# Ten samples with two features each, and one integer target per sample.
X = np.arange(20).reshape(10, 2)
y = np.arange(10)

# Group label of each sample: group 0 holds three samples, groups 1-7 hold
# one sample each.
groups = np.array([0, 0, 0, 1, 2, 3, 4, 5, 6, 7])
def cv(X, y, groups, random_state):
    """Shuffle the data, split it with a 2-fold GroupKFold and print each fold.

    X, y and groups are passed together through ``sklearn.utils.shuffle``
    using ``random_state`` as the seed; the shuffled arrays are then split
    with ``GroupKFold(n_splits=2)``.  For every fold the test rows of X,
    the test targets, and the group labels of the test and train parts are
    printed.  Nothing is returned.
    """
    X_s, y_s, groups_s = shuffle(X, y, groups, random_state=random_state)
    splitter = GroupKFold(n_splits=2)
    for train, test in splitter.split(X_s, y_s, groups_s):
        # Single-argument print(...) calls produce the same text on
        # Python 2 and Python 3.
        print("---")
        print(X_s[test])
        print(y_s[test])
        print("test groups %s" % (groups_s[test],))
        print("train groups %s" % (groups_s[train],))
# Marker printed before the fold report; the call form works on both
# Python 2 and Python 3.
print("***")
cv(X, y, groups, random_state)
The output:
>python sshuf.py 32
***
---
[[ 2 3]
[ 4 5]
[ 0 1]
[ 8 9]
[12 13]]
[1 2 0 4 6]
test groups [0 0 0 2 4]
train groups [7 6 1 3 5]
---
[[18 19]
[16 17]
[ 6 7]
[10 11]
[14 15]]
[9 8 3 5 7]
test groups [7 6 1 3 5]
train groups [0 0 0 2 4]
>python sshuf.py 234
***
---
[[12 13]
[ 4 5]
[ 0 1]
[ 2 3]
[ 8 9]]
[6 2 0 1 4]
test groups [4 0 0 0 2]
train groups [7 3 1 5 6]
---
[[18 19]
[10 11]
[ 6 7]
[14 15]
[16 17]]
[9 5 3 7 8]
test groups [7 3 1 5 6]
train groups [4 0 0 0 2]