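The duplicates appear because, in the scikit-learn version shown here, train_test_split() ultimately defines its strata as the set of unique values found in whatever is passed to stratify. When the strata come from two columns, a single row belongs to more than one stratum, so the sampler can pick the same row twice, believing it is drawing from different classes.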
train_test_split() delegates to StratifiedShuffleSplit, which calls np.unique() on y (that is, on whatever you pass in via stratify). From the source code:
# Excerpt: with no `axis` argument np.unique works on the flattened input, so
# a two-column `y` contributes the labels of both columns to one class list.
# `y_indices` maps every element of `y` back to its position in `classes`.
classes, y_indices = np.unique(y, return_inverse=True)
# Number of distinct labels found.
n_classes = classes.shape[0]
Here is a simplified example, a variation on the one you provided:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
# Demo data: a unique integer id column `a` plus two independent categorical
# columns, each drawn at random from two levels (`b` is drawn before `c`).
N = 20
a = np.arange(N)
b, c = (
    np.random.choice(levels, size=N)
    for levels in (["foo", "bar"], ["y", "z"])
)
df = pd.DataFrame(dict(a=a, b=b, c=c))
print(df)
a b c
0 0 bar y
1 1 foo y
2 2 bar z
3 3 bar y
4 4 foo z
5 5 bar y
...
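The stratification routine sees four classes to split on: foo, bar, y and z. But these classes are nested (y and z each occur both where b == foo and where b == bar), so when the splitter samples from each class, the same row can be drawn more than once.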
# Stratify on both label columns at once; this is the call under discussion.
strata = df[['b', 'c']]
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    stratify=strata,
)
train_ids = train.a.values
# Rows in the training split, then the number of distinct ids among them.
# A smaller second number means some rows were selected more than once.
print(len(train_ids)) # 16
print(len(set(train_ids))) # 12
print(train)
a b c
3 3 bar y # selecting a = 3 for b = bar*
5 5 bar y
13 13 foo y
4 4 foo z
14 14 bar z
10 10 foo z
3 3 bar y # selecting a = 3 for c = y
6 6 bar y
16 16 foo y
18 18 bar z
6 6 bar y
8 8 foo y
18 18 bar z
7 7 bar z
4 4 foo z
19 19 bar y
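This raises a larger design question: do you want nested stratified sampling, or do you really want each value in df.b and df.c treated as a separate class to sample from? If the latter, that is what you are already getting. The former is more complicated, and train_test_split is not set up to do it.
If nested stratification is what you need, one option is to combine the columns into a single key and stratify on that.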