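The vocabulary size can be capped with the `max_features` parameter of the vectorizer.
Here the vocabulary is limited to ~10 000 terms. Alternatively, `HashingVectorizer` can be used, which does not store a vocabulary at all.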
# Load the tab-separated product data. The file is read without a header
# row, so the two columns are named explicitly.
path = 'data/products.tsv'
products = pd.read_table(path, header=None, names=['label', 'entry'])

# Feature text and target labels.
X = products.entry
y = products.label

# Fixed random_state keeps the train/test split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Keep only the 10 000 most frequent terms in the vocabulary.
vect = CountVectorizer(max_features=10000)

# Cast every input to unicode strings, not just the data passed to fit():
# a missing entry is a float NaN, which CountVectorizer rejects, so the
# same conversion must be applied before each transform() as well.
X_train_text = X_train.values.astype('U')
X_test_text = X_test.values.astype('U')

# Learn the vocabulary on the training text only, then build both
# document-term matrices with that same vocabulary.
X_train_dtm = vect.fit_transform(X_train_text)
X_test_dtm = vect.transform(X_test_text)