First problem: you have the wrong directory structure. load_files expects one sub-folder per class inside a container folder, like this:
container_folder/
    CLASS_1_folder/
        file_1.txt, file_2.txt, ...
    CLASS_2_folder/
        file_1.txt, file_2.txt, ...
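Note that for a meaningful test score you should hold out part of the data, for example with train_test_split, instead of scoring on the documents used for training.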
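Second problem: the vectorizer must be given the list of documents, not the object returned by load_files. Change
X_train = vectorizer.fit_transform(text_train_subset)
to
X_train = vectorizer.fit_transform(text_train_subset.data)
Complete example: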
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
# Load the labelled documents found under the container folder.
text_train_subset = load_files('sample-data/web')
# NOTE: the "test" set is the very same object as the training set, so the
# testing score printed below is measured on data the model has already seen.
text_test_subset = text_train_subset

# Learn the vocabulary from the training documents and build the count matrix.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train_subset.data)
y_train = text_train_subset.target

# Fit the Naive Bayes model on the training counts.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

train_accuracy = classifier.score(X_train, y_train)
print("Training score: {0:.1f}%".format(train_accuracy * 100))

# Reuse the fitted vocabulary (transform, not fit_transform) for the test documents.
X_test = vectorizer.transform(text_test_subset.data)
y_test = text_test_subset.target

test_accuracy = classifier.score(X_test, y_test)
print("Testing score: {0:.1f}%".format(test_accuracy * 100))
The sample-data/web directory used above has this layout:
sample-data/web
├── de
│   ├── apollo8.txt
│   ├── fiv.txt
│   ├── habichtsadler.txt
└── en
    ├── elizabeth_needham.txt
    ├── equipartition_theorem.txt
    ├── sunderland_echo.txt
    └── thespis.txt