Use a `Pipeline`:
>>> from sklearn.cluster import KMeans
>>> from sklearn.decomposition import RandomizedPCA
>>> from sklearn.decomposition import TruncatedSVD
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> from sklearn.pipeline import make_pipeline
>>> sentences = [
...     "fix grammatical or spelling errors",
...     "clarify meaning without changing it",
...     "correct minor mistakes",
...     "add related resources or links",
...     "always respect the original author"
... ]
>>> vectorizer = CountVectorizer(min_df=1)
>>> svd = TruncatedSVD(n_components=5)
>>> km = KMeans(n_clusters=2, init='random', n_init=1)
>>> pipe = make_pipeline(vectorizer, svd, km)
>>> pipe.fit(sentences)
Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None, stop_words=None,...n_init=1, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=1))])
>>> pipe.predict(["hello, world"])
array([0], dtype=int32)
(TruncatedSVD is shown here because RandomizedPCA will stop working with term-frequency matrices in an upcoming scikit-learn version — and in fact it was always performing an SVD, not a full PCA, on such input.)
source share