What do the True/False values in the output of most_informative_features() mean?
In NLTK/Python:
import string
from itertools import chain
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
import nltk
stop = stopwords.words('english')
# Tokenize each review, dropping stopwords and punctuation; the label ('pos'/'neg') comes from the fileid path
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]
# Take the first 100 distinct words as the feature words
word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = list(word_features.keys())[:100]
# 90/10 split; each feature value is a boolean: "does this word occur in the document?"
numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[numtrain:]]
classifier = nbc.train(train_set)
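As an aside, the test_set built above is never used in what follows; a quick sanity check with NLTK's accuracy helper (added here only for illustration) would be:

# Fraction of held-out reviews whose label the classifier predicts correctly
print(nltk.classify.accuracy(classifier, test_set))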
The ten most informative features:
print(classifier.most_informative_features(10))
[out]:
[('turturro', True),
('inhabiting', True),
('taboo', True),
('conflicted', True),
('overacts', True),
('rescued', True),
('stepdaughter', True),
('apologizing', True),
('pup', True),
('inform', True)]
And listing all of the features:
classifier.most_informative_features(n=len(word_features))
[out]:
[('turturro', True),
('inhabiting', True),
('taboo', True),
('conflicted', True),
('overacts', True),
('rescued', True),
('stepdaughter', True),
('apologizing', True),
('pup', True),
('inform', True),
('commercially', True),
('utilize', True),
('gratuitous', True),
('visible', True),
('internet', True),
('disillusioned', True),
('boost', True),
('preventing', True),
('built', True),
('repairs', True),
('overplaying', True),
('election', True),
('caterer', True),
('decks', True),
('retiring', True),
('pivot', True),
('outwitting', True),
('solace', True),
('benches', True),
('terrorizes', True),
('billboard', True),
('catalogue', True),
('clean', True),
('skits', True),
('nice', True),
('feature', True),
('must', True),
('withdrawn', True),
('indulgence', True),
('tribal', True),
('freeman', True),
('must', False),
('nice', False),
('feature', False),
('gratuitous', False),
('turturro', False),
('built', False),
('internet', False),
('rescued', False),
('clean', False),
('overacts', False),
('gregor', False),
('conflicted', False),
('taboo', False),
('inhabiting', False),
('utilize', False),
('churns', False),
('boost', False),
('stepdaughter', False),
('complementary', False),
('gleiberman', False),
('skylar', False),
('kirkpatrick', False),
('hardship', False),
('election', False),
('inform', False),
('disillusioned', False),
('visible', False),
('commercially', False),
('frosted', False),
('pup', False),
('apologizing', False),
('freeman', False),
('preventing', False),
('nutsy', False),
('intrinsics', False),
('somalia', False),
('coordinators', False),
('strengthening', False),
('impatience', False),
('subtely', False),
('426', False),
('schreber', False),
('brimley', False),
('motherload', False),
('creepily', False),
('perturbed', False),
('accountants', False),
('beringer', False),
('scrubs', False),
('1830s', False),
('analogue', False),
('espouses', False),
('xv', False),
('skits', False),
('solace', False),
('reduncancy', False),
('parenthood', False),
('insulators', False),
('mccoll', False)]
And the return types:
>>> type(classifier.most_informative_features(n=len(word_features)))
list
>>> type(classifier.most_informative_features(10)[0][1])
bool
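To see where these pairs come from, look at one training instance's feature dictionary (an illustrative extra, assuming the snippet above has been run):

# Each training instance is (feature_dict, label); the feature dict maps each of
# the 100 feature words to True/False depending on whether the word occurs in
# that review, and these items are the (name, value) pairs seen above.
print(list(train_set[0][0].items())[:3])
print(train_set[0][1])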
So the second element of each pair returned by most_informative_features() is simply the feature value taken from these feature dictionaries (a boolean here, because each feature is "does this word occur in the document?"), not a flag saying whether the feature is informative. If the boolean values are replaced by the strings 'positive' and 'negative', those strings appear in the output instead:
import string
from itertools import chain
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
import nltk
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]
word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = list(word_features.keys())[:100]
numtrain = int(len(documents) * 90 / 100)
# Same features, but each value is now the string 'positive'/'negative' instead of a boolean
train_set = [({i:'positive' if (i in tokens) else 'negative' for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:'positive' if (i in tokens) else 'negative' for i in word_features}, tag) for tokens,tag in documents[numtrain:]]
classifier = nbc.train(train_set)
>>> classifier.most_informative_features(10)
[('turturro', 'positive'),
('inhabiting', 'positive'),
('conflicted', 'positive'),
('taboo', 'positive'),
('overacts', 'positive'),
('rescued', 'positive'),
('stepdaughter', 'positive'),
('pup', 'positive'),
('apologizing', 'positive'),
('inform', 'positive')]
>>> type(classifier.most_informative_features(10)[0][1])
str
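For completeness: NLTK ranks these (name, value) pairs by how skewed P(feature=value | label) is across the labels, i.e. the highest such probability divided by the lowest. The related method show_most_informative_features() prints that ratio next to each feature; it is standard NLTK API, shown here only as an extra illustration:

# Prints lines of the form "<word> = True    pos : neg = 9.0 : 1.0"
# (numbers illustrative), where the right-hand side is the likelihood
# ratio used for the ranking.
classifier.show_most_informative_features(10)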