- , . :
, , . . , unigram[word]
, . . unigram , , , .
perplexity = 1
N = 0
for word in testset:
if word in unigram:
N += 1
perplexity = perplexity * (1/unigram[word])
perplexity = pow(perplexity, 1/float(N))
:
, .
, :
corpus ="""
Monty Python (sometimes known as The Pythons) were a British surreal comedy group who created the sketch comedy show Monty Python Flying Circus,
that first aired on the BBC on October 5, 1969. Forty-five episodes were made over four series. The Python phenomenon developed from the television series
into something larger in scope and impact, spawning touring stage shows, films, numerous albums, several books, and a stage musical.
The group influence on comedy has been compared to The Beatles' influence on music."""
:
import collections, nltk
tokens = nltk.word_tokenize(corpus)
def unigram(tokens):
model = collections.defaultdict(lambda: 0.01)
for f in tokens:
try:
model[f] += 1
except KeyError:
model [f] = 1
continue
N = float(sum(model.values()))
for word in model:
model[word] = model[word]/N
return model
. , , 0.01
. , :
def perplexity(testset, model):
testset = testset.split()
perplexity = 1
N = 0
for word in testset:
N += 1
perplexity = perplexity * (1/model[word])
perplexity = pow(perplexity, 1/float(N))
return perplexity
:
testset1 = "Monty"
testset2 = "abracadabra gobbledygook rubbish"
model = unigram(tokens)
print perplexity(testset1, model)
print perplexity(testset2, model)
:
>>>
49.09452736318415
100.0
, , , . , , , . " Monty
, .