'''
Module that turns Korean morphologically-analyzed text into a word2vec model.
Input: a morphologically analyzed corpus file, one sentence per line.
Output: a trained gensim Word2Vec model, saved to disk.
'''
import codecs
import gensim
import multiprocessing
import word2vec
import sys
reload(sys)
sys.setdefaultencoding('utf8')
# Word2Vec training hyperparameters.
# NOTE(review): 'size' and 'iter' are the pre-4.0 gensim parameter names
# (renamed to 'vector_size' / 'epochs' in gensim 4.x) -- this file targets
# old gensim on Python 2; confirm the installed version before running.
config = {
'min_count': 15,  # ignore tokens appearing fewer than 15 times in the corpus
'size': 300,  # dimensionality of the word vectors
'sg': 1,  # 1 = skip-gram (0 would be CBOW)
'batch_words': 10000,  # words per batch handed to each worker thread
'iter': 10,  # number of training passes (epochs) over the corpus
'workers': multiprocessing.cpu_count(),  # one worker thread per CPU core
}
# Fresh, untrained model; vocabulary is built and training is run below.
modelTwitterNoStop = gensim.models.Word2Vec(**config)
class SentenceReader:
    """Restartable iterable over a tokenized corpus file.

    The corpus is UTF-8 text, one sentence per line, tokens separated by
    whitespace. The file is re-opened on every ``__iter__`` call, so the
    same object can be consumed once by ``build_vocab()`` and again by
    ``train()`` (a plain generator could not be rewound).
    """

    def __init__(self, filepath):
        # Path to the UTF-8, one-sentence-per-line corpus file.
        self.filepath = filepath

    def __iter__(self):
        for line in codecs.open(self.filepath, encoding='utf-8'):
            # BUG FIX: the original used line.split(' '), which left the
            # trailing '\n' glued to the last token of every sentence and
            # so polluted the vocabulary. split() strips it, and yields []
            # for blank lines instead of ['\n'].
            yield line.split()
# Two independent readers over the same corpus file: build_vocab() and
# train() each need to scan the corpus from the beginning, and SentenceReader
# re-opens the file on every __iter__ call, so either one object or two works;
# two are used here for clarity.
sentences_vocab = SentenceReader('corpusAllNewsNoTagNoStop.txt')
sentences_train = SentenceReader('corpusAllNewsNoTagNoStop.txt')
modelTwitterNoStop.build_vocab(sentences_vocab)
# NOTE(review): a bare train(sentences) call only works on old gensim;
# gensim >= 1.0 requires train(sentences, total_examples=..., epochs=...) --
# confirm the installed version.
modelTwitterNoStop.train(sentences_train)
# Persist the trained model; reloaded below with Word2Vec.load().
modelTwitterNoStop.save('modelTwitterNoStop')
import codecs
import gensim
import multiprocessing
import sys
reload(sys)
sys.setdefaultencoding('utf8')
modelTwitterNoStop = gensim.models.Word2Vec.load('modelTwitterNoStop')
print ' '.join(["{}-{}".format(word, value) for word, value in
(modelTwitterNoStop.most_similar(positive=[u"ํ๊ตญ", u"๋์ฟ"], negative=[u"์ ์ธ"], topn=10))])
print "\n"
print ' '.join(["{}-{}".format(word, value) for word, value in
(modelTwitterNoStop.most_similar_cosmul(positive=[u"๋น์ ", u"์ต์์ค"], topn=20))])
print "\n"
print modelTwitterNoStop.doesnt_match(u"์ ์คํ ๊น์ข
๋ฐ๊ทผํ ์ต์์ค".split())
print "\n"
print modelTwitterNoStop.similarity(u"๋น์ ", u"์ ์คํ")
print "\n"
print modelTwitterNoStop
print "\n"
# NOTE(review): the lines below were pasted web-page residue, not Python code
# (a gensim runtime warning plus the "source / share" links of a Q&A page);
# commented out so the file can be parsed. Original text:
# Warning (from the warnings module): File "C:\Python27\lib\site-packages\gensim\utils.py",
# line 840: UserWarning: detected Windows; aliasing chunkize to chunkize_serial