Problems installing gensim

# -*- coding: utf-8 -*- 

'''
Module that builds a word2vec model from morphologically analysed Korean text.
The input is a file of morphologically analysed text,
one sentence per line.
The output is the trained model.
'''

import codecs
import gensim
import multiprocessing
import word2vec

import sys
reload(sys)
sys.setdefaultencoding('utf8')

# Model configuration: these entries are unpacked as keyword arguments into
# gensim.models.Word2Vec when the model object is created.
config = dict(
    min_count=15,  # frequency cut-off: words seen fewer times than this are ignored
    size=300,  # embed words in a 300-dimensional vector space
    sg=1,  # 0 selects CBOW, 1 selects skip-gram
    batch_words=10000,  # target number of words per batch handed to the workers
    iter=10,  # passes over the corpus (comparable to epochs in deep learning)
    workers=multiprocessing.cpu_count(),  # one worker thread per CPU core
)

# Create the Word2Vec model object from the settings in `config`. No corpus is
# passed at construction time; the vocabulary scan and the training run are
# issued later through explicit build_vocab() / train() calls.
modelTwitterNoStop = gensim.models.Word2Vec(**config)

class SentenceReader(object):
    """Re-iterable stream of tokenised sentences backed by a text file.

    The file is read as UTF-8, one sentence per line. Every call to
    ``iter()`` re-opens the file, so the same reader can be consumed
    several times (e.g. once for the vocabulary scan and once per
    training pass).
    """

    def __init__(self, filepath):
        """Store the corpus location; nothing is opened until iteration.

        Args:
            filepath: Path of the UTF-8 encoded corpus file.
        """
        self.filepath = filepath

    def __iter__(self):
        """Yield each line of the corpus as a list of tokens.

        Tokens are split on any run of whitespace, so the line terminator
        never ends up glued to the last word and repeated spaces do not
        produce empty tokens. A blank line yields an empty list.

        Yields:
            list: The whitespace-separated tokens of one line.
        """
        # The context manager closes the handle even when the consumer
        # stops iterating early or an exception is raised mid-file.
        with codecs.open(self.filepath, encoding='utf-8') as corpus:
            for line in corpus:
                yield line.split()

# Build the vocabulary and train the model from the morphologically analysed
# corpus file. Two independent readers over the same file are used: one for
# the vocabulary scan and one for the training pass.
corpus_path = 'corpusAllNewsNoTagNoStop.txt'
sentences_vocab = SentenceReader(corpus_path)
sentences_train = SentenceReader(corpus_path)

modelTwitterNoStop.build_vocab(sentences_vocab)
# NOTE(review): gensim >= 2.0 requires explicit total_examples/epochs arguments
# on train(); this bare call presumably targets an older release -- confirm
# against the installed gensim version.
modelTwitterNoStop.train(sentences_train)

# Save the trained model to disk so it can be loaded again later.
modelTwitterNoStop.save('modelTwitterNoStop')


### Model test ##########
# The lines from here on can be split off into a separate test program to experiment with various word relations.

import codecs
import gensim
import multiprocessing

import sys
reload(sys)
sys.setdefaultencoding('utf8')

##๋„ ํฌํ•จ


# Load the gensim model that was saved earlier.
modelTwitterNoStop = gensim.models.Word2Vec.load('modelTwitterNoStop')

# The query words below are written as real UTF-8 Korean text. A mis-encoded
# (mojibake) literal can never match a token learned from a UTF-8 Korean
# corpus, so the lookups would fail with KeyError.
# print(...) with a single argument behaves the same on Python 2 and Python 3.

# most_similar test: the 10 vectors closest to "Korea" + "Tokyo" - "Seoul".
# The original author reports that "Japan" came out on top.
analogy_hits = modelTwitterNoStop.most_similar(
    positive=[u"한국", u"도쿄"], negative=[u"서울"], topn=10)
print(' '.join(["{}-{}".format(word, value) for word, value in analogy_hits]))

print("\n")

# The same kind of query works with positive terms only; this one uses the
# multiplicative (cosmul) variant.
cosmul_hits = modelTwitterNoStop.most_similar_cosmul(
    positive=[u"비선", u"최순실"], topn=20)
print(' '.join(["{}-{}".format(word, value) for word, value in cosmul_hits]))
print("\n")

# doesnt_match test: which word does not belong with the others.
print(modelTwitterNoStop.doesnt_match(u"정윤회 김종 박근혜 최순실".split()))

print("\n")

# similarity test: similarity score between two words.
print(modelTwitterNoStop.similarity(u"비선", u"정윤회"))
print("\n")

# Printing the model shows its summary, including the vocabulary size.
print(modelTwitterNoStop)
print("\n")

Warning (from warnings module): File "C:\Python27\lib\site-packages\gensim\utils.py", line 840: warnings.warn("detected Windows; aliasing chunkize to chunkize_serial") UserWarning: detected Windows; aliasing chunkize to chunkize_serial

+4
source share
2 answers

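This is not a problem: on Windows gensim simply handles chunking differently (it falls back to the serial `chunkize_serial`). If you do not want to see this message, you can suppress it with the code from this answer, for example by calling `warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')` before `import gensim`.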

0
source

, . , gensim. , .

0

Source: https://habr.com/ru/post/1662271/


All Articles