Please help me understand the difference between gensim's TaggedDocument and LabeledSentence, and how each of them works. My ultimate goal is text classification using the Doc2Vec model and any classifier. I am following this blog!
def _read_tagged_lines(dirname, wrap):
    """Turn every line of every file in *dirname* into one tagged document.

    Each line is lower-cased, split on whitespace and filtered against the
    NLTK English stop-word list. The surviving tokens are passed to *wrap*
    together with a single tag of the form ``<file stem>_<line index>``,
    where the file stem is the part of the file name before the first dot.

    Args:
        dirname: Directory whose entries are all opened as text files.
        wrap: Callable taking ``(words, tags)``; in this snippet it is
            ``LabeledSentence`` or ``TaggedDocument``.

    Returns:
        A list with one wrapped document per input line, in the order
        ``os.listdir`` and the files themselves yield them.
    """
    # Read the stop-word list once instead of once per token; a set also
    # makes each membership test O(1).
    stop_words = set(stopwords.words('english'))

    docs = []
    for fname in os.listdir(dirname):
        tag_prefix = fname.split('.')[0].strip()
        with open(os.path.join(dirname, fname)) as fin:
            for item_no, sentence in enumerate(fin):
                # NOTE(review): this keeps ONLY stop words. For text
                # classification the intent is presumably `not in`;
                # behaviour preserved here - confirm with the author.
                words = [w for w in sentence.lower().split() if w in stop_words]
                docs.append(wrap(words, [tag_prefix + '_%s' % item_no]))
    return docs


class MyLabeledSentences(object):
    """Collects the lines of all files in a directory as ``LabeledSentence`` objects."""

    def __init__(self, dirname, dataDct=None, sentList=None):
        """Remember *dirname* and start with empty containers.

        ``dataDct`` and ``sentList`` are accepted only for signature
        compatibility; as in the original code their values are ignored.
        """
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        """Append one ``LabeledSentence`` per input line and return the list.

        Returns:
            ``self.sentList``. Calling this method again appends the
            documents a second time.
        """
        self.sentList.extend(_read_tagged_lines(self.dirname, LabeledSentence))
        # The original returned the undefined name `sentList` (NameError).
        return self.sentList


class MyTaggedDocument(object):
    """Collects the lines of all files in a directory as ``TaggedDocument`` objects."""

    def __init__(self, dirname, dataDct=None, sentList=None):
        """Remember *dirname* and start with empty containers.

        ``dataDct`` and ``sentList`` are accepted only for signature
        compatibility; as in the original code their values are ignored.
        """
        self.dirname = dirname
        self.dataDct = {}
        self.sentList = []

    def ToArray(self):
        """Append one ``TaggedDocument`` per input line and return the list.

        Returns:
            ``self.sentList``. Calling this method again appends the
            documents a second time.
        """
        self.sentList.extend(_read_tagged_lines(self.dirname, TaggedDocument))
        # The original returned the undefined name `sentList` (NameError).
        return self.sentList


sentences = MyLabeledSentences(some_dir_name)
model_l = Doc2Vec(min_count=1, window=10, size=300, sample=1e-4, negative=5, workers=7)
sentences_l = sentences.ToArray()
model_l.build_vocab(sentences_l)
for epoch in range(15):
    # NOTE(review): the training-loop body is missing from the original
    # snippet; `pass` only keeps the file syntactically valid.
    pass
My question is: is model_l.docvecs['some_word'] the same as model_t.docvecs['some_word']? Can you also point me to good sources for understanding how TaggedDocument and LabeledSentence work?
source share