I use word2vec to represent a short phrase (3 to 4 words) as a single vector, either by summing the individual word embeddings or by averaging the word embeddings.
In the experiments I've run, I always get the same cosine similarity for both. I suspect this is because the word vectors produced by word2vec are normalized to unit length (Euclidean norm) after training? Or either I have a bug in the code, or I'm missing something.
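For what it's worth, I can reproduce the same behaviour with random stand-in vectors that are definitely not unit-normalized, so it doesn't seem specific to word2vec. This is just a toy sketch (random vectors instead of my actual model):

import numpy as np
from numpy.linalg import norm

rng = np.random.RandomState(0)
words_1 = [rng.rand(200) for _ in range(3)]  # stand-ins for 3 word vectors
words_2 = [rng.rand(200) for _ in range(4)]  # stand-ins for 4 word vectors

cos = lambda u, v: np.dot(u, v) / (norm(u) * norm(v))

# sum vs. average of the same word vectors
print cos(np.sum(words_1, axis=0), np.sum(words_2, axis=0))
print cos(np.mean(words_1, axis=0), np.mean(words_2, axis=0))  # prints the same value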
Here is the code:
import numpy as np
from nltk import PunktWordTokenizer
from gensim.models import Word2Vec
from numpy.linalg import norm
from scipy.spatial.distance import cosine

def pattern2vector(tokens, word2vec, AVG=False):
    pattern_vector = np.zeros(word2vec.layer1_size)
    n_words = 0
    if len(tokens) > 1:
        # sum the vectors of all tokens found in the vocabulary
        for t in tokens:
            try:
                vector = word2vec[t.strip()]
                pattern_vector = np.add(pattern_vector, vector)
                n_words += 1
            except KeyError:
                continue
        if AVG is True:
            # divide by the number of in-vocabulary tokens to get the average
            pattern_vector = np.divide(pattern_vector, n_words)
    elif len(tokens) == 1:
        try:
            pattern_vector = word2vec[tokens[0].strip()]
        except KeyError:
            pass
    return pattern_vector

def main():
    print "Loading word2vec model ...\n"
    word2vecmodelpath = "/data/word2vec/vectors_200.bin"
    word2vec = Word2Vec.load_word2vec_format(word2vecmodelpath, binary=True)
    pattern_1 = 'founder and ceo'
    pattern_2 = 'co-founder and former chairman'
    tokens_1 = PunktWordTokenizer().tokenize(pattern_1)
    tokens_2 = PunktWordTokenizer().tokenize(pattern_2)
    print "vec1", tokens_1
    print "vec2", tokens_2

    # phrase vectors built by summing the word vectors
    p1 = pattern2vector(tokens_1, word2vec, False)
    p2 = pattern2vector(tokens_2, word2vec, False)
    print "\nSUM"
    print "dot(vec1,vec2)", np.dot(p1, p2)
    print "norm(p1)", norm(p1)
    print "norm(p2)", norm(p2)
    print "dot(norm(vec1),norm(vec2))", np.dot(norm(p1), norm(p2))
    print "cosine(vec1,vec2)", np.divide(np.dot(p1, p2), np.dot(norm(p1), norm(p2)))
    print "\n"

    # phrase vectors built by averaging the word vectors
    print "AVG"
    p1 = pattern2vector(tokens_1, word2vec, True)
    p2 = pattern2vector(tokens_2, word2vec, True)
    print "dot(vec1,vec2)", np.dot(p1, p2)
    print "norm(p1)", norm(p1)
    print "norm(p2)", norm(p2)
    print "dot(norm(vec1),norm(vec2))", np.dot(norm(p1), norm(p2))
    print "cosine(vec1,vec2)", np.divide(np.dot(p1, p2), np.dot(norm(p1), norm(p2)))

if __name__ == "__main__":
    main()
and here is the result:
Loading word2vec model ...

Dimensions 200
vec1 ['founder', 'and', 'ceo']
vec2 ['co-founder', 'and', 'former', 'chairman']

SUM
dot(vec1,vec2) 5.4008677771
norm(p1) 2.19382594282
norm(p2) 2.87226958166
dot(norm(vec1),norm(vec2)) 6.30125952303
cosine(vec1,vec2) 0.857109242583

AVG
dot(vec1,vec2) 0.450072314758
norm(p1) 0.731275314273
norm(p2) 0.718067395416
dot(norm(vec1),norm(vec2)) 0.525104960252
cosine(vec1,vec2) 0.857109242583
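As a cross-check, gensim also has a built-in n_similarity for comparing two lists of words; as far as I can tell it averages the unit-normalized word vectors and takes the cosine of the two averages, so the raw numbers may not match the computation above exactly:

from gensim.models import Word2Vec

word2vec = Word2Vec.load_word2vec_format("/data/word2vec/vectors_200.bin", binary=True)
# compares two sets of words (raises KeyError if a word is out of vocabulary)
print word2vec.n_similarity(['founder', 'and', 'ceo'],
                            ['co-founder', 'and', 'former', 'chairman'])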
I use cosine similarity as defined here: Cosine similarity (Wikipedia). The values of the norms and the dot products are really different.
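As a sanity check of the definition itself, i.e. cosine(u, v) = dot(u, v) / (norm(u) * norm(v)), I verified my hand computation against scipy (note that scipy's cosine is a distance, i.e. 1 - similarity):

import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import cosine

u = np.array([1.0, 2.0, 3.0])
v = np.array([4.0, 5.0, 6.0])
print np.dot(u, v) / (norm(u) * norm(v))  # 0.9746318...
print 1.0 - cosine(u, v)                  # same value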
Can anyone explain why the cosine similarity is the same in both cases?
Thanks, David