I am new to Keras and I am trying to solve a sentence-similarity task with a neural network in Keras. I use word2vec as the word embedding, and then a Siamese network to predict how similar two sentences are. The base network of the Siamese network is an LSTM, and to merge the two base networks I use a Lambda layer with a cosine distance. As a dataset I use SICK, which scores each pair of sentences from 1 (different) to 5 (very similar); I turn the score into a distance-like target with `tr_y = 1 - relatedness_score / 5`, so a score of 5 maps to 0 and a score of 1 maps to 0.8.
I created a network and it runs, but I have a lot of doubts. First of all, I am not sure this is the right way to feed the sentences to the LSTM: I look up the word2vec embedding of each word and build one flat array per sentence, zero-padding it up to `seq_len * feature_dim` so that all the arrays have the same length, and then I reshape it like this: `data_A = embedding_A.reshape((len(embedding_A), seq_len, feature_dim))`.
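To make that concrete, here is a minimal standalone sketch of my padding/reshape step, using toy 3-dimensional vectors instead of my real 100-dimensional word2vec embeddings:

```python
import numpy as np

feature_dim = 3  # toy dimension; in my real code it is 100

# Each sentence is one flat array: its word vectors concatenated end to end
embedding_A = [np.random.rand(2 * feature_dim),   # a 2-word sentence
               np.random.rand(4 * feature_dim)]   # a 4-word sentence

# Zero-pad every sentence up to the longest one
max_len = max(e.shape[0] for e in embedding_A)
padded = [np.concatenate((e, np.zeros(max_len - e.shape[0]))) for e in embedding_A]

# Reshape into (samples, seq_len, feature_dim) for the LSTM
seq_len = max_len // feature_dim
data_A = np.array(padded).reshape((len(padded), seq_len, feature_dim))
print(data_A.shape)  # (2, 4, 3)
```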
In addition, I'm not sure my Siamese network is correct: many predictions for different pairs come out identical, and the loss barely changes (from 0.3300 to 0.2105 over 10 epochs, and it does not change much more over 100 epochs).
Can someone help me find and understand my mistakes? Thanks a lot (and sorry for my bad English).
Here is the relevant part of my code:
```python
import os
import numpy as np
import pandas as pd
from keras import backend as K
from keras.layers import Dense, Input, Lambda, LSTM
from keras.models import Model, Sequential
from keras.optimizers import Adam

def cosine_distance(vecs):
    # Cosine distance between the two tower outputs, averaged over the batch
    y_true, y_pred = vecs
    y_true = K.l2_normalize(y_true, axis=-1)
    y_pred = K.l2_normalize(y_pred, axis=-1)
    return K.mean(1 - K.sum(y_true * y_pred, axis=-1))

def cosine_dist_output_shape(shapes):
    shape1, shape2 = shapes
    print((shape1[0], 1))
    return (shape1[0], 1)
```
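To convince myself the Lambda does what I expect, I evaluated it on toy vectors (this snippet reuses the imports above):

```python
# Identical pair -> distance 0, orthogonal pair -> distance 1,
# so the mean over this batch of two pairs should be 0.5
a = K.constant(np.array([[1.0, 0.0], [0.0, 1.0]]))
b = K.constant(np.array([[1.0, 0.0], [1.0, 0.0]]))
print(K.eval(cosine_distance([a, b])))  # prints 0.5
```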
```python
def contrastive_loss(y_true, y_pred):
    # Contrastive loss with margin 1, averaged over the batch
    margin = 1
    return K.mean(y_true * K.square(y_pred) +
                  (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
```
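Written out, this computes (with $d_i$ the predicted distance, $y_i$ the normalized label, and margin $m = 1$):

$$L = \frac{1}{N} \sum_{i=1}^{N} \left[\, y_i \, d_i^2 + (1 - y_i) \max(m - d_i,\ 0)^2 \,\right]$$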
```python
def create_base_network(feature_dim, seq_len):
    # Shared LSTM tower applied to both sentences
    model = Sequential()
    model.add(LSTM(100, batch_input_shape=(1, seq_len, feature_dim), return_sequences=True))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(10, activation='relu'))
    return model

def siamese(feature_dim, seq_len, epochs, tr_dataA, tr_dataB, tr_y, te_dataA, te_dataB, te_y):
    base_network = create_base_network(feature_dim, seq_len)

    input_a = Input(shape=(seq_len, feature_dim))
    input_b = Input(shape=(seq_len, feature_dim))

    # Both inputs go through the same shared tower
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)

    distance = Lambda(cosine_distance,
                      output_shape=cosine_dist_output_shape)([processed_a, processed_b])

    model = Model([input_a, input_b], distance)
    adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam, loss=contrastive_loss)
    model.fit([tr_dataA, tr_dataB], tr_y,
              batch_size=128,
              epochs=epochs,
              validation_data=([te_dataA, te_dataB], te_y))

    pred = model.predict([tr_dataA, tr_dataB])
    tr_acc = compute_accuracy(pred, tr_y)  # compute_accuracy is defined elsewhere in my script
    for i in range(len(pred)):
        print(pred[i], tr_y[i])

    return model

def padding(max_len, embedding):
    # Zero-pad every flat sentence array up to max_len
    for i in range(len(embedding)):
        padding = np.zeros(max_len - embedding[i].shape[0])
        embedding[i] = np.concatenate((embedding[i], padding))
    embedding = np.array(embedding)
    return embedding

def getAB(sentences_A, sentences_B, feature_dim, word2idx, idx2word, weights, max_len_def=0):
    # from_sentence_to_array (defined elsewhere) concatenates the word2vec
    # vector of every word in a sentence into one flat array
    embedding_A, max_len_A = from_sentence_to_array(sentences_A, word2idx, idx2word, weights)
    embedding_B, max_len_B = from_sentence_to_array(sentences_B, word2idx, idx2word, weights)

    max_len = max(max_len_A, max_len_B, max_len_def * feature_dim)
    embedding_A = padding(max_len, embedding_A)
    embedding_B = padding(max_len, embedding_B)

    seq_len = int(max_len / feature_dim)
    print(seq_len)

    data_A = embedding_A.reshape((len(embedding_A), seq_len, feature_dim))
    data_B = embedding_B.reshape((len(embedding_B), seq_len, feature_dim))
    print('A,B shape: ', data_A.shape, data_B.shape)

    return data_A, data_B, seq_len

FEATURE_DIMENSION = 100
MIN_COUNT = 10
WINDOW = 5

if __name__ == '__main__':
    data = pd.read_csv('data\\train.csv', sep='\t')
    sentences_A = data['sentence_A']
    sentences_B = data['sentence_B']
    tr_y = 1 - data['relatedness_score'] / 5

    # EMBEDDING_PATH and VOCAB_PATH are constants defined elsewhere;
    # create_embeddings and load_vocab_and_weights train/load the word2vec model
    if not (os.path.exists(EMBEDDING_PATH) and os.path.exists(VOCAB_PATH)):
        create_embeddings(embeddings_path=EMBEDDING_PATH, vocab_path=VOCAB_PATH,
                          size=FEATURE_DIMENSION, min_count=MIN_COUNT,
                          window=WINDOW, sg=1, iter=25)
    word2idx, idx2word, weights = load_vocab_and_weights(VOCAB_PATH, EMBEDDING_PATH)

    tr_dataA, tr_dataB, seq_len = getAB(sentences_A, sentences_B, FEATURE_DIMENSION,
                                        word2idx, idx2word, weights)

    test = pd.read_csv('data\\test.csv', sep='\t')
    test_sentences_A = test['sentence_A']
    test_sentences_B = test['sentence_B']
    te_y = 1 - test['relatedness_score'] / 5
    te_dataA, te_dataB, seq_len = getAB(test_sentences_A, test_sentences_B, FEATURE_DIMENSION,
                                        word2idx, idx2word, weights, seq_len)

    model = siamese(FEATURE_DIMENSION, seq_len, 10, tr_dataA, tr_dataB, tr_y,
                    te_dataA, te_dataB, te_y)

    test_a = ['this is my dog']
    test_b = ['this dog is mine']
    a, b, seq_len = getAB(test_a, test_b, FEATURE_DIMENSION, word2idx, idx2word, weights, seq_len)
    prediction = model.predict([a, b])
    print(prediction)
```
Some of the results (predictions vs. labels on the training set):

```
my prediction | true label
0.849908 0.8
0.849908 0.8
0.849908 0.74
0.849908 0.76
0.849908 0.66
0.849908 0.72
0.849908 0.64
0.849908 0.8
0.849908 0.78
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.74
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.66
0.849908 0.8
0.849908 0.66
0.849908 0.56
0.849908 0.8
0.849908 0.8
0.849908 0.76
0.847546 0.78
0.847546 0.8
0.847546 0.74
0.847546 0.76
0.847546 0.72
0.847546 0.8
0.847546 0.78
0.847546 0.8
0.847546 0.72
0.847546 0.8
0.847546 0.8
0.847546 0.78
0.847546 0.8
0.847546 0.78
0.847546 0.78
0.847546 0.46
0.847546 0.72
0.847546 0.8
0.847546 0.76
0.847546 0.8
0.847546 0.8
0.847546 0.8
0.847546 0.8
0.847546 0.74
0.847546 0.8
0.847546 0.72
0.847546 0.68
0.847546 0.56
0.847546 0.8
0.847546 0.78
0.847546 0.78
0.847546 0.8
0.852975 0.64
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.44
0.852975 0.72
0.852975 0.8
0.852975 0.8
0.852975 0.76
0.852975 0.8
0.852975 0.8
0.852975 0.8
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.76
0.852975 0.8
```