Short answer:
- You need to pass the global step to the optimizer's minimize() call and run the resulting train op through mon_sess.run(). The global step recorded in each checkpoint is what lets you save and restore training progress.
- Use tf.train.MonitoredTrainingSession with a checkpoint_dir: it writes checkpoints periodically and, on restart, automatically restores from the latest one. Each call to mon_sess.run() then advances training by one step. Anything else you want to monitor, such as a test_loss (validation / cross-validation loss), can be fetched in the same run call. Details below.
Long answer:
To save and restore checkpoints, use tf.train.MonitoredTrainingSession (a thin wrapper around tf.train.MonitoredSession that sets up checkpointing for you).
Suppose we want to save a checkpoint every 5 seconds to './ckpt_dir'. Then:
import tensorflow as tf

def train(inputs, labels_onehot, global_step):
    # Note: sigmoid_cross_entropy_with_logits expects raw (pre-activation)
    # logits, so the layer itself gets no activation here.
    out = tf.contrib.layers.fully_connected(
        inputs,
        num_outputs=10,
        activation_fn=None)
    loss = tf.reduce_mean(
        tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=out,
                labels=labels_onehot), axis=1))
    # Any optimizer works; GradientDescentOptimizer is just an example.
    opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
    # Passing global_step makes minimize() increment it at every step,
    # so checkpoints record how far training has progressed.
    train_op = opt.minimize(loss, global_step=global_step)
    return train_op
with tf.Graph().as_default():
    global_step = tf.train.get_or_create_global_step()
    inputs = ...
    labels_onehot = ...
    train_op = train(inputs, labels_onehot, global_step)
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir='./ckpt_dir',
            save_checkpoint_secs=5,
            hooks=[ ... ]  # Choose your hooks
            ) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(train_op)
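Because checkpoint_dir is set, re-running the script resumes from the latest checkpoint automatically. As a quick sanity check, here is a minimal sketch (assuming the loop above has already written at least one checkpoint to './ckpt_dir', and tf is imported as above) for inspecting what was saved:
ckpt_path = tf.train.latest_checkpoint('./ckpt_dir')
print(ckpt_path)  # e.g. './ckpt_dir/model.ckpt-1234'; the step number varies

# The saved global step can be read straight out of the checkpoint; the
# variable created by tf.train.get_or_create_global_step() is named
# 'global_step'.
reader = tf.train.NewCheckpointReader(ckpt_path)
print(reader.get_tensor('global_step'))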
Under the hood, MonitoredTrainingSession wires together three components:
- a tf.train.Scaffold, which bundles everything needed to start and restore the model: the saver, the init ops, and so on;
- a tf.train.ChiefSessionCreator, which creates the underlying tf.Session and, if the checkpoint directory already contains a checkpoint, restores the model state from it;
- a tf.train.CheckpointSaverHook, which writes checkpoints at the configured interval.
Note that the tf.train.CheckpointSaverHook and the tf.train.ChiefSessionCreator must be given the same checkpoint directory and scaffold. The tf.train.MonitoredTrainingSession above is therefore equivalent to this explicit construction:
checkpoint_dir = './ckpt_dir'

scaffold = tf.train.Scaffold()
saverhook = tf.train.CheckpointSaverHook(
    checkpoint_dir=checkpoint_dir,
    save_secs=5,
    scaffold=scaffold
)
session_creator = tf.train.ChiefSessionCreator(
    scaffold=scaffold,
    checkpoint_dir=checkpoint_dir
)

with tf.train.MonitoredSession(
        session_creator=session_creator,
        hooks=[saverhook]) as mon_sess:
    while not mon_sess.should_stop():
        mon_sess.run(train_op)
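Since the scaffold owns the saver, this explicit form also lets you customize checkpointing, e.g. how many checkpoints to keep. A minimal sketch, continuing the code above (max_to_keep=20 is an arbitrary example value):
# Supply an explicit saver to keep more than the default 5 checkpoints.
# The saver must be constructed after the model variables exist in the graph.
saver = tf.train.Saver(max_to_keep=20)
scaffold = tf.train.Scaffold(saver=saver)
saverhook = tf.train.CheckpointSaverHook(
    checkpoint_dir=checkpoint_dir,
    save_secs=5,
    scaffold=scaffold
)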
Finally, if you also want to track a validation loss, fetch it in the same run call (inside the while loop):
mon_sess.run([train_op, cross_validation_loss])
where cross_validation_loss is a tensor that computes the loss on your validation data. It is evaluated together with train_op in a single step, so tracking it costs no extra graph executions.
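For completeness, a sketch of how such a tensor could be built by reusing the training weights. Here val_inputs and val_labels_onehot are hypothetical validation tensors, and 'fully_connected' is assumed to be the scope name the training layer received (the default for tf.contrib.layers.fully_connected):
val_out = tf.contrib.layers.fully_connected(
    val_inputs,
    num_outputs=10,
    activation_fn=None,
    reuse=True,
    scope='fully_connected')  # share weights with the training layer
cross_validation_loss = tf.reduce_mean(
    tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=val_out,
            labels=val_labels_onehot), axis=1))

# Inside the training loop, both values come back from a single step:
_, val_loss = mon_sess.run([train_op, cross_validation_loss])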