I have a GCMLE experiment and am trying to update my own input_fn to use the new tf.data functionality. I created the following input_fn based on this sample:
def input_fn(...):
    dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards)  # shuffle up the list of input files
    dataset = dataset.interleave(  # mix together records from cycle_length number of shards
        lambda filename: tf.data.TextLineDataset(filename).skip(1).map(lambda row: parse_csv(row, hparams)),
        cycle_length=5)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()
    labels = features.pop(LABEL_COLUMN)
    return features, labels
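To check that I understand the list_files/interleave pattern on its own, here is a standalone toy version of the same pipeline (the shard files, the 'sentences'/'label' columns, and toy_parse are placeholders for illustration only, not my real code):

import tensorflow as tf

# Write two tiny CSV shards with a header row, mirroring the layout of my real files.
for i in range(2):
    with open('shard-%d.csv' % i, 'w') as f:
        f.write('sentences,label\n')
        f.write('a%d,1\nb%d,2\n' % (i, i))

def toy_parse(row):
    # Stand-in for my real parse_csv(): decode one CSV line into a feature dict.
    cols = tf.decode_csv(row, record_defaults=[[''], [0]])
    return {'sentences': cols[0], 'label': cols[1]}

dataset = tf.data.Dataset.list_files('shard-*.csv').shuffle(2)
dataset = dataset.interleave(
    lambda fn: tf.data.TextLineDataset(fn).skip(1).map(toy_parse),
    cycle_length=2)
dataset = dataset.batch(2)
features = dataset.make_one_shot_iterator().get_next()

with tf.Session() as sess:
    print(sess.run(features))  # e.g. {'sentences': array([b'a0', b'b0']), 'label': array([1, 2])}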
My parse_csv is the same as what I used before, but it currently does not work. I can fix some of the problems, but I don't fully understand why they are happening. Here is the beginning of my parse_csv() function:
def parse_csv(..):
    columns = tf.decode_csv(rows, record_defaults=CSV_COLUMN_DEFAULTS)
    raw_features = dict(zip(FIELDNAMES, columns))
    words = tf.string_split(raw_features['sentences'])  # splitting words
    vocab_table = tf.contrib.lookup.index_table_from_file(vocabulary_file=hparams.vocab_file,
                                                          default_value=0)
    ....
First question: tf.string_split() throws ValueError: Shape must be rank 1 but is rank 0 for 'csv_preprocessing/input_sequence_generation/StringSplit' (op: 'StringSplit') with input shapes: [], []. If I wrap the input as [raw_features['sentences']] the error goes away, but is that the right approach when working with a dataset, or is there a better way? With the wrapping I then also need words = tf.squeeze(words, 0) to remove the extra dimension it introduces.
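To make the first problem concrete, here is a standalone snippet of what I mean by the wrap-then-squeeze workaround (illustrative only; tf.sparse_tensor_to_dense is added just so the values can be printed, it is not my exact parse_csv code):

import tensorflow as tf

sentence = tf.constant('the quick brown fox')  # rank-0 string, like one row from TextLineDataset
# tf.string_split(sentence) fails: "Shape must be rank 1 but is rank 0"
words = tf.string_split([sentence])            # wrapping gives a rank-1 input -> SparseTensor with dense_shape [1, 4]
dense = tf.sparse_tensor_to_dense(words, default_value='')
dense = tf.squeeze(dense, 0)                   # drop the leading 1 that the wrapping introduced

with tf.Session() as sess:
    print(sess.run(dense))                     # [b'the' b'quick' b'brown' b'fox']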
Second question: I am also getting tensorflow.python.framework.errors_impl.FailedPreconditionError: Table not initialized. With my old input_fn() (see below) this was not a problem, so why does it show up now? Does it have something to do with where tf.contrib.lookup.index_table_from_file is called inside parse_csv?
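Here is a minimal standalone repro of the second error as I understand it (a hypothetical vocab.txt and a plain tf.Session, not my Estimator setup):

import tensorflow as tf

# Hypothetical tiny vocabulary file, only for this repro.
with open('vocab.txt', 'w') as f:
    f.write('the\nquick\nbrown\nfox\n')

table = tf.contrib.lookup.index_table_from_file(vocabulary_file='vocab.txt', default_value=0)
ids = table.lookup(tf.constant(['quick', 'fox', 'unknown_word']))

with tf.Session() as sess:
    sess.run(tf.tables_initializer())  # removing this line reproduces "Table not initialized"
    print(sess.run(ids))               # [1 3 0]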
For reference, here is my old input_fn():
def input_fn(...):
    filename_queue = tf.train.string_input_producer(
        tf.train.match_filenames_once(filenames),
        num_epochs=num_epochs, shuffle=shuffle, capacity=32)
    reader = tf.TextLineReader(skip_header_lines=skip_header_lines)
    _, rows = reader.read_up_to(filename_queue, num_records=batch_size)
    features = parse_csv(rows, hparams)
    if shuffle:
        features = tf.train.shuffle_batch(
            features,
            batch_size,
            min_after_dequeue=2 * batch_size + 1,
            capacity=batch_size * 10,
            num_threads=multiprocessing.cpu_count(),
            enqueue_many=True,
            allow_smaller_final_batch=True
        )
    else:
        features = tf.train.batch(
            features,
            batch_size,
            capacity=batch_size * 10,
            num_threads=multiprocessing.cpu_count(),
            enqueue_many=True,
            allow_smaller_final_batch=True
        )
    labels = features.pop(LABEL_COLUMN)
    return features, labels