Updating to tf.data does not work properly when parsing CSV

I have a GCMLE experiment and am trying to update my input_fn to use the new tf.data functionality. I created the following input_fn based on this sample:

def input_fn(...):
    dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards) # shuffle up the list of input files
    dataset = dataset.interleave(lambda filename: # mix together records from cycle_length number of shards
                tf.data.TextLineDataset(filename).skip(1).map(lambda row: parse_csv(row, hparams)), cycle_length=5) 
    if shuffle:
      dataset = dataset.shuffle(buffer_size = 10000)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()

    labels = features.pop(LABEL_COLUMN)

    return features, labels

My parse_csv is the same as what I used before, but it currently does not work. I can fix some of the problems, but I don't fully understand why I am having them. Here is the beginning of my parse_csv() function:

def parse_csv(...):
    columns = tf.decode_csv(rows, record_defaults=CSV_COLUMN_DEFAULTS)
    raw_features = dict(zip(FIELDNAMES, columns))

    words = tf.string_split(raw_features['sentences']) # splitting words
    vocab_table = tf.contrib.lookup.index_table_from_file(vocabulary_file = hparams.vocab_file,
                default_value = 0)

....
  • tf.string_split() now throws ValueError: Shape must be rank 1 but is rank 0 for 'csv_preprocessing/input_sequence_generation/StringSplit' (op: 'StringSplit') with input shapes: [], []. If I wrap raw_features['sentences'] as [raw_features['sentences']] the error goes away, but why is that needed now? Does it mean the dataset is feeding my parse function one row at a time rather than a batch? And should I then apply words = tf.squeeze(words, 0), or pad with ""? (A minimal sketch of this workaround appears after this list.)

  • The lookup table no longer works. I get tensorflow.python.framework.errors_impl.FailedPreconditionError: Table not initialized. With my old input_fn() (see below) the table worked fine, so why is it no longer being initialized? Does it matter where I create tf.contrib.lookup.index_table_from_file relative to parse_csv?
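
For reference, here is a minimal sketch of the workaround mentioned in the first point (wrapping the scalar field so that tf.string_split() sees a rank-1 input). I am not sure whether squeezing and padding with "" afterwards is actually the right approach:

# 'sentences' arrives as a rank-0 (scalar) string when rows are mapped one at
# a time, so wrap it in a list to give tf.string_split() the rank-1 input it expects.
words = tf.string_split([raw_features['sentences']])  # SparseTensor of shape [1, ?]

# To get a plain 1-D vector of tokens back, pad with "" and squeeze away the
# leading dimension that the wrapping introduced.
dense_words = tf.sparse_tensor_to_dense(words, default_value='')
dense_words = tf.squeeze(dense_words, axis=0)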

For comparison, here is my old input_fn(), which worked:

def input_fn(...):
    filename_queue = tf.train.string_input_producer(tf.train.match_filenames_once(filenames), 
                num_epochs=num_epochs, shuffle=shuffle, capacity=32)
    reader = tf.TextLineReader(skip_header_lines=skip_header_lines)

    _, rows = reader.read_up_to(filename_queue, num_records=batch_size)

    features = parse_csv(rows, hparams)


    if shuffle:
        features = tf.train.shuffle_batch(
            features,
            batch_size,
            min_after_dequeue=2 * batch_size + 1,
            capacity=batch_size * 10,
            num_threads=multiprocessing.cpu_count(),
            enqueue_many=True,
            allow_smaller_final_batch=True
        )
    else:
        features = tf.train.batch(
            features,
            batch_size,
            capacity=batch_size * 10,
            num_threads=multiprocessing.cpu_count(),
            enqueue_many=True,
            allow_smaller_final_batch=True
        )

    labels = features.pop(LABEL_COLUMN)

    return features, labels

In TF 1.4 (the current version of TF that works with GCMLE), you cannot use make_one_shot_iterator() with lookup tables (see this post). Instead you need to use Dataset.make_initializable_iterator() and add iterator.initializer to the default TABLE_INITIALIZERS collection (from this post). Here is what the input_fn() should look like:

def input_fn(...):
  dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards)

  # Define `vocab_table` outside the map function and use it in `parse_csv()`.
  vocab_table = tf.contrib.lookup.index_table_from_file(
      vocabulary_file=hparams.vocab_file, default_value=0)

  dataset = dataset.interleave(
      lambda filename: (tf.data.TextLineDataset(filename)
                        .skip(1)
                        .map(lambda row: parse_csv(row, hparams),
                             num_parallel_calls=multiprocessing.cpu_count())),
      cycle_length=5) 

  if shuffle:
    dataset = dataset.shuffle(buffer_size=10000)
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  iterator = dataset.make_initializable_iterator()
  features = iterator.get_next()

  # Add iterator.initializer to the collection handled by the default table initializer.
  tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) 

  labels = features.pop(LABEL_COLUMN)

  return features, labels
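
If you are not going through an Estimator (for example, to smoke-test the pipeline in a plain session), here is a rough sketch of how the initialization runs; the arguments to input_fn() are placeholders. tf.tables_initializer() runs every op in the TABLE_INITIALIZERS collection, which after the add_to_collection() call above includes the iterator initializer as well as the vocab table's initializer. (Inside an Estimator, the default Scaffold runs this for you.)

features, labels = input_fn(filenames, hparams, batch_size=64,
                            num_epochs=1, shuffle=False)  # placeholder args

with tf.Session() as sess:
    # Runs the vocab table initializer and, because of the add_to_collection()
    # call in input_fn(), the iterator initializer too.
    sess.run(tf.tables_initializer())
    batch_features, batch_labels = sess.run([features, labels])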
  • When you use tf.data.TextLineDataset, each element is a scalar string. In this respect it is more similar to tf.TextLineReader.read() than to the batch version tf.TextLineReader.read_up_to(), which returns a vector of strings. Unfortunately the tf.string_split() op demands a vector input (this could potentially change in the future), so the shape manipulation is currently necessary.

  • Lookup tables interact a little differently with the functions in tf.data. The intuition is that you should declare the lookup table once outside the Dataset.map() call (so that it is initialized once), and then capture it inside the parse_csv() function to call vocab_table.lookup(). Something like the following should work:

    def input_fn(...):
      dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards)
    
      # Define `vocab_table` outside the map function and use it in `parse_csv()`.
      vocab_table = tf.contrib.lookup.index_table_from_file(
          vocabulary_file=hparams.vocab_file, default_value=0)
    
      def parse_csv(...):
        columns = tf.decode_csv(rows, record_defaults=CSV_COLUMN_DEFAULTS)
        raw_features = dict(zip(FIELDNAMES, columns))
        words = tf.string_split([raw_features['sentences']]) # splitting words
    
        # Use the captured `vocab_table` here.
        word_indices = vocab_table.lookup(words)
    
        # ...    
        features = ...
    
        # NOTE: Structure the output here so that you can simply return
        # the dataset from `input_fn()`.
        labels = features.pop(LABEL_COLUMN)
        return features, labels
    
      # NOTE: Consider using `tf.contrib.data.parallel_interleave()` to perform
      # the reads in parallel.
      dataset = dataset.interleave(
          lambda filename: (tf.data.TextLineDataset(filename)
                            .skip(1)
                            .map(lambda row: parse_csv(row, hparams),
                                 num_parallel_calls=multiprocessing.cpu_count())),
          cycle_length=5) 
    
      if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
      dataset = dataset.repeat(num_epochs)
      dataset = dataset.batch(batch_size)
    
      # NOTE: Add prefetching here to run the input pipeline in the background.
      dataset = dataset.prefetch(1)
    
      # NOTE: This requires TensorFlow 1.5 or later, but this change simplifies the
      # initialization of the lookup table.
      return dataset
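
As a rough usage sketch of the TF 1.5+ behavior referred to in the last note (the model_fn and the arguments below are placeholders, not part of the question): when input_fn() returns a tf.data.Dataset, the Estimator builds the iterator itself and runs the table and iterator initializers as part of its normal setup.

estimator = tf.estimator.Estimator(model_fn=model_fn)  # placeholder model_fn
estimator.train(
    input_fn=lambda: input_fn(filenames, hparams, batch_size=64,
                              num_epochs=1, shuffle=True),
    max_steps=1000)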
    