Updating to tf.data does not work properly when parsing CSV

I have a GCMLE experiment and am trying to update my input_fn to use the new tf.data functionality. I created the following input_fn based on this sample:

def input_fn(...):
    dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards) # shuffle up the list of input files
    dataset = dataset.interleave(lambda filename: # mix together records from cycle_length number of shards
                tf.data.TextLineDataset(filename).skip(1).map(lambda row: parse_csv(row, hparams)), cycle_length=5) 
    if shuffle:
      dataset = dataset.shuffle(buffer_size = 10000)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    iterator = dataset.make_one_shot_iterator()
    features = iterator.get_next()

    labels = features.pop(LABEL_COLUMN)

    return features, labels

My parse_csv is the same as what I used before, but it currently does not work. I can fix some of the problems, but I don't fully understand why I am having them. Here is the beginning of my parse_csv() function:

def parse_csv(...):
    columns = tf.decode_csv(rows, record_defaults=CSV_COLUMN_DEFAULTS)
    raw_features = dict(zip(FIELDNAMES, columns))

    words = tf.string_split(raw_features['sentences']) # splitting words
    vocab_table = tf.contrib.lookup.index_table_from_file(vocabulary_file = hparams.vocab_file,
                default_value = 0)

....
  • tf.string_split() now throws ValueError: Shape must be rank 1 but is rank 0 for 'csv_preprocessing/input_sequence_generation/StringSplit' (op: 'StringSplit') with input shapes: [], []. If I wrap raw_features['sentences'] as [raw_features['sentences']] the error goes away, but why is that needed now? Does it mean the dataset is feeding my parse function one row at a time rather than a batch? And should I then apply words = tf.squeeze(words, 0), or pad with ""? (A minimal sketch of this workaround appears after this list.)

  • The lookup table no longer works. I get tensorflow.python.framework.errors_impl.FailedPreconditionError: Table not initialized. With my old input_fn() (see below) the table worked fine, so why is it no longer being initialized? Does it matter where I create tf.contrib.lookup.index_table_from_file relative to parse_csv?
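
For reference, here is a minimal sketch of the workaround mentioned in the first point (wrapping the scalar field so that tf.string_split() sees a rank-1 input). I am not sure whether squeezing and padding with "" afterwards is actually the right approach:

# 'sentences' arrives as a rank-0 (scalar) string when rows are mapped one at
# a time, so wrap it in a list to give tf.string_split() the rank-1 input it expects.
words = tf.string_split([raw_features['sentences']])  # SparseTensor of shape [1, ?]

# To get a plain 1-D vector of tokens back, pad with "" and squeeze away the
# leading dimension that the wrapping introduced.
dense_words = tf.sparse_tensor_to_dense(words, default_value='')
dense_words = tf.squeeze(dense_words, axis=0)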

For comparison, here is my old input_fn(), which worked:

def input_fn(...):
    filename_queue = tf.train.string_input_producer(tf.train.match_filenames_once(filenames), 
                num_epochs=num_epochs, shuffle=shuffle, capacity=32)
    reader = tf.TextLineReader(skip_header_lines=skip_header_lines)

    _, rows = reader.read_up_to(filename_queue, num_records=batch_size)

    features = parse_csv(rows, hparams)


    if shuffle:
        features = tf.train.shuffle_batch(
            features,
            batch_size,
            min_after_dequeue=2 * batch_size + 1,
            capacity=batch_size * 10,
            num_threads=multiprocessing.cpu_count(),
            enqueue_many=True,
            allow_smaller_final_batch=True
        )
    else:
        features = tf.train.batch(
            features,
            batch_size,
            capacity=batch_size * 10,
            num_threads=multiprocessing.cpu_count(),
            enqueue_many=True,
            allow_smaller_final_batch=True
        )

    labels = features.pop(LABEL_COLUMN)

    return features, labels

In TF 1.4 (the current version of TF that works with GCMLE), you cannot use make_one_shot_iterator() with lookup tables (see this post). Instead you need to use Dataset.make_initializable_iterator() and add iterator.initializer to the default TABLE_INITIALIZERS collection (from this post). Here is what the input_fn() should look like:

def input_fn(...):
  dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards)

  # Define `vocab_table` outside the map function and use it in `parse_csv()`.
  vocab_table = tf.contrib.lookup.index_table_from_file(
      vocabulary_file=hparams.vocab_file, default_value=0)

  dataset = dataset.interleave(
      lambda filename: (tf.data.TextLineDataset(filename)
                        .skip(1)
                        .map(lambda row: parse_csv(row, hparams),
                             num_parallel_calls=multiprocessing.cpu_count())),
      cycle_length=5) 

  if shuffle:
    dataset = dataset.shuffle(buffer_size=10000)
  dataset = dataset.repeat(num_epochs)
  dataset = dataset.batch(batch_size)
  iterator = dataset.make_initializable_iterator()
  features = iterator.get_next()

  # Add iterator.initializer to the collection handled by the default table initializer.
  tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, iterator.initializer) 

  labels = features.pop(LABEL_COLUMN)

  return features, labels
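
If you are not going through an Estimator (for example, to smoke-test the pipeline in a plain session), here is a rough sketch of how the initialization runs; the arguments to input_fn() are placeholders. tf.tables_initializer() runs every op in the TABLE_INITIALIZERS collection, which after the add_to_collection() call above includes the iterator initializer as well as the vocab table's initializer. (Inside an Estimator, the default Scaffold runs this for you.)

features, labels = input_fn(filenames, hparams, batch_size=64,
                            num_epochs=1, shuffle=False)  # placeholder args

with tf.Session() as sess:
    # Runs the vocab table initializer and, because of the add_to_collection()
    # call in input_fn(), the iterator initializer too.
    sess.run(tf.tables_initializer())
    batch_features, batch_labels = sess.run([features, labels])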
  • When you use tf.data.TextLineDataset, each element is a scalar string. In this respect it is more similar to tf.TextLineReader.read() than to the batch version tf.TextLineReader.read_up_to(), which returns a vector of strings. Unfortunately the tf.string_split() op demands a vector input (this could potentially change in the future), so the shape manipulation is currently necessary.

  • Lookup tables interact a little differently with the functions in tf.data. The intuition is that you should declare the lookup table once outside the Dataset.map() call (so that it is initialized once), and then capture it inside the parse_csv() function to call vocab_table.lookup(). Something like the following should work:

    def input_fn(...):
      dataset = tf.data.Dataset.list_files(filenames).shuffle(num_shards)
    
      # Define `vocab_table` outside the map function and use it in `parse_csv()`.
      vocab_table = tf.contrib.lookup.index_table_from_file(
          vocabulary_file=hparams.vocab_file, default_value=0)
    
      def parse_csv(...):
        columns = tf.decode_csv(rows, record_defaults=CSV_COLUMN_DEFAULTS)
        raw_features = dict(zip(FIELDNAMES, columns))
        words = tf.string_split([raw_features['sentences']]) # splitting words
    
        # Use the captured `vocab_table` here.
        word_indices = vocab_table.lookup(words)
    
        # ...    
        features = ...
    
        # NOTE: Structure the output here so that you can simply return
        # the dataset from `input_fn()`.
        labels = features.pop(LABEL_COLUMN)
        return features, labels
    
      # NOTE: Consider using `tf.contrib.data.parallel_interleave()` to perform
      # the reads in parallel.
      dataset = dataset.interleave(
          lambda filename: (tf.data.TextLineDataset(filename)
                            .skip(1)
                            .map(lambda row: parse_csv(row, hparams),
                                 num_parallel_calls=multiprocessing.cpu_count())),
          cycle_length=5) 
    
      if shuffle:
        dataset = dataset.shuffle(buffer_size=10000)
      dataset = dataset.repeat(num_epochs)
      dataset = dataset.batch(batch_size)
    
      # NOTE: Add prefetching here to run the input pipeline in the background.
      dataset = dataset.prefetch(1)
    
      # NOTE: This requires TensorFlow 1.5 or later, but this change simplifies the
      # initialization of the lookup table.
      return dataset
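
As a rough usage sketch of the TF 1.5+ behavior referred to in the last note (the model_fn and the arguments below are placeholders, not part of the question): when input_fn() returns a tf.data.Dataset, the Estimator builds the iterator itself and runs the table and iterator initializers as part of its normal setup.

estimator = tf.estimator.Estimator(model_fn=model_fn)  # placeholder model_fn
estimator.train(
    input_fn=lambda: input_fn(filenames, hparams, batch_size=64,
                              num_epochs=1, shuffle=True),
    max_steps=1000)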
    