From 617a7940a78c269527c20512cd5ab871806c6363 Mon Sep 17 00:00:00 2001
From: vthorsteinsson
Date: Fri, 7 Jul 2017 17:24:00 +0000
Subject: [PATCH 1/3] Change mode to executable

---
 tensor2tensor/bin/t2t-datagen | 0
 tensor2tensor/bin/t2t-trainer | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tensor2tensor/bin/t2t-datagen
 mode change 100644 => 100755 tensor2tensor/bin/t2t-trainer

diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen
old mode 100644
new mode 100755
diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer
old mode 100644
new mode 100755

From a2b1c60f7f92eb25ae430e8b82ea1e88e784bb00 Mon Sep 17 00:00:00 2001
From: vthorsteinsson
Date: Fri, 14 Jul 2017 17:32:57 +0000
Subject: [PATCH 2/3] Used regex in _unescape_token()

---
 tensor2tensor/data_generators/text_encoder.py | 48 ++++++++-----------
 .../data_generators/tokenizer_test.py         |  2 +-
 2 files changed, 20 insertions(+), 30 deletions(-)
 mode change 100644 => 100755 tensor2tensor/data_generators/tokenizer_test.py

diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index 38b78256d..c477b1e43 100755
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 
 from collections import defaultdict
+import re
 
 # Dependency imports
 
@@ -212,6 +213,7 @@ class SubwordTextEncoder(TextEncoder):
 
   def __init__(self, filename=None):
     """Initialize and read from a file, if provided."""
+    self._alphabet = set()
     if filename is not None:
       self._load_from_file(filename)
     super(SubwordTextEncoder, self).__init__(num_reserved_ids=None)
@@ -490,6 +492,12 @@ def _escape_token(self, token):
         ret += u"\\%d;" % ord(c)
     return ret
 
+  # Regular expression for unescaping token strings
+  # '\u' is converted to '_'
+  # '\\' is converted to '\'
+  # '\213;' is converted to unichr(213)
+  _UNESCAPE_REGEX = re.compile(u'|'.join([r"\\u", r"\\\\", r"\\([0-9]+);"]))
+
   def _unescape_token(self, escaped_token):
     """Inverse of _escape_token().
 
@@ -498,32 +506,14 @@ def _unescape_token(self, escaped_token):
     Returns:
       token: a unicode string
     """
-    ret = u""
-    escaped_token = escaped_token[:-1]
-    pos = 0
-    while pos < len(escaped_token):
-      c = escaped_token[pos]
-      if c == "\\":
-        pos += 1
-        if pos >= len(escaped_token):
-          break
-        c = escaped_token[pos]
-        if c == u"u":
-          ret += u"_"
-          pos += 1
-        elif c == "\\":
-          ret += u"\\"
-          pos += 1
-        else:
-          semicolon_pos = escaped_token.find(u";", pos)
-          if semicolon_pos == -1:
-            continue
-          try:
-            ret += unichr(int(escaped_token[pos:semicolon_pos]))
-            pos = semicolon_pos + 1
-          except (ValueError, OverflowError) as _:
-            pass
-      else:
-        ret += c
-        pos += 1
-    return ret
+    def match(m):
+      if m.group(1) is not None:
+        # Convert '\213;' to unichr(213)
+        try:
+          return unichr(int(m.group(1)))
+        except (ValueError, OverflowError) as _:
+          return ""
+      # Convert '\u' to '_' and '\\' to '\'
+      return u"_" if m.group(0) == u"\\u" else u"\\"
+    # Cut off the trailing underscore and apply the regex substitution
+    return self._UNESCAPE_REGEX.sub(match, escaped_token[:-1])
diff --git a/tensor2tensor/data_generators/tokenizer_test.py b/tensor2tensor/data_generators/tokenizer_test.py
old mode 100644
new mode 100755
index 404a11396..e90c85031
--- a/tensor2tensor/data_generators/tokenizer_test.py
+++ b/tensor2tensor/data_generators/tokenizer_test.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Copyright 2017 Google Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# coding=utf-8
 """Tests for tensor2tensor.data_generators.tokenizer."""
 
 from __future__ import absolute_import

From e2ed8ed3b55f64c05688cb8852f465131140fa2e Mon Sep 17 00:00:00 2001
From: vthorsteinsson
Date: Mon, 17 Jul 2017 17:04:31 +0000
Subject: [PATCH 3/3] Bug fixes in generator_utils and trainer_utils

---
 tensor2tensor/data_generators/generator_utils.py | 7 ++++---
 tensor2tensor/utils/trainer_utils.py             | 9 ++++++---
 2 files changed, 10 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 tensor2tensor/data_generators/generator_utils.py
 mode change 100644 => 100755 tensor2tensor/utils/trainer_utils.py

diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
old mode 100644
new mode 100755
index 890f92c2a..cacad12fc
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -324,6 +324,7 @@ def get_or_generate_tabbed_vocab(tmp_dir, source_filename,
     return vocab
 
   # Use Tokenizer to count the word occurrences.
+  token_counts = defaultdict(int)
   filepath = os.path.join(tmp_dir, source_filename)
   with tf.gfile.GFile(filepath, mode="r") as source_file:
     for line in source_file:
@@ -331,11 +332,11 @@
       if line and "\t" in line:
         parts = line.split("\t", maxsplit=1)
         part = parts[index].strip()
-        _ = tokenizer.encode(text_encoder.native_to_unicode(part))
+        for tok in tokenizer.encode(text_encoder.native_to_unicode(part)):
+          token_counts[tok] += 1
 
   vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
-      vocab_size, tokenizer.token_counts, 1,
-      min(1e3, vocab_size + text_encoder.NUM_RESERVED_TOKENS))
+      vocab_size, token_counts, 1, 1e3)
   vocab.store_to_file(vocab_filepath)
   return vocab
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py
old mode 100644
new mode 100755
index b5894904d..66a01487c
--- a/tensor2tensor/utils/trainer_utils.py
+++ b/tensor2tensor/utils/trainer_utils.py
@@ -585,6 +585,7 @@ def decode_from_dataset(estimator):
   tf.logging.info("Performing local inference.")
   infer_problems_data = get_datasets_for_mode(hparams.data_dir,
                                               tf.contrib.learn.ModeKeys.INFER)
+
   infer_input_fn = get_input_fn(
       mode=tf.contrib.learn.ModeKeys.INFER,
       hparams=hparams,
@@ -625,9 +626,11 @@ def log_fn(inputs,
 
   # The function predict() returns an iterable over the network's
   # predictions from the test input. We use it to log inputs and decodes.
-  for j, result in enumerate(result_iter):
-    inputs, targets, outputs = (result["inputs"], result["targets"],
-                                result["outputs"])
+  inputs_iter = result_iter["inputs"]
+  targets_iter = result_iter["targets"]
+  outputs_iter = result_iter["outputs"]
+  for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)):
+    inputs, targets, outputs = result
     if FLAGS.decode_return_beams:
       output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0)
      for k, beam in enumerate(output_beams):
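
The regex-based unescaping introduced in PATCH 2/3 can be exercised in isolation.
Below is a minimal standalone sketch, not part of the patch series or of
tensor2tensor: it assumes Python 3 (so chr stands in for unichr), and the
free-standing name unescape_token is illustrative only. It mirrors the escaping
convention documented in the patch: '\u' -> '_', '\\' -> '\', '\NNN;' -> the
character with code point NNN, with a trailing '_' as token terminator.

    import re

    # Same alternation as the patch's _UNESCAPE_REGEX class attribute.
    _UNESCAPE_REGEX = re.compile(u'|'.join([r"\\u", r"\\\\", r"\\([0-9]+);"]))

    def unescape_token(escaped_token):
      # Standalone sketch mirroring the patched method (Python 3: chr, not unichr).
      def match(m):
        if m.group(1) is not None:
          # '\213;' -> chr(213)
          try:
            return chr(int(m.group(1)))
          except (ValueError, OverflowError):
            return ""
        # '\u' -> '_' and '\\' -> '\'
        return u"_" if m.group(0) == u"\\u" else u"\\"
      # Drop the trailing '_' terminator, then substitute every escape in one pass.
      return _UNESCAPE_REGEX.sub(match, escaped_token[:-1])

    # The escaped form of 'ab_ c\d' + chr(213) unescapes back to the original:
    print(unescape_token(u"ab\\u c\\\\d\\213;_"))  # -> 'ab_ c\d' + chr(213)

A single sub() call with a callback replaces the character-by-character scanner,
which is why the patch deletes roughly thirty lines while keeping the same
behaviour for the three escape forms.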