From 5a72e5c9654ea584c9a3be8dfdd80713768ad540 Mon Sep 17 00:00:00 2001
From: vthorsteinsson
Date: Fri, 14 Jul 2017 14:22:38 +0000
Subject: [PATCH] Adapted to upstream tokenizer change

---
 tensor2tensor/data_generators/generator_utils.py |  9 ++++-----
 tensor2tensor/data_generators/text_encoder.py    | 10 +++++-----
 tensor2tensor/data_generators/tokenizer.py       |  1 +
 3 files changed, 10 insertions(+), 10 deletions(-)
 mode change 100644 => 100755 tensor2tensor/data_generators/tokenizer.py

diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py
index e3c41f7b7..f076c10da 100755
--- a/tensor2tensor/data_generators/generator_utils.py
+++ b/tensor2tensor/data_generators/generator_utils.py
@@ -300,9 +300,8 @@ def get_or_generate_tabbed_vocab(tmp_dir, source_filename, index, vocab_filename
     vocab = text_encoder.SubwordTextEncoder(vocab_filepath)
     return vocab
 
-  tokenizer = Tokenizer()
-
   # Use Tokenizer to count the word occurrences.
+  token_counts = defaultdict(int)
   filepath = os.path.join(tmp_dir, source_filename)
   with tf.gfile.GFile(filepath, mode="r") as source_file:
     for line in source_file:
@@ -310,11 +309,11 @@ def get_or_generate_tabbed_vocab(tmp_dir, source_filename, index, vocab_filename
       if line and '\t' in line:
         parts = line.split('\t', maxsplit = 1)
         part = parts[index].strip()
-        _ = tokenizer.encode(text_encoder.native_to_unicode(part))
+        for tok in tokenizer.encode(text_encoder.native_to_unicode(part)):
+          token_counts[tok] += 1
 
   vocab = text_encoder.SubwordTextEncoder.build_to_target_size(
-      vocab_size, tokenizer.token_counts, 1,
-      min(1e3, vocab_size + text_encoder.NUM_RESERVED_TOKENS))
+      vocab_size, token_counts, 1, 1e3)
   vocab.store_to_file(vocab_filepath)
   return vocab
 
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index 1c4701c10..38b78256d 100755
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -54,7 +54,7 @@
 PAD_TOKEN = RESERVED_TOKENS.index(PAD)  # Normally 0
 EOS_TOKEN = RESERVED_TOKENS.index(EOS)  # Normally 1
 
-if six.PY2:
+if PY2:
   RESERVED_TOKENS_BYTES = RESERVED_TOKENS
 else:
   RESERVED_TOKENS_BYTES = [bytes(PAD, "ascii"), bytes(EOS, "ascii")]
@@ -110,7 +110,7 @@ class ByteTextEncoder(TextEncoder):
 
   def encode(self, s):
     numres = self._num_reserved_ids
-    if six.PY2:
+    if PY2:
       return [ord(c) + numres for c in s]
     # Python3: explicitly convert to UTF-8
     return [c + numres for c in s.encode("utf-8")]
@@ -124,7 +124,7 @@ def decode(self, ids):
         decoded_ids.append(RESERVED_TOKENS_BYTES[int(id_)])
       else:
         decoded_ids.append(int2byte(id_ - numres))
-    if six.PY2:
+    if PY2:
       return "".join(decoded_ids)
     # Python3: join byte arrays and then decode string
     return b"".join(decoded_ids).decode("utf-8", "replace")
@@ -469,7 +469,7 @@ def store_to_file(self, filename):
         f.write("'" + unicode_to_native(subtoken_string) + "'\n")
 
   def _escape_token(self, token):
-    r"""Escape away underscores and OOV characters and append '_'.
+    """Escape away underscores and OOV characters and append '_'.
 
     This allows the token to be experessed as the concatenation of a list
     of subtokens from the vocabulary. The underscore acts as a sentinel
@@ -491,7 +491,7 @@ def _escape_token(self, token):
     return ret
 
   def _unescape_token(self, escaped_token):
-    r"""Inverse of _escape_token().
+    """Inverse of _escape_token().
 
     Args:
       escaped_token: a unicode string
diff --git a/tensor2tensor/data_generators/tokenizer.py b/tensor2tensor/data_generators/tokenizer.py
old mode 100644
new mode 100755
index df6ef6470..65fe19334
--- a/tensor2tensor/data_generators/tokenizer.py
+++ b/tensor2tensor/data_generators/tokenizer.py
@@ -141,6 +141,7 @@ def read_corpus():
           if corpus_max_lines > 0 and lines_read > corpus_max_lines:
             return docs
     return docs
+  counts = defaultdict(int)
   for doc in read_corpus():
     for tok in encode(_native_to_unicode(doc)):