From fc9335c0203685cbbfe2b30c92db4352d8f60779 Mon Sep 17 00:00:00 2001
From: Lukasz Kaiser
Date: Thu, 5 Apr 2018 10:15:52 -0700
Subject: [PATCH] Add forgotten unicode punctuation normalization to get_ende_bleu.

PiperOrigin-RevId: 191758943
---
 tensor2tensor/utils/get_ende_bleu.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensor2tensor/utils/get_ende_bleu.sh b/tensor2tensor/utils/get_ende_bleu.sh
index 805347231..e48fad36d 100755
--- a/tensor2tensor/utils/get_ende_bleu.sh
+++ b/tensor2tensor/utils/get_ende_bleu.sh
@@ -5,8 +5,11 @@ tok_gold_targets=newstest2013.tok.de
 
 decodes_file=$1
 
+# Replace unicode.
+perl $mosesdecoder/scripts/tokenizer/replace-unicode-punctuation.perl -l de < $decodes_file > $decodes_file.n
+
 # Tokenize.
-perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file > $decodes_file.tok
+perl $mosesdecoder/scripts/tokenizer/tokenizer.perl -l de < $decodes_file.n > $decodes_file.tok
 
 # Put compounds in ATAT format (comparable to papers like GNMT, ConvS2S).
 # See https://nlp.stanford.edu/projects/nmt/ :