From 72c444b8b05402f1a51ce04da66a3b29945c437b Mon Sep 17 00:00:00 2001 From: Sarah Yurick Date: Wed, 23 Oct 2024 12:22:59 -0700 Subject: [PATCH] run black Signed-off-by: Sarah Yurick --- nemo_curator/utils/text_utils.py | 1 + tests/test_filters.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/nemo_curator/utils/text_utils.py b/nemo_curator/utils/text_utils.py index c8100bc6..069849f0 100644 --- a/nemo_curator/utils/text_utils.py +++ b/nemo_curator/utils/text_utils.py @@ -45,6 +45,7 @@ def mecab_splitter(text): return mecab_splitter else: + def default_splitter(text): return text.split() diff --git a/tests/test_filters.py b/tests/test_filters.py index c46e6734..66bc7dc0 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -431,9 +431,7 @@ def test_wordcount(self): ), f"Expected {expected_data} but got {filtered_data}" def test_wordcount_zh(self): - dataset = list_to_dataset( - ["", "你好。", "我喜欢学习中文。"] - ) + dataset = list_to_dataset(["", "你好。", "我喜欢学习中文。"]) filters = ScoreFilter(WordCountFilter(min_words=2, max_words=5, lang="zh")) filtered_data = filters(dataset)