Commit

filter_files_by_extension function
Signed-off-by: Sarah Yurick <sarahyurick@gmail.com>
sarahyurick committed Oct 22, 2024
1 parent 4ad1a4d commit 64788e5
Showing 2 changed files with 20 additions and 2 deletions.
5 changes: 3 additions & 2 deletions nemo_curator/datasets/doc_dataset.py

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from typing import List, Optional, Union
 
 import dask.dataframe as dd
@@ -196,8 +197,8 @@ def _read_json_or_parquet(
     file_ext = "." + file_type
 
     if isinstance(input_files, list):
-        # List of jsonl or parquet files
-        if all(f.endswith(file_ext) for f in input_files):
+        # List of files
+        if all(os.path.isfile(f) for f in input_files):
             raw_data = read_data(
                 input_files,
                 file_type=file_type,
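The net effect of this hunk: a list of input paths no longer has to match the expected extension up front, it only has to point at files that exist, and extension filtering is deferred to read_data. A minimal sketch of the old check versus the new one, using hypothetical paths:

import os

input_files = ["data/a.jsonl", "data/b.parquet"]
file_ext = ".jsonl"

# Old check: every path must end with the expected extension.
all(f.endswith(file_ext) for f in input_files)   # False: b.parquet fails

# New check: every path must simply exist on disk; files with other
# extensions are dropped (with a warning) later, inside read_data.
all(os.path.isfile(f) for f in input_files)      # True if both files exist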
17 changes: 17 additions & 0 deletions nemo_curator/utils/distributed_utils.py

@@ -356,6 +356,16 @@ def read_pandas_pickle(file, add_filename=False) -> pd.DataFrame:
     return pd.read_pickle(file)
 
 
+def filter_files_by_extension(files_list, file_ext):
+    filtered_files = []
+    for file in files_list:
+        if file.endswith(file_ext):
+            filtered_files.append(file)
+        else:
+            warnings.warn(f"Skipping read for file: {file}")
+    return filtered_files
+
+
 def read_data(
     input_files,
     file_type: str = "pickle",
@@ -391,15 +401,20 @@ def read_data(
         df = df.to_backend("cudf")
 
     elif file_type in ["json", "jsonl", "parquet"]:
+        file_ext = "." + file_type
+        input_files = filter_files_by_extension(input_files, file_ext)
         print(f"Reading {len(input_files)} files", flush=True)
         input_files = sorted(input_files)
+
         if files_per_partition > 1:
             input_files = [
                 input_files[i : i + files_per_partition]
                 for i in range(0, len(input_files), files_per_partition)
             ]
+
         else:
             input_files = [[file] for file in input_files]
+
         return dd.from_map(
             read_single_partition,
             input_files,
@@ -409,8 +424,10 @@ def read_data(
             input_meta=input_meta,
             enforce_metadata=False,
         )
+
     else:
         raise RuntimeError("Could not read data, please check file type")
+
     return df
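For reference, a standalone sketch of the new helper, with the function body copied from the diff above. The file names are hypothetical, and distributed_utils.py is assumed to already import warnings, since the new code calls warnings.warn:

import warnings

def filter_files_by_extension(files_list, file_ext):
    # Keep files whose names end with the expected extension;
    # warn about, and drop, everything else.
    filtered_files = []
    for file in files_list:
        if file.endswith(file_ext):
            filtered_files.append(file)
        else:
            warnings.warn(f"Skipping read for file: {file}")
    return filtered_files

files = ["a.jsonl", "b.parquet", "c.jsonl"]
print(filter_files_by_extension(files, ".jsonl"))
# UserWarning: Skipping read for file: b.parquet  (emitted on stderr)
# ['a.jsonl', 'c.jsonl']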

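The partitioning step that follows the new filter call is unchanged: it slices the sorted file list into chunks of files_per_partition, and dd.from_map then builds one Dask partition per chunk. A small illustration of that comprehension with hypothetical file names:

input_files = ["a.jsonl", "b.jsonl", "c.jsonl", "d.jsonl", "e.jsonl"]
files_per_partition = 2

chunks = [
    input_files[i : i + files_per_partition]
    for i in range(0, len(input_files), files_per_partition)
]
print(chunks)
# [['a.jsonl', 'b.jsonl'], ['c.jsonl', 'd.jsonl'], ['e.jsonl']]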