diff --git a/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_filter_transform_base.py b/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_filter_transform_base.py
index 47124db4a..1e13ac57b 100644
--- a/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_filter_transform_base.py
+++ b/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_filter_transform_base.py
@@ -73,17 +73,19 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Table], dict[str, Any]]:
         ids = table.column(self.doc_id_column)
         unique = self._get_unique_ids(ids.to_pylist())
         # Filter out table
-        mask = []
+        mask = [False] * table.num_rows
         clusters = []
         removed = []
         # Actual filtering
         for n in range(table.num_rows):
             doc_id = ids[n].as_py()
+            if not isinstance(doc_id, int):
+                self.logger.error(f"table content is wrong type doc_id {doc_id}, skipping the row")
+                continue
             if doc_id in unique:
-                mask.append(True)
+                mask[n] = True
                 clusters.append(unique.pop(doc_id))
             else:
-                mask.append(False)
                 removed.append(doc_id)
         # build out table
         out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters)
diff --git a/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_preprocessor_transform_base.py b/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_preprocessor_transform_base.py
index ad128c498..6d6409d5d 100644
--- a/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_preprocessor_transform_base.py
+++ b/transforms/universal/fdedup_multi_step/python/src/fdedup/transforms/base/fdedup_preprocessor_transform_base.py
@@ -196,6 +196,9 @@ def _flush_to_cache(limit: int) -> None:
         for n in range(table.num_rows):
             doc = docs[n].as_py()
             doc_id = doc_ids[n].as_py()
+            if not isinstance(doc_id, int) or not isinstance(doc, str):
+                self.logger.error(f"table content is wrong type doc - {doc}, id {doc_id}, skipping the row")
+                continue
             shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter)
             if len(shingles) > 0:
                 mh = self._generate_minhashes(shingles)
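
For context, a minimal, self-contained sketch of the mask pattern the first hunk introduces. Once the type guard can `continue` past a malformed row, an append-based mask would skip an entry and fall out of alignment with the table; pre-allocating the mask at `table.num_rows` keeps skipped rows False so `Table.filter()` drops them. This is not code from the repo: the function name `filter_unique_rows`, the toy `unique` map (doc id -> cluster id), and the use of pyarrow's own `append_column` in place of the repo's `TransformUtils.add_column` are all assumptions for illustration. The second hunk applies the same guard-and-continue pattern to (doc, doc_id) before shingling.

import pyarrow as pa

def filter_unique_rows(table: pa.Table, unique: dict, doc_id_column: str, cluster_column: str) -> pa.Table:
    ids = table.column(doc_id_column)
    # Pre-allocated mask: rows skipped by the type guard stay False and are dropped.
    mask = [False] * table.num_rows
    clusters = []
    removed = []
    for n in range(table.num_rows):
        doc_id = ids[n].as_py()
        if not isinstance(doc_id, int):
            # Malformed row: leave mask[n] False so table.filter() removes it,
            # keeping clusters aligned with the surviving rows.
            continue
        if doc_id in unique:
            mask[n] = True
            clusters.append(unique.pop(doc_id))
        else:
            removed.append(doc_id)
    filtered = table.filter(pa.array(mask))
    # Stand-in for TransformUtils.add_column: one cluster id per surviving row.
    return filtered.append_column(cluster_column, pa.array(clusters))

table = pa.table({"doc_id": [1, 2, 3, 4], "contents": ["a", "b", "c", "d"]})
out = filter_unique_rows(table, unique={1: 7, 3: 9}, doc_id_column="doc_id", cluster_column="cluster")
print(out.to_pydict())
# {'doc_id': [1, 3], 'contents': ['a', 'c'], 'cluster': [7, 9]}

Note that `clusters` only grows for rows where `mask[n]` is set True, so its length always matches the filtered table and `append_column` cannot raise a length mismatch, which is the invariant the pre-allocated mask preserves.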