Skip to content

Commit

Permalink
improved error handling
Browse files (browse the repository at this point in the history)
  • Loading branch information
blublinsky committed Sep 18, 2024
1 parent 65f4d70 commit 32c2ad3
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,19 @@ def transform(self, table: pa.Table, file_name: str = None) -> tuple[list[pa.Tab
 ids = table.column(self.doc_id_column)
 unique = self._get_unique_ids(ids.to_pylist())
 # Filter out table
-mask = []
+mask = [False] * table.num_rows
 clusters = []
 removed = []
 # Actual filtering
 for n in range(table.num_rows):
 doc_id = ids[n].as_py()
+if not isinstance(doc_id, int):
+self.logger.error(f"table content is wrong type doc_id {doc_id}, skipping the row")
+continue
 if doc_id in unique:
-mask.append(True)
+mask[n] = True
 clusters.append(unique.pop(doc_id))
 else:
-mask.append(False)
 removed.append(doc_id)
 # build out table
 out_table = TransformUtils.add_column(table=table.filter(mask), name=self.cluster_column, content=clusters)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,9 @@ def _flush_to_cache(limit: int) -> None:
 for n in range(table.num_rows):
 doc = docs[n].as_py()
 doc_id = doc_ids[n].as_py()
+if not isinstance(doc_id, int) or not isinstance(doc, str):
+self.logger.error(f"table content is wrong type doc - {doc}, id {doc_id}, skipping the row")
+continue
 shingles = compute_shingles(txt=doc, word_shingle_size=self.word_shingle_size, delimiter=self.delimiter)
 if len(shingles) > 0:
 mh = self._generate_minhashes(shingles)
Expand Down

0 comments on commit 32c2ad3

Please sign in to comment.