Skip to content

Commit

Permalink
regex tool no longer needs to index since column tool exists
Browse files (browse the repository at this point in the history)
  • Loading branch information
nickzoic committed Mar 21, 2024
1 parent de934de commit 1be6a33
Showing 1 changed file with 2 additions and 11 deletions.
13 changes: 2 additions & 11 deletions countess/plugins/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ class RegexToolPlugin(PandasTransformSingleToTuplePlugin):
"Column Type",
"string",
),
"index": BooleanParam("Index?"),
},
),
),
Expand Down Expand Up @@ -68,10 +67,6 @@ def process_dataframe(self, dataframe: pd.DataFrame, logger: Logger) -> Optional
except KeyError:
pass

index_names = [pp.name.value for pp in self.parameters["output"] if pp.index.value]
if index_names:
df = df.set_index(index_names)

return df

def process_value(self, value: str, logger: Logger) -> Optional[Iterable]:
Expand Down Expand Up @@ -137,7 +132,6 @@ class RegexReaderPlugin(PandasInputFilesPlugin):
"Column Type",
"string",
),
"index": BooleanParam("Index?", False),
},
),
),
Expand All @@ -153,9 +147,6 @@ def read_file_to_dataframe(self, file_params, logger, row_limit=None):

output_parameters = list(self.parameters["output"])[: compiled_re.groups]
columns = [p.name.value or f"column_{n+1}" for n, p in enumerate(output_parameters)]
index_columns = [
p.name.value or f"column_{n+1}" for n, p in enumerate(output_parameters) if p.index.value
] or None

records = []
with open(file_params["filename"].value, "r", encoding="utf-8") as fh:
Expand All @@ -171,11 +162,11 @@ def read_file_to_dataframe(self, file_params, logger, row_limit=None):
if len(records) >= row_limit or num > 100 * row_limit:
break
elif len(records) >= 100000:
pdfs.append(pd.DataFrame.from_records(records, columns=columns, index=index_columns))
pdfs.append(pd.DataFrame.from_records(records, columns=columns))
records = []

if len(records) > 0:
pdfs.append(pd.DataFrame.from_records(records, columns=columns, index=index_columns))
pdfs.append(pd.DataFrame.from_records(records, columns=columns))

if len(pdfs) == 0:
return pd.DataFrame([], columns=columns)
Expand Down

0 comments on commit 1be6a33

Please sign in to comment.