From aa5f1bd6ce313b720b612cb884fec8dadfe3790c Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Mon, 21 Oct 2024 11:41:47 -0500
Subject: [PATCH] fix(coloctapp): update cleanup_content

Solves: #1215

We are getting duplicate hashes again because some documents contain
multiple hash-altering elements.

Generalize cleanup_content to handle cases with more than one such
element.
---
 .../opinions/united_states/state/coloctapp.py | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/juriscraper/opinions/united_states/state/coloctapp.py b/juriscraper/opinions/united_states/state/coloctapp.py
index 625a2252e..02676ea4d 100644
--- a/juriscraper/opinions/united_states/state/coloctapp.py
+++ b/juriscraper/opinions/united_states/state/coloctapp.py
@@ -9,6 +9,8 @@
 - 2023-11-19: Updated by William E. Palin
 """
 
+import re
+
 from lxml import html
 
 from juriscraper.opinions.united_states.state import colo
@@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
         :return: cleaned up html
         """
         tree = html.fromstring(content)
-        remove_xpaths = ["//style", "//img"]
-        for xpath in remove_xpaths:
-            if tree.xpath(xpath):
-                to_remove = tree.xpath(xpath)[0]
-                to_remove.getparent().remove(to_remove)
-
-        for tag in tree.xpath("//*[@class]"):
-            tag.attrib.pop("class")
+        remove_tags = ["//style", "//img"]
+        remove_attributes = [
+            "//*[@class]",
+            # contains JSON-like data with a "ctm" key
+            "//*[@data-data]",
+            # contains coordinate-like data
+            "//*[@data-dest-detail]",
+        ]
+        for xpath in remove_tags:
+            for element in tree.xpath(xpath):
+                element.getparent().remove(element)
+
+        for xpath in remove_attributes:
+            attrib = re.search(r"[\w-]+", xpath).group(0)
+            for element in tree.xpath(xpath):
+                element.attrib.pop(attrib)
 
         return html.tostring(
             tree, pretty_print=True, encoding="unicode"
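
A note on the regex in the new attribute loop: re.search(r"[\w-]+", xpath)
works because none of the leading characters in these XPath expressions
("//*[@") are word characters or hyphens, so the first match is exactly the
attribute name. A minimal, self-contained sketch of that behavior (the XPath
list mirrors the patch):

    import re

    for xpath in ["//*[@class]", "//*[@data-data]", "//*[@data-dest-detail]"]:
        # The leading "//*[@" contains no word characters or hyphens,
        # so the first match is the attribute name itself.
        attrib = re.search(r"[\w-]+", xpath).group(0)
        print(xpath, "->", attrib)
    # //*[@class] -> class
    # //*[@data-data] -> data-data
    # //*[@data-dest-detail] -> data-dest-detail

This keeps the XPath list as the single source of truth, instead of
maintaining a parallel list of attribute names that could drift out of sync.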
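
To see the effect on deduplication end to end, here is a hedged sketch. The
helper below mirrors the patched cleanup_content, inlined so it runs with only
lxml installed; the sample documents and the sha1 comparison are illustrative
assumptions, not taken from the patch or from juriscraper's actual hashing
pipeline. Two renders of the same opinion that differ only in viewer metadata
hash identically after cleanup:

    import hashlib
    import re

    from lxml import html


    def cleanup(content: str) -> str:
        # Mirrors the patched cleanup_content: drop volatile tags, then
        # strip the hash-altering attribute named in each XPath.
        tree = html.fromstring(content)
        for xpath in ["//style", "//img"]:
            for element in tree.xpath(xpath):
                element.getparent().remove(element)
        for xpath in ["//*[@class]", "//*[@data-data]", "//*[@data-dest-detail]"]:
            attrib = re.search(r"[\w-]+", xpath).group(0)
            for element in tree.xpath(xpath):
                element.attrib.pop(attrib)
        return html.tostring(tree, pretty_print=True, encoding="unicode")


    def digest(markup: str) -> str:
        return hashlib.sha1(markup.encode("utf-8")).hexdigest()


    # Same opinion text, differing only in class names and "ctm" metadata.
    doc_a = """<p class="para1" data-data='{"ctm": [1, 0, 0, 1, 0, 0]}'>Opinion text</p>"""
    doc_b = """<p class="para2" data-data='{"ctm": [2, 0, 0, 2, 5, 5]}'>Opinion text</p>"""

    # Before cleanup the raw documents hash differently; after cleanup the
    # volatile attributes are gone and the hashes collide, as intended.
    assert digest(doc_a) != digest(doc_b)
    assert digest(cleanup(doc_a)) == digest(cleanup(doc_b))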