fix(coloctapp): update cleanup_content

Solves: #1215. We are getting duplicate hashes again because some documents contain multiple hash-altering elements. Generalize cleanup_content to handle cases with more than one such element.
freelawproject · Oct 21, 2024 · 55bceb2 · 55bceb2
1 parent 96acad0
commit 55bceb2
Show file tree

Hide file tree

Showing 4 changed files with 26 additions and 8 deletions.
diff --git a/juriscraper/opinions/united_states/federal_appellate/ca7.py b/juriscraper/opinions/united_states/federal_appellate/ca7.py
@@ -16,6 +16,9 @@ def __init__(self, *args, **kwargs):
     def _process_html(self):
         if self.test_mode_enabled():
             self.year = 2022
+        import code
+
+        code.interact(local=dict(locals(), **globals()))
         feed = feedparser.parse(self.request["response"].content)
         for item in feed["entries"]:
             parts = item["summary"].split()

diff --git a/juriscraper/opinions/united_states/state/coloctapp.py b/juriscraper/opinions/united_states/state/coloctapp.py
@@ -9,6 +9,8 @@
     - 2023-11-19: Updated by William E. Palin
 """
 
+import re
+
 from lxml import html
 
 from juriscraper.opinions.united_states.state import colo
@@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
         :return: cleaned up html
         """
         tree = html.fromstring(content)
-        remove_xpaths = ["//style", "//img"]
-        for xpath in remove_xpaths:
-            if tree.xpath(xpath):
-                to_remove = tree.xpath(xpath)[0]
-                to_remove.getparent().remove(to_remove)
-
-        for tag in tree.xpath("//*[@class]"):
-            tag.attrib.pop("class")
+        remove_tags = ["//style", "//img"]
+        remove_attributes = [
+            "//*[@class]",
+            # contains json like data with "ctm" key
+            "//*[@data-data]",
+            # contains coordinate like data
+            "//*[@data-dest-detail]",
+        ]
+        for xpath in remove_tags:
+            for element in tree.xpath(xpath):
+                element.getparent().remove(element)
+
+        for xpath in remove_attributes:
+            attrib = re.search(r"[\w-]+", xpath).group(0)
+            for element in tree.xpath(xpath):
+                element.attrib.pop(attrib)
 
         return html.tostring(
             tree, pretty_print=True, encoding="unicode"

diff --git a/juriscraper/opinions/united_states/state/ohio.py b/juriscraper/opinions/united_states/state/ohio.py
@@ -23,6 +23,10 @@ def __init__(self, *args, **kwargs):
         self.court_index = 0
         self.year = date.today().year
         self.url = "https://www.supremecourtofohio.gov/rod/docs/"
+        self.request["verify"] = False
+        self.request["headers"] = {
+            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
+        }
         self.court_id = self.__module__
 
     def _set_parameters(self) -> None:

diff --git a/tests/local/test_ScraperExampleTest.py b/tests/local/test_ScraperExampleTest.py
@@ -44,6 +44,7 @@ def run_tests_on_module_str(self, module_str: str) -> None:
         cnt = CaseNameTweaker()
         json_compare_extension = ".compare.json"
         json_compare_files_generated = []
+        module_strings = ["juriscraper.opinions.united_states.state.lactapp_5"]
         for module_string in module_strings:
             package, module = module_string.rsplit(".", 1)
             mod = __import__(