From aa5f1bd6ce313b720b612cb884fec8dadfe3790c Mon Sep 17 00:00:00 2001
From: Gianfranco Rossi
Date: Mon, 21 Oct 2024 11:41:47 -0500
Subject: [PATCH] fix(coloctapp): update cleanup_content

Solves: #1215

We are getting duplicate hashes again because some documents contain
multiple hash-altering elements.

Generalize cleanup_content to handle cases with more than one such
element.
---
 .../opinions/united_states/state/coloctapp.py | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/juriscraper/opinions/united_states/state/coloctapp.py b/juriscraper/opinions/united_states/state/coloctapp.py
index 625a2252e..02676ea4d 100644
--- a/juriscraper/opinions/united_states/state/coloctapp.py
+++ b/juriscraper/opinions/united_states/state/coloctapp.py
@@ -9,6 +9,8 @@
 - 2023-11-19: Updated by William E. Palin
 """
 
+import re
+
 from lxml import html
 
 from juriscraper.opinions.united_states.state import colo
@@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
         :return: cleaned up html
         """
         tree = html.fromstring(content)
-        remove_xpaths = ["//style", "//img"]
-        for xpath in remove_xpaths:
-            if tree.xpath(xpath):
-                to_remove = tree.xpath(xpath)[0]
-                to_remove.getparent().remove(to_remove)
-
-        for tag in tree.xpath("//*[@class]"):
-            tag.attrib.pop("class")
+        remove_tags = ["//style", "//img"]
+        remove_attributes = [
+            "//*[@class]",
+            # contains JSON-like data with a "ctm" key
+            "//*[@data-data]",
+            # contains coordinate-like data
+            "//*[@data-dest-detail]",
+        ]
+        for xpath in remove_tags:
+            for element in tree.xpath(xpath):
+                element.getparent().remove(element)
+
+        for xpath in remove_attributes:
+            attrib = re.search(r"[\w-]+", xpath).group(0)
+            for element in tree.xpath(xpath):
+                element.attrib.pop(attrib)
 
         return html.tostring(
             tree, pretty_print=True, encoding="unicode"
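
A note on the regex in the new attribute loop: re.search(r"[\w-]+", xpath)
works because none of the leading characters in these XPath expressions
("//*[@") are word characters or hyphens, so the first match is exactly the
attribute name. A minimal, self-contained sketch of that behavior (the XPath
list mirrors the patch):

    import re

    for xpath in ["//*[@class]", "//*[@data-data]", "//*[@data-dest-detail]"]:
        # The leading "//*[@" contains no word characters or hyphens,
        # so the first match is the attribute name itself.
        attrib = re.search(r"[\w-]+", xpath).group(0)
        print(xpath, "->", attrib)
    # //*[@class] -> class
    # //*[@data-data] -> data-data
    # //*[@data-dest-detail] -> data-dest-detail

This keeps the XPath list as the single source of truth, instead of
maintaining a parallel list of attribute names that could drift out of sync.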
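
To see the effect on deduplication end to end, here is a hedged sketch. The
helper below mirrors the patched cleanup_content, inlined so it runs with only
lxml installed; the sample documents and the sha1 comparison are illustrative
assumptions, not taken from the patch or from juriscraper's actual hashing
pipeline. Two renders of the same opinion that differ only in viewer metadata
hash identically after cleanup:

    import hashlib
    import re

    from lxml import html


    def cleanup(content: str) -> str:
        # Mirrors the patched cleanup_content: drop volatile tags, then
        # strip the hash-altering attribute named in each XPath.
        tree = html.fromstring(content)
        for xpath in ["//style", "//img"]:
            for element in tree.xpath(xpath):
                element.getparent().remove(element)
        for xpath in ["//*[@class]", "//*[@data-data]", "//*[@data-dest-detail]"]:
            attrib = re.search(r"[\w-]+", xpath).group(0)
            for element in tree.xpath(xpath):
                element.attrib.pop(attrib)
        return html.tostring(tree, pretty_print=True, encoding="unicode")


    def digest(markup: str) -> str:
        return hashlib.sha1(markup.encode("utf-8")).hexdigest()


    # Same opinion text, differing only in class names and "ctm" metadata.
    doc_a = """<p class="para1" data-data='{"ctm": [1, 0, 0, 1, 0, 0]}'>Opinion text</p>"""
    doc_b = """<p class="para2" data-data='{"ctm": [2, 0, 0, 2, 5, 5]}'>Opinion text</p>"""

    # Before cleanup the raw documents hash differently; after cleanup the
    # volatile attributes are gone and the hashes collide, as intended.
    assert digest(doc_a) != digest(doc_b)
    assert digest(cleanup(doc_a)) == digest(cleanup(doc_b))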