Skip to content

Commit

Permalink
Merge pull request #1216 from freelawproject/clean_coloctapp_content
Browse files: browse the repository at this point in the history
fix(coloctapp): update cleanup_content
  • Loading branch information
flooie authored Oct 21, 2024
2 parents 9fafceb + 58b43d3 commit 1d47cc6
Showing 1 changed file with 18 additions and 8 deletions.
26 changes: 18 additions & 8 deletions juriscraper/opinions/united_states/state/coloctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
- 2023-11-19: Updated by William E. Palin
"""

import re

from lxml import html

from juriscraper.opinions.united_states.state import colo
Expand All @@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
:return: cleaned up html
"""
tree = html.fromstring(content)
remove_xpaths = ["//style", "//img"]
for xpath in remove_xpaths:
if tree.xpath(xpath):
to_remove = tree.xpath(xpath)[0]
to_remove.getparent().remove(to_remove)

for tag in tree.xpath("//*[@class]"):
tag.attrib.pop("class")
remove_tags = ["//style", "//img"]
remove_attributes = [
"//*[@class]",
# contains JSON-like data with a "ctm" key
"//*[@data-data]",
# contains coordinate-like data
"//*[@data-dest-detail]",
]
for xpath in remove_tags:
for element in tree.xpath(xpath):
element.getparent().remove(element)

for xpath in remove_attributes:
attrib = re.search(r"[\w-]+", xpath).group(0)
for element in tree.xpath(xpath):
element.attrib.pop(attrib)

return html.tostring(
tree, pretty_print=True, encoding="unicode"
Expand Down

0 comments on commit 1d47cc6

Please sign in to comment.