Skip to content

Commit

Permalink
fix(coloctapp): update cleanup_content
Browse files Browse the repository at this point in the history
Solves:
#1215

We are getting duplicate hashes again because some documents
contain multiple hash-altering elements. Generalize cleanup_content to handle cases with more than one such element.
  • Loading branch information
grossir committed Oct 21, 2024
1 parent 96acad0 commit 55bceb2
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 8 deletions.
3 changes: 3 additions & 0 deletions juriscraper/opinions/united_states/federal_appellate/ca7.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ def __init__(self, *args, **kwargs):
def _process_html(self):
if self.test_mode_enabled():
self.year = 2022
import code

code.interact(local=dict(locals(), **globals()))
feed = feedparser.parse(self.request["response"].content)
for item in feed["entries"]:
parts = item["summary"].split()
Expand Down
26 changes: 18 additions & 8 deletions juriscraper/opinions/united_states/state/coloctapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
- 2023-11-19: Updated by William E. Palin
"""

import re

from lxml import html

from juriscraper.opinions.united_states.state import colo
Expand All @@ -35,14 +37,22 @@ def cleanup_content(content: str) -> str:
:return: cleaned up html
"""
tree = html.fromstring(content)
remove_xpaths = ["//style", "//img"]
for xpath in remove_xpaths:
if tree.xpath(xpath):
to_remove = tree.xpath(xpath)[0]
to_remove.getparent().remove(to_remove)

for tag in tree.xpath("//*[@class]"):
tag.attrib.pop("class")
remove_tags = ["//style", "//img"]
remove_attributes = [
"//*[@class]",
# contains JSON-like data with a "ctm" key
"//*[@data-data]",
# contains coordinate-like data
"//*[@data-dest-detail]",
]
for xpath in remove_tags:
for element in tree.xpath(xpath):
element.getparent().remove(element)

for xpath in remove_attributes:
attrib = re.search(r"[\w-]+", xpath).group(0)
for element in tree.xpath(xpath):
element.attrib.pop(attrib)

return html.tostring(
tree, pretty_print=True, encoding="unicode"
Expand Down
4 changes: 4 additions & 0 deletions juriscraper/opinions/united_states/state/ohio.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ def __init__(self, *args, **kwargs):
self.court_index = 0
self.year = date.today().year
self.url = "https://www.supremecourtofohio.gov/rod/docs/"
self.request["verify"] = False
self.request["headers"] = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36"
}
self.court_id = self.__module__

def _set_parameters(self) -> None:
Expand Down
1 change: 1 addition & 0 deletions tests/local/test_ScraperExampleTest.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def run_tests_on_module_str(self, module_str: str) -> None:
cnt = CaseNameTweaker()
json_compare_extension = ".compare.json"
json_compare_files_generated = []
module_strings = ["juriscraper.opinions.united_states.state.lactapp_5"]
for module_string in module_strings:
package, module = module_string.rsplit(".", 1)
mod = __import__(
Expand Down

0 comments on commit 55bceb2

Please sign in to comment.