fix(or): update scraper to solve IndexError
Solves #1200

- Implement backscraper
- Collect citations
- Collect dispositions
- Update example files
- Collect unpublished opinions for orctapp
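A minimal backscrape driver sketch, assuming the usual juriscraper convention in which make_backscrape_iterable() populates site.back_scrape_iterable with (start, end) date tuples spanning first_opinion_date to today in days_interval steps; the exact keyword handling lives outside this diff:

from importlib import import_module

# `or` is a Python keyword, so the scraper module is loaded dynamically
oregon_module = import_module("juriscraper.opinions.united_states.state.or")

site = oregon_module.Site()
for date_range in site.back_scrape_iterable:
    # each item is a (start, end) tuple consumed by _download_backwards
    site._download_backwards(date_range)
    for case in site.cases:
        print(case["date"], case["docket"], case["name"])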
grossir committed Oct 10, 2024
1 parent 865a726 commit 8e16ded
Showing 8 changed files with 19,351 additions and 6,059 deletions.
151 changes: 147 additions & 4 deletions juriscraper/opinions/united_states/state/or.py
@@ -4,15 +4,158 @@
- 2023-11-18: Fixed and updated
"""

from juriscraper.opinions.united_states.state import orctapp
from datetime import datetime, timedelta

from juriscraper.AbstractSite import logger
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    court_code = "p17027coll3"
    detail_url = "https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{}/identi^{}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
    download_url = "https://ojd.contentdm.oclc.org/digital/api/collection/{}/id/{}/download"
    days_interval = 720
    # Earliest opinion as of development in Oct 2024
    first_opinion_date = datetime(2023, 4, 1)

class Site(orctapp.Site):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.url = (
            "https://www.courts.oregon.gov/publications/sc/Pages/default.aspx"
        )
        self.status = "Published"
        self.court_code = "p17027coll3"
        self.make_backscrape_iterable(kwargs)

        # By default, scrape at most 10 days into the past.
        # Limiting regular scrapes matters here: this scraper makes
        # secondary requests, and the site loads all opinions going
        # back a year, which would generate many hits to the server
        # each time the hourly scraper is triggered.
        # These limits are overridden when backscraping.
        self.start_date = (datetime.today() - timedelta(10)).date()
        self.end_date = (datetime.today() + timedelta(1)).date()

    def _process_html(self):
        for date_header in self.html.xpath(
            "//h4[a[contains(@href, '/dated/')]]"
        ):
            date_string = date_header.text_content().strip()
            if not date_string:
                logger.info("Skipping section with no date string")
                continue

            date = datetime.strptime(date_string, "%m/%d/%Y").date()
            if date > self.end_date:
                # Opinions come in descending date order
                continue
            if date < self.start_date and not self.test_mode_enabled():
                logger.info(
                    "Date %s is out of range [%s to %s]",
                    date,
                    self.start_date,
                    self.end_date,
                )
                break

            self.process_a_date(date_header)

    def process_a_date(self, date_header) -> None:
        """Process a section defined by a date header
        :param date_header: the lxml element containing the date
        :return: None
        """
        date_string = date_header.text_content().strip()

        # orctapp has h5 tags which describe the status of the
        # opinions in the next ul
        for sibling in date_header.xpath("following-sibling::*"):
            if sibling.tag not in ["ul", "h5"]:
                # Time to jump to another date
                break

            if "orctapp" in self.court_id:
                if sibling.tag == "h5":
                    status = sibling.text_content().strip()
                    if status == "Precedential Opinions":
                        status = "Published"
                    elif status == "Nonprecedential Memorandum Opinions":
                        status = "Unpublished"
                    else:
                        status = "Unknown"
            else:
                status = "Published"

            for item in sibling.xpath("li"):
                # Ensure two links are present (skip Petitions
                # for Review rows)
                text = item.text_content().strip()
                anchors = item.xpath(".//a")
                if len(anchors) < 2:
                    logger.info("Skipping row without 2 links. Row: %s", text)
                    continue

                detail_url = anchors[0].xpath("./@href")[0]
                download_url, disposition = self.get_details(detail_url)
                if not download_url:
                    # Usually happens for
                    # "Miscellaneous Supreme Court Dispositions"
                    logger.info("No records for detail JSON")
                    continue

                name = text.split(")", 1)[-1]
                # Clean up names like:
                # "Knopp v. Griffin-Valade (Certified appeal accepted)"
                if "(" in name:
                    name, disposition = name.split("(", 1)
                    disposition = disposition.strip(")")

                self.cases.append(
                    {
                        "date": date_string,
                        "name": name,
                        "docket": anchors[1].text_content().strip(),
                        "url": download_url,
                        "citation": item.xpath("b/text()")[0].strip(),
                        "status": status,
                        "disposition": disposition,
                    }
                )

    def get_details(self, detail_url: str) -> tuple[str, str]:
        """Makes a request to get a case's details, including the download URL
        :param detail_url: the case detail page's URL
        :return: a tuple: (the PDF download URL, the disposition)
        """
        if self.test_mode_enabled():
            return "placeholder url", "placeholder disposition"

        identifier = detail_url.split("=")[-1]
        detail_url = self.detail_url.format(self.court_code, identifier)

        logger.info("Getting detail JSON from %s", detail_url)
        json = self.request["session"].get(detail_url).json()
        logger.debug(json)
        if not json.get("records"):
            return "", ""

        disposition = json["records"][0].get("descri") or ""
        download_url = self.download_url.format(
            self.court_code, json["records"][0]["pointer"]
        )
        return download_url, disposition

    def _download_backwards(self, dates: tuple) -> None:
        """The site loads the last couple of years of data by default,
        so it's not necessary to query the page in a special way to
        target data in those years; we only need to set the proper
        date limits. To backscrape older opinions, we would need to
        target another site.
        """
        self.start_date, self.end_date = dates
        logger.info("Backscraping for range %s %s", *dates)
        self.html = self._download()
        self._process_html()
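For reference, a sketch of the dmwebservices detail JSON that get_details() consumes; only the two fields the scraper reads are shown, and the values are invented placeholders inferred from the parsing code above:

detail_json = {
    "records": [
        {
            "pointer": 12345,      # record id, interpolated into download_url
            "descri": "Affirmed",  # disposition; may be absent or empty
        }
    ]
}

# Mirrors the guard and fallback in get_details(); the old code
# indexed records[0] unconditionally, the source of the IndexError
records = detail_json.get("records")
if records:
    disposition = records[0].get("descri") or ""
    pointer = records[0]["pointer"]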
59 changes: 7 additions & 52 deletions juriscraper/opinions/united_states/state/orctapp.py
@@ -7,62 +7,17 @@
- 2023-11-18: Created
"""

from juriscraper.DeferringList import DeferringList
from juriscraper.OpinionSiteLinear import OpinionSiteLinear
from importlib import import_module

# `or` is a Python reserved keyword; can't import the module as usual
oregon_module = import_module("juriscraper.opinions.united_states.state.or")


class Site(oregon_module.Site):
    court_code = "p17027coll5"

class Site(OpinionSiteLinear):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.url = (
            "https://www.courts.oregon.gov/publications/coa/Pages/default.aspx"
        )
        self.cases = []
        self.status = "Published"
        self.court_code = "p17027coll5"

    def fetch_url_json(self, identifier):
        """Fetch the download URL for an opinion from the collection's JSON API"""
        url = f"https://ojd.contentdm.oclc.org/digital/bl/dmwebservices/index.php?q=dmQuery/{self.court_code}/identi^{identifier}^all^and/title!subjec!descri!dmrecord/title/1024/1/0/0/0/0/json"
        json = self.request["session"].get(url).json()
        return f"https://ojd.contentdm.oclc.org/digital/api/collection/{self.court_code}/id/{json['records'][0]['pointer']}/download"

    def _process_html(self):
        for header in self.html.xpath("//h4//a/parent::h4"):
            date_string = header.text_content().strip()
            if not date_string:
                continue
            ul = header.xpath("./following-sibling::ul")[0]
            for item in ul.xpath(".//li"):
                # Ensure two links are present (skip Petitions for Review rows)
                # see or_example_2.html
                anchors = item.xpath(".//a")
                if len(anchors) < 2:
                    continue
                text = item.text_content().strip()
                url = anchors[0].xpath("./@href")[0]
                docket = anchors[1].text_content().strip()
                name = text.split(")", 1)[-1]
                self.cases.append(
                    {
                        "date": date_string,
                        "name": name,
                        "docket": docket,
                        "url": url,
                    }
                )

    def _get_download_urls(self):
        """Get download urls
        :return: List URLs
        """

        def fetcher(case):
            if self.test_mode_enabled():
                return case["url"]

            return self.fetch_url_json(case["url"].split("=")[-1][:-4])

        return DeferringList(seed=self.cases, fetcher=fetcher)
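The dynamic import that orctapp.py now relies on exists because `or` is a Python keyword: a literal import of the module is a syntax error, while importlib resolves the dotted path from a string. A standalone sketch of the workaround:

from importlib import import_module

# import juriscraper.opinions.united_states.state.or  # SyntaxError
oregon_module = import_module("juriscraper.opinions.united_states.state.or")

# The module object works normally once loaded; orctapp subclasses its Site
assert hasattr(oregon_module, "Site")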