Fixing Issue #728 #1131

Open · wants to merge 6 commits into main
126 changes: 126 additions & 0 deletions city_scrapers/spiders/chi_northeastern_il_university.py
@@ -0,0 +1,126 @@
import re
from datetime import datetime
from io import BytesIO

import requests
from city_scrapers_core.constants import BOARD, COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from pdfminer.high_level import extract_text


class ChiNortheasternIlUniversitySpider(CityScrapersSpider):
name = "chi_northeastern_il_university"
agency = "Northeastern Illinois University"
timezone = "America/Chicago"
start_urls = [
"https://www.neiu.edu/about/board-of-trustees/board-meeting-materials"
]

def parse(self, response):
for meeting in response.css("div.board-meeting-materials-row.views-row"):
head = meeting.css("h4.accordion::text").get().split()
if len(head) >= 3:
date = " ".join(head[:3])
title = " ".join(head[3:]) if len(head) > 3 else ""
else:
date = head
title = ""
Comment on lines +22 to +28

🛠️ Refactor suggestion

Improve date and title parsing logic

The current logic assumes the first three whitespace-separated tokens of the heading always form the date. Worse, the else branch assigns the token list itself to date rather than a joined string, which will break the later datetime.strptime call in _parse_start.

Consider anchoring the date with a regex instead of counting tokens:

 def parse(self, response):
     for meeting in response.css("div.board-meeting-materials-row.views-row"):
-        head = meeting.css("h4.accordion::text").get().split()
-        if len(head) >= 3:
-            date = " ".join(head[:3])
-            title = " ".join(head[3:]) if len(head) > 3 else ""
-        else:
-            date = head
-            title = ""
+        head = (meeting.css("h4.accordion::text").get() or "").strip()
+        date_match = re.match(r"(\w+ \d{1,2}, \d{4})\s*(.*)", head)
+        if date_match:
+            date = date_match.group(1)
+            title = date_match.group(2)
+        else:
+            date = head
+            title = ""
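
As a quick sanity check of the suggested split (the sample headings are hypothetical; real ones may differ):

import re

samples = [
    "September 19, 2024 FINANCE COMMITTEE",  # date plus committee title
    "November 14, 2024",                     # date only: regular board meeting
    "Special Meeting",                       # no leading date: falls through
]

for head in samples:
    date_match = re.match(r"(\w+ \d{1,2}, \d{4})\s*(.*)", head)
    if date_match:
        date, title = date_match.group(1), date_match.group(2)
    else:
        date, title = head, ""
    print(repr(date), repr(title))

# 'September 19, 2024' 'FINANCE COMMITTEE'
# 'November 14, 2024' ''
# 'Special Meeting' ''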

links, agenda = self._parse_links(meeting)
details = None
if agenda:
res = requests.get(agenda)
details = extract_text(BytesIO(res.content))
Comment on lines +32 to +33

⚠️ Potential issue

Avoid using requests within Scrapy spiders

Using requests.get at lines 32-33 is not recommended within Scrapy spiders: it performs a blocking, synchronous download inside the callback and bypasses Scrapy's built-in features such as asynchronous scheduling, caching, throttling, and robots.txt handling. Instead, use Scrapy's Request or response.follow to fetch the agenda PDF.

Refactor the code to use scrapy.Request and handle the agenda PDF download asynchronously. Here's how you can modify the code:

-import requests
+from scrapy import Request

...

-            res = requests.get(agenda)
-            details = extract_text(BytesIO(res.content))
+            yield Request(
+                url=agenda,
+                callback=self.parse_agenda,
+                meta={'date': date, 'title': title, 'links': links}
+            )

# Then, define the parse_agenda method to handle the response
+    def parse_agenda(self, response):
+        details = extract_text(BytesIO(response.body))
+        meeting = Meeting(
+            title=self._parse_title(response.meta['title']),
+            description="",
+            classification=self._parse_classification(response.meta['title']),
+            start=self._parse_start(response.meta['date'], details),
+            end=self._parse_end(response.meta['date'], details),
+            all_day=self._parse_all_day(None),
+            time_notes="",
+            location=self._parse_location(details),
+            links=response.meta['links'],
+            source=self._parse_source(response),
+        )
+        meeting["status"] = self._get_status(meeting)
+        meeting["id"] = self._get_id(meeting)
+        yield meeting

Committable suggestion was skipped due to low confidence.
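
As a side note, Scrapy 1.7+ also supports cb_kwargs, which passes callback arguments as named parameters rather than through response.meta. A minimal sketch of the same hand-off under those assumptions (hypothetical spider name and placeholder URL, for illustration only):

from io import BytesIO

from pdfminer.high_level import extract_text
from scrapy import Request, Spider


class AgendaExampleSpider(Spider):
    # Hypothetical spider; only the hand-off pattern matters here.
    name = "agenda_example"

    def parse(self, response):
        # ... extract date, title, links and the agenda URL from the listing page ...
        agenda = "https://example.com/agenda.pdf"  # placeholder
        yield Request(
            url=agenda,
            callback=self.parse_agenda,
            cb_kwargs={"date": "September 19, 2024", "title": "", "links": []},
        )

    def parse_agenda(self, response, date, title, links):
        # cb_kwargs entries arrive as real keyword arguments, no meta lookups.
        details = extract_text(BytesIO(response.body))
        self.logger.info("Extracted %d characters of agenda text for %s", len(details), date)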

meeting = Meeting(
title=self._parse_title(title),
description="",
classification=self._parse_classification(title),
start=self._parse_start(date, details),
end=self._parse_end(date, details),
all_day=self._parse_all_day(meeting),
time_notes="",
location=self._parse_location(details),
links=links,
source=self._parse_source(response),
)

meeting["status"] = self._get_status(meeting)
meeting["id"] = self._get_id(meeting)

yield meeting

def getMeetingDetails(self, response):
print(response.text)

def _parse_title(self, item):
return item if not item == "" else "BOARD MEETING"

🛠️ Refactor suggestion

Simplify comparison by using item != ""

Instead of using not item == "", it's more Pythonic to use item != "".

Apply this diff to simplify the condition:

 def _parse_title(self, item):
-    return item if not item == "" else "BOARD MEETING"
+    return item if item != "" else "BOARD MEETING"
🧰 Tools
🪛 Ruff

56-56: Use item != "" instead of not item == ""; replace with the != operator (SIM201)
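
Assuming item is always a string here (the call site builds it with " ".join(...)), an equivalent one-liner leans on truthiness:

def _parse_title(self, item):
    # "" is falsy, so `or` falls through to the default title.
    return item or "BOARD MEETING"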


def _parse_description(self, item):
return ""

def _parse_classification(self, item):
return COMMITTEE if "COMMITTEE" in item else BOARD

def _parse_start(self, date, parse):
p = re.compile(
r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE
)
replacementPattern = re.compile("[^0-9:].*")
time = re.search(p, parse).group(0)
midDay = re.search(replacementPattern, time).group(0)
trueTime = (
time.replace(midDay, " AM").strip()
if "a" in midDay
else time.replace(midDay, " PM").strip()
)
fullDate = date + " " + trueTime
return datetime.strptime(fullDate, "%B %d, %Y %I:%M %p")
Comment on lines +69 to +77

⚠️ Potential issue

Add error handling for regex search in _parse_start

In the _parse_start method, parse is None whenever no agenda PDF was found, and re.search(p, None) raises a TypeError. Even with valid text, if the regular expression finds no match, re.search returns None and the .group(0) call raises an AttributeError. Either way the spider crashes.

Add a check to ensure parse is not None and that the regex search finds a match before proceeding:

 def _parse_start(self, date, parse):
+    if not parse:
+        # Handle the case where parse is None or empty
+        return None
     p = re.compile(
         r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE
     )
     replacementPattern = re.compile("[^0-9:].*")
-    time = re.search(p, parse).group(0)
+    search_result = re.search(p, parse)
+    if not search_result:
+        # Handle the absence of a matching time string
+        return None
+    time = search_result.group(0)

Committable suggestion was skipped due to low confidence.
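
For reference, a standalone check of the time normalization this method performs (the agenda line is hypothetical):

import re
from datetime import datetime

sample = "Board of Trustees Meeting\n1:00 p.m. | Student Union"  # hypothetical PDF text

p = re.compile(r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE)
replacementPattern = re.compile("[^0-9:].*")

time = p.search(sample).group(0)                   # "1:00 p.m."
midDay = replacementPattern.search(time).group(0)  # " p.m."
trueTime = (
    time.replace(midDay, " AM").strip()
    if "a" in midDay
    else time.replace(midDay, " PM").strip()
)
print(trueTime)  # 1:00 PM
print(datetime.strptime("September 19, 2024 " + trueTime, "%B %d, %Y %I:%M %p"))
# 2024-09-19 13:00:00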


def _parse_end(self, date, parse):
pattern = re.compile(
r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE
)
replacementPattern = re.compile("[^0-9:].*")
time = re.findall(pattern, parse)[-1]
midDay = re.search(replacementPattern, time).group(0)
trueTime = (
time.replace(midDay, " AM").strip()
if "a" in midDay
else time.replace(midDay, " PM").strip()
)
fullDate = date + " " + trueTime
return datetime.strptime(fullDate, "%B %d, %Y %I:%M %p")
Comment on lines +84 to +92

⚠️ Potential issue

Add error handling for empty regex results in _parse_end

In the _parse_end method, re.findall(pattern, None) raises a TypeError when no agenda text was extracted, and if the text contains no time strings, indexing the empty result list with [-1] raises an IndexError.

Ensure that parse is not None and that re.findall returns a non-empty list before proceeding:

 def _parse_end(self, date, parse):
+    if not parse:
+        # Handle the case where parse is None or empty
+        return None
     pattern = re.compile(
         r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE
     )
     replacementPattern = re.compile("[^0-9:].*")
-    time = re.findall(pattern, parse)[-1]
+    times = re.findall(pattern, parse)
+    if not times:
+        # Handle the absence of time strings
+        return None
+    time = times[-1]

Committable suggestion was skipped due to low confidence.
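
To see why taking the last findall match targets the end time, here is a standalone check on a hypothetical agenda listing both a start and an adjournment time:

import re

sample = "Meeting convenes at 1:00 p.m.\nAdjournment expected by 3:30 p.m."

pattern = re.compile(r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE)
times = pattern.findall(sample)
print(times)      # ['1:00 p.m.', '3:30 p.m.']
print(times[-1])  # 3:30 p.m. (the last time in the document is taken as the end)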


def _parse_time_notes(self, item):
return ""

def _parse_all_day(self, item):
return False

def _parse_location(self, item):
pattern = re.compile(r"(\d\d\d\d.*\n?)(?=\s*Meeting)", re.MULTILINE)
match = re.search(pattern, item)
location = match.group(1).strip().split("|")
return {
"address": location[0].strip() + ", " + location[1].strip(),
"name": location[2].strip(),
}
Comment on lines +101 to +107

⚠️ Potential issue

Add error handling for regex search in _parse_location

In _parse_location, item is None whenever no agenda PDF was found, so re.search(pattern, item) raises a TypeError; and when the pattern finds no match, re.search returns None and match.group(1) raises an AttributeError.

Include checks for both cases:

 def _parse_location(self, item):
+    if not item:
+        # No agenda text was extracted; fall back to empty fields
+        return {"address": "", "name": ""}
     pattern = re.compile(r"(\d\d\d\d.*\n?)(?=\s*Meeting)", re.MULTILINE)
     match = re.search(pattern, item)
+    if not match:
+        # Handle the absence of a matching location string
+        return {"address": "", "name": ""}
     location = match.group(1).strip().split("|")

Committable suggestion was skipped due to low confidence.
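
A quick standalone check of the location pattern, using a hypothetical PDF snippet in the "#### Street | City | Room" shape the regex expects:

import re

# Hypothetical agenda text; real PDFs may format the address block differently.
sample = (
    "5500 N. St. Louis Ave. | Chicago, IL 60625 | Student Union\n"
    "  Meeting of the Board of Trustees"
)

pattern = re.compile(r"(\d\d\d\d.*\n?)(?=\s*Meeting)", re.MULTILINE)
match = pattern.search(sample)
if not match:
    location = {"address": "", "name": ""}
else:
    parts = match.group(1).strip().split("|")
    location = {
        "address": parts[0].strip() + ", " + parts[1].strip(),
        "name": parts[2].strip(),
    }
print(location)
# {'address': '5500 N. St. Louis Ave., Chicago, IL 60625', 'name': 'Student Union'}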


def _parse_links(self, item):
links = []
agenda = None
for link in item.css("a"):
href = link.attrib["href"]
title = link.xpath("./text()").extract_first(default="")
if "agenda" in title.lower():
agenda = href
links.append(
{
"href": href,
"title": title,
}
)
return links, agenda

def _parse_source(self, response):
return response.url