-
-
Notifications
You must be signed in to change notification settings - Fork 312
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixing Issue #728 #1131
base: main
Are you sure you want to change the base?
Fixing Issue #728 #1131
Changes from all commits
7e53c15
addb736
8098287
482842b
0826eb0
3e89b51
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -0,0 +1,126 @@ | ||||||
import re | ||||||
from datetime import datetime | ||||||
from io import BytesIO | ||||||
|
||||||
import requests | ||||||
from city_scrapers_core.constants import BOARD, COMMITTEE | ||||||
from city_scrapers_core.items import Meeting | ||||||
from city_scrapers_core.spiders import CityScrapersSpider | ||||||
from pdfminer.high_level import extract_text | ||||||
|
||||||
|
||||||
class ChiNortheasternIlUniversitySpider(CityScrapersSpider): | ||||||
name = "chi_northeastern_il_university" | ||||||
agency = "Northeastern Illinois University" | ||||||
timezone = "America/Chicago" | ||||||
start_urls = [ | ||||||
"https://www.neiu.edu/about/board-of-trustees/board-meeting-materials" | ||||||
] | ||||||
|
||||||
def parse(self, response): | ||||||
for meeting in response.css("div.board-meeting-materials-row.views-row"): | ||||||
head = meeting.css("h4.accordion::text").get().split() | ||||||
if len(head) >= 3: | ||||||
date = " ".join(head[:3]) | ||||||
title = " ".join(head[3:]) if len(head) > 3 else "" | ||||||
else: | ||||||
date = head | ||||||
title = "" | ||||||
links, agenda = self._parse_links(meeting) | ||||||
details = None | ||||||
if agenda: | ||||||
res = requests.get(agenda) | ||||||
details = extract_text(BytesIO(res.content)) | ||||||
Comment on lines
+32
to
+33
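There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Avoid using `requests` inside a Scrapy spider. Using `requests.get` issues a blocking call outside Scrapy's own request handling. Refactor the code to use `scrapy.Request` with a callback:
-import requests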
+from scrapy import Request
...
- res = requests.get(agenda)
- details = extract_text(BytesIO(res.content))
+ yield Request(
+ url=agenda,
+ callback=self.parse_agenda,
+ meta={'date': date, 'title': title, 'links': links}
+ )
# Then, define the parse_agenda method to handle the response
+ def parse_agenda(self, response):
+ details = extract_text(BytesIO(response.body))
+ meeting = Meeting(
+ title=self._parse_title(response.meta['title']),
+ description="",
+ classification=self._parse_classification(response.meta['title']),
+ start=self._parse_start(response.meta['date'], details),
+ end=self._parse_end(response.meta['date'], details),
+ all_day=self._parse_all_day(),
+ time_notes="",
+ location=self._parse_location(details),
+ links=response.meta['links'],
+ source=self._parse_source(response),
+ )
+ meeting["status"] = self._get_status(meeting)
+ meeting["id"] = self._get_id(meeting)
+ yield meeting
|
||||||
meeting = Meeting( | ||||||
title=self._parse_title(title), | ||||||
description="", | ||||||
classification=self._parse_classification(title), | ||||||
start=self._parse_start(date, details), | ||||||
end=self._parse_end(date, details), | ||||||
all_day=self._parse_all_day(meeting), | ||||||
time_notes="", | ||||||
location=self._parse_location(details), | ||||||
links=links, | ||||||
source=self._parse_source(response), | ||||||
) | ||||||
|
||||||
meeting["status"] = self._get_status(meeting) | ||||||
meeting["id"] = self._get_id(meeting) | ||||||
|
||||||
yield meeting | ||||||
|
||||||
def getMeetingDetails(self, response):
    """Debug helper: print the text of ``response`` to standard output.

    Args:
        response: Any object exposing a ``text`` attribute.
    """
    print(response.text)
|
||||||
def _parse_title(self, item): | ||||||
return item if not item == "" else "BOARD MEETING" | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🛠️ Refactor suggestion: Simplify comparison by using `!=`. Instead of using `not item == ""`, use `item != ""`. Apply this diff to simplify the condition:
 def _parse_title(self, item):
- return item if not item == "" else "BOARD MEETING"
+ return item if item != "" else "BOARD MEETING" 📝 Committable suggestion
Suggested change
🧰 Tools🪛 Ruff
|
||||||
|
||||||
def _parse_description(self, item): | ||||||
return "" | ||||||
|
||||||
def _parse_classification(self, item):
    """Classify a meeting from its title text.

    Args:
        item: Title text for the meeting; may be empty or ``None``.

    Returns:
        ``COMMITTEE`` when the title mentions a committee (matched
        case-insensitively), otherwise ``BOARD``.
    """
    # Lower-casing lets "Committee" and "COMMITTEE" classify the same way,
    # and the ``or ""`` keeps a missing title from raising.
    return COMMITTEE if "committee" in (item or "").lower() else BOARD
|
||||||
def _parse_start(self, date, parse): | ||||||
p = re.compile( | ||||||
r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE | ||||||
) | ||||||
replacementPattern = re.compile("[^0-9:].*") | ||||||
time = re.search(p, parse).group(0) | ||||||
midDay = re.search(replacementPattern, time).group(0) | ||||||
trueTime = ( | ||||||
time.replace(midDay, " AM").strip() | ||||||
if "a" in midDay | ||||||
else time.replace(midDay, " PM").strip() | ||||||
) | ||||||
fullDate = date + " " + trueTime | ||||||
return datetime.strptime(fullDate, "%B %d, %Y %I:%M %p") | ||||||
Comment on lines
+69
to
+77
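There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add error handling for regex search in `_parse_start`. In the `_parse_start` method, `re.search` returns `None` when no time string matches, so calling `.group(0)` on the result raises `AttributeError`. Add a check to ensure the search result is not `None` before using it:
 def _parse_start(self, date, parse):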
+ if not parse:
+ # Handle the case where parse is None or empty
+ return None
p = re.compile(
r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE
)
replacementPattern = re.compile("[^0-9:].*")
+ search_result = re.search(p, parse)
+ if not search_result:
+ # Handle the absence of a matching time string
+ return None
- time = re.search(p, parse).group(0)
+ time = search_result.group(0)
|
||||||
|
||||||
def _parse_end(self, date, parse): | ||||||
pattern = re.compile( | ||||||
r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE | ||||||
) | ||||||
replacementPattern = re.compile("[^0-9:].*") | ||||||
time = re.findall(pattern, parse)[-1] | ||||||
midDay = re.search(replacementPattern, time).group(0) | ||||||
trueTime = ( | ||||||
time.replace(midDay, " AM").strip() | ||||||
if "a" in midDay | ||||||
else time.replace(midDay, " PM").strip() | ||||||
) | ||||||
fullDate = date + " " + trueTime | ||||||
return datetime.strptime(fullDate, "%B %d, %Y %I:%M %p") | ||||||
Comment on lines
+84
to
+92
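There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add error handling for empty regex results in `_parse_end`. In the `_parse_end` method, `re.findall` may return an empty list, so indexing it with `[-1]` raises `IndexError`. Ensure that the list is non-empty before accessing its last element:
 def _parse_end(self, date, parse):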
+ if not parse:
+ # Handle the case where parse is None or empty
+ return None
pattern = re.compile(
r"\d{1,2}:\d{1,2}.[a-z]{0,1}\.{0,1}[a-z]{0,1}\.{0,1}", re.MULTILINE
)
replacementPattern = re.compile("[^0-9:].*")
times = re.findall(pattern, parse)
+ if not times:
+ # Handle the absence of time strings
+ return None
- time = times[-1]
+ time = times[-1]
|
||||||
|
||||||
def _parse_time_notes(self, item): | ||||||
return "" | ||||||
|
||||||
def _parse_all_day(self, item): | ||||||
return False | ||||||
|
||||||
def _parse_location(self, item): | ||||||
pattern = re.compile(r"(\d\d\d\d.*\n?)(?=\s*Meeting)", re.MULTILINE) | ||||||
match = re.search(pattern, item) | ||||||
location = match.group(1).strip().split("|") | ||||||
return { | ||||||
"address": location[0].strip() + ", " + location[1].strip(), | ||||||
"name": location[2].strip(), | ||||||
} | ||||||
Comment on lines
+101
to
+107
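There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Add error handling for regex search in `_parse_location`. In `_parse_location`, `re.search` may return `None`, and calling `match.group(1)` would then raise `AttributeError`. Include a check to handle cases where `match` is `None`:
 def _parse_location(self, item):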
pattern = re.compile(r"(\d\d\d\d.*\n?)(?=\s*Meeting)", re.MULTILINE)
match = re.search(pattern, item)
+ if not match:
+ # Handle the absence of a matching location string
+ return {"address": "", "name": ""}
location = match.group(1).strip().split("|")
|
||||||
|
||||||
def _parse_links(self, item): | ||||||
links = [] | ||||||
agenda = None | ||||||
for link in item.css("a"): | ||||||
href = link.attrib["href"] | ||||||
title = link.xpath("./text()").extract_first(default="") | ||||||
if "agenda" in title.lower(): | ||||||
agenda = href | ||||||
links.append( | ||||||
{ | ||||||
"href": href, | ||||||
"title": title, | ||||||
} | ||||||
) | ||||||
return links, agenda | ||||||
|
||||||
def _parse_source(self, response): | ||||||
return response.url |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Improve date and title parsing logic
The current logic for parsing `date` and `title` from `head` may fail if the format of `head` changes or doesn't meet expectations. This can lead to incorrect data extraction.
Consider using more robust parsing by checking the content of `head`:
📝 Committable suggestion