Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

15 fix cle design review #34

Merged
merged 12 commits into from
Dec 27, 2021
253 changes: 188 additions & 65 deletions city_scrapers/spiders/cle_design_review.py
Original file line number Diff line number Diff line change
@@ -1,103 +1,186 @@
import re
from datetime import datetime
import time
from datetime import datetime, timedelta

from city_scrapers_core.constants import ADVISORY_COMMITTEE
from city_scrapers_core.items import Meeting
from city_scrapers_core.spiders import CityScrapersSpider
from scrapy import Selector

from city_scrapers.utils import calculate_upcoming_meeting_days


class CleDesignReviewSpider(CityScrapersSpider):
    name = "cle_design_review"
    agency = "Cleveland Design Review Advisory Committees"
    timezone = "America/Detroit"
    # NOTE: the old clevelandohio.gov/.../MeetingSchedules URL was removed;
    # leaving both string literals in the list would implicitly concatenate
    # them into a single garbage URL.
    start_urls = [
        "https://planning.clevelandohio.gov/designreview/schedule.php"  # noqa
    ]
    # Appended with the committee contact email to build meeting descriptions.
    description = "Due to Covid meetings are being held on WebEx rather than in person. For more information contact "  # noqa
    calculated_description = "This is an upcoming meeting - please verify it with staff if you want to attend. Due to Covid meetings are being held on WebEx rather than in person. For more information contact "  # noqa

def parse(self, response):
    """Yield Meeting items for every design review committee on the page.

    There's no element that wraps both the committee name/time and the
    dropdown containing the agendas. As such we grab each committee
    name/time block and then use the following dropdown to get the
    agendas. Luckily all of the committee name/times are (and are the
    only thing in) divs with the class '.mt-3', so we grab those divs
    and look for the next sibling div with the '.dropdown' class to get
    the links to all the agendas.

    Note that the city planning meeting is handled by a different
    scraper, so we do NOT look at it here. Its name/times are not
    wrapped in a div, so the '.mt-3' selector skips it.

    Three points to keep in mind:

    1. The page doesn't make it easy to know whether a meeting occurred
       but has no agenda, or whether a meeting will happen on a normal
       meeting date. Agenda links are treated as authoritative for past
       (and, if listed, upcoming) meetings. For future meetings we take
       the date of the most recent agenda and calculate meetings for 60
       days from that date. As agendas are added, those tentative
       meetings are either confirmed or disappear. Calculated meetings
       get a description line encouraging users to verify the meeting
       with staff before attempting to attend.

    2. There is no mention of the year anywhere in the text of the
       site. It is extracted from the agenda link instead - keep an eye
       on how the site changes in January.

    3. Meetings are currently not held in person but over WebEx. This
       information is included in the meeting description.
    """
    # div.mt-3 skips city planning since it's handled by a separate scraper.
    committee_metas = response.css("div.mt-3")
    committee_agendas = response.css("div.mt-3 + div.dropdown")
    if len(committee_metas) != len(committee_agendas):
        # We haven't successfully extracted matched metas and agendas so we
        # can't safely iterate over them together.
        raise ValueError("Cannot match committee agendas to committee metadata")

    for committee_meta, committee_agenda_list in zip(committee_metas, committee_agendas):
        title = self._parse_title(committee_meta)
        if not title:
            continue
        location = self._parse_location(committee_meta)
        time_str = self._parse_time_str(committee_meta)
        email_contact = self._parse_email_contact(committee_meta)
        weekday, chosen_ordinals, is_downtown = self._parse_meeting_schedule_info(
            committee_meta
        )
        most_recent_start = datetime.today()

        # Start by looking through the agendas for existing meetings.
        for agenda in committee_agenda_list.css("div.dropdown-menu a.dropdown-item"):
            month_str, day_str = (
                agenda.css("*::text").extract_first().strip().split(" ")
            )
            year_str = self._parse_year_from_agenda_link(agenda)
            start = self._parse_start(year_str, month_str, day_str, time_str)
            if not start:
                continue
            # most_recent_start seeds the upcoming-meeting calculation.
            # Only update it AFTER the None-check so an unparseable agenda
            # entry can't clobber it and crash the timedelta math below.
            most_recent_start = start
            meeting = Meeting(
                title=title,
                description=self.description + email_contact,
                classification=ADVISORY_COMMITTEE,
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=location,
                links=self._parse_links(agenda, response),
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting

        # Next we calculate upcoming meeting dates for 60 days after the
        # last agenda date.
        calc_start = most_recent_start + timedelta(days=1)
        # Downtown meetings are calculated from the city planning meeting
        # one day ahead, so add an extra day to avoid re-emitting the date
        # of the most recent agenda.
        if is_downtown:
            calc_start = calc_start + timedelta(days=1)

        calc_end = calc_start + timedelta(days=60)

        upcoming_meetings = calculate_upcoming_meeting_days(
            weekday, chosen_ordinals, calc_start.date(), calc_end.date()
        )
        if is_downtown:  # downtown meetings are a day before the one calculated
            upcoming_meetings = [
                day + timedelta(days=-1) for day in upcoming_meetings
            ]

        for day in upcoming_meetings:
            start = self._parse_calculated_start(day, time_str)
            meeting = Meeting(
                title=title,
                description=self.calculated_description + email_contact,
                classification=ADVISORY_COMMITTEE,
                start=start,
                end=None,
                all_day=False,
                time_notes="",
                location=location,
                links=[],
                source=response.url,
            )

            meeting["status"] = self._get_status(meeting)
            meeting["id"] = self._get_id(meeting)

            yield meeting

def _parse_title(self, item):
    """Parse or generate meeting title.

    Reads the committee name from the meta block's <h4> heading.
    Returns None implicitly when no "... Design Review Committee"
    heading is present, which signals the caller to skip the block.
    """
    # Diff residue removed: the old "p > strong::text" selector line had been
    # left in, turning this into an accidental double-loop comprehension.
    committee_strs = [
        c.strip()
        for c in item.css("h4::text").extract()
        if c.strip().upper().endswith("DESIGN REVIEW COMMITTEE")
    ]
    if committee_strs:
        return committee_strs[0].title()

def _parse_time_str(self, item):
    """Parse out the time as a string in the format hh:mm:am/pm.

    Falls back to "12:00am" when no time is found in the committee's
    description paragraph.
    """
    desc_text = " ".join(item.css("p.mb-1::text").extract())
    # NOTE(review): [apm]{2} also matches junk pairs like "aa"/"mm"; kept
    # as-is to preserve behavior, but (a|p)m would be stricter.
    time_match = re.search(r"\d{1,2}:\d{2}\s*[apm]{2}", desc_text)
    if time_match:
        return time_match.group().replace(" ", "")
    return "12:00am"

def _parse_start(self, item, year_str, month_str, time_str):
def _parse_start(self, year_str, month_str, day_str, time_str):
"""Parse start datetime as a naive datetime object."""
cell_text = " ".join(item.css("* ::text").extract())
date_text = re.sub(r"\D", "", cell_text)
if not date_text or "No meeting" in cell_text:
return
date_str = " ".join([year_str, month_str, date_text, time_str])
return datetime.strptime(date_str, "%Y %b %d %I:%M%p")
date_str = " ".join([year_str, month_str, day_str, time_str])
return datetime.strptime(date_str, "%Y %B %d %I:%M%p")

def _parse_calculated_start(self, day, time_str):
"""Parse start datetime from python date and a string with the time."""
date_str = " ".join([day.strftime("%Y %B %d"), time_str])
return datetime.strptime(date_str, "%Y %B %d %I:%M%p")

def _parse_location(self, item):
"""Parse or generate location."""
desc_str = " ".join(item.css("p[id] *::text").extract())
# Override for first committee
if "CITYWIDE" in desc_str:
desc_str = " ".join(
[l for l in item.css("p *::text").extract() if "days" in l]
)
desc_str = " ".join(item.css("p.mb-1::text").extract())
loc_str = re.sub(r"\s+", " ", re.split(r"(\sin\s|\sat\s)", desc_str)[-1])
# The downtown/flats commission doesn't give the full address - it just says
# city hall so we need a special case to add the street address
if "City Hall" in loc_str:
loc_name = "City Hall"
room_match = re.search(r"(?<=Room )\d+", loc_str)
Expand All @@ -111,6 +194,7 @@ def _parse_location(self, item):
split_loc = loc_str.split("-")
loc_name = "-".join(split_loc[:-1])
loc_addr = split_loc[-1]
# We need to make sure that the address ends with the city and state
if "Cleveland" not in loc_addr:
loc_addr = loc_addr.strip() + " Cleveland, OH"
return {
Expand All @@ -119,12 +203,51 @@ def _parse_location(self, item):
}

def _parse_links(self, item, response):
    """Parse out the links for the meeting.

    ``item`` is the agenda anchor element; its href is resolved against
    the response URL and returned as the single "Agenda" link. (Diff
    residue removed: the old per-anchor loop had been left interleaved
    with the new single-append body.)
    """
    return [{"title": "Agenda", "href": response.urljoin(item.attrib["href"])}]

def _parse_year_from_agenda_link(self, item):
    """Parse the year as a string from a link containing the agenda.

    Agenda hrefs embed the year as a path segment (".../2021/...").
    Falls back to the current year instead of a hard-coded "2021" so
    the spider doesn't silently mis-date agendas after year rollover.
    """
    link = item.attrib["href"]
    year_match = re.search(r"\/(20\d{2})\/", link)
    if year_match:
        return year_match.group(1)
    return str(datetime.now().year)

def _parse_email_contact(self, item):
    """Parses the email for a committee's contact."""
    text_nodes = item.css("p.mt-1::text").extract()
    # The third text node carries the address prefixed with ": "; drop
    # the label separator and return just the email.
    return text_nodes[2].replace(": ", "")

def _parse_meeting_schedule_info(self, committee_meta):
    """Parses out the weekday and frequency of the meeting for
    calculating future dates.

    Returns a ``(weekday, chosen_ordinals, is_downtown)`` tuple.
    """
    schedule_text = " ".join(committee_meta.css("p.mb-1::text").extract())
    # Special case: downtown meetings are held the day before city
    # planning, so we calculate using the city planning schedule (1st
    # and 3rd Friday) and set a flag so a day can be subtracted from
    # the results.
    is_downtown = "prior to the City Planning Commission" in schedule_text
    if is_downtown:
        return 4, [0, 2], True

    day_name = committee_meta.css("p.mb-1 strong::text").extract_first()
    weekday = self._parse_weekday(day_name)
    # "Ordinals" here just refer to the 1st, 2nd, etc. occurrence of the
    # weekday within the month.
    chosen_ordinals = [
        self._parse_ordinal(week)
        for week in re.findall(r"1st|2nd|3rd|4th", schedule_text)
    ]
    return weekday, chosen_ordinals, is_downtown

def _parse_weekday(self, weekday):
"""Parses weekday strings as their integer equivalent"""
# we cut off the last char of weekday, because it comes through with
# an 's' i.e. 'Tuesdays'
return time.strptime(weekday[:-1], "%A").tm_wday

def _parse_ordinal(self, ordinal_str):
"""Parses ordinals as their integer equivalent beginning from 0"""
ordinal_lookup = {"1st": 0, "2nd": 1, "3rd": 2, "4th": 3}
return ordinal_lookup[ordinal_str.lower()]
1 change: 1 addition & 0 deletions city_scrapers/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .meeting_date_calculator import calculate_upcoming_meeting_days # noqa
Loading