Skip to content

Commit

Permalink
Fix CivPlus bugs #176 #189
Browse files Browse the repository at this point in the history
  • Loading branch information
zstumgoren committed Sep 6, 2024
1 parent a7775f1 commit d27e071
Show file tree
Hide file tree
Showing 4 changed files with 2,356 additions and 5 deletions.
17 changes: 14 additions & 3 deletions civic_scraper/platforms/civic_plus/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def file_links_with_no_title(tag):
)

metadata = []
# Links often appear twice (once under meeting title, once in download menu)
# so we track which we've already seen to avoid duplicate entries
bookkeeping = set()
for div in divs:
cmte_name = self._committee_name(div)
# Line-item data for each meeting is inside table rows.
Expand All @@ -52,6 +55,9 @@ def file_links_with_no_title(tag):
# Skip links to page listing previous agenda versions
if self._previous_version_link(link):
continue
# Skip previously harvested links
if link["href"] in bookkeeping:
continue
metadata.append(
{
"committee_name": cmte_name,
Expand All @@ -63,13 +69,18 @@ def file_links_with_no_title(tag):
"asset_type": self._asset_type(link["href"]),
}
)
bookkeeping.add(link["href"])
return metadata

def _committee_name(self, div):
    """Return the committee name shown in a meeting-list section header.

    The name is read from the section's ``h2`` element, falling back to
    ``h3`` when there is no ``h2``. If the ``h2`` holds a ``span`` (the
    arrow ▼ for toggling the meeting list), that span is detached first
    so the arrow does not end up in the returned name.

    Args:
        div: Parsed element for one committee section. Presumably a
            BeautifulSoup ``Tag`` -- confirm against the caller.

    Returns:
        The header text with surrounding whitespace stripped.

    Raises:
        AttributeError: If the section has neither an ``h2`` nor an ``h3``.

    Note:
        Detaching the span mutates ``div`` in place.
    """
    h2 = div.h2
    # The toggle arrow is optional: only detach it when both the h2 and
    # its span actually exist, instead of assuming they are always there.
    toggle = h2.span if h2 is not None else None
    if toggle is not None:
        toggle.extract()
    # Some pages render the committee header as an h3 rather than an h2.
    header_node = h2 if h2 is not None else div.h3
    return header_node.text.strip()

def _mtg_title(self, row):
    """Return the meeting title taken from a table row.

    Args:
        row: Parsed element for one meeting line item. Presumably a
            BeautifulSoup ``Tag`` for a table row -- confirm against
            the caller.

    Returns:
        The text of the row's ``p`` element with surrounding whitespace
        stripped.
    """
    return row.p.text.strip()
Expand Down
Loading

0 comments on commit d27e071

Please sign in to comment.