Skip to content

Commit

Permalink
Add a scraper gathering Admin API docs links
Browse files Browse the repository at this point in the history
A first draft. Collects only those anchors whose direct parent is a header tag.
  • Loading branch information
JOJ0 committed Aug 5, 2024
1 parent 7deb008 commit f85224a
Showing 1 changed file with 31 additions and 0 deletions.
31 changes: 31 additions & 0 deletions scrape_docs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python

from bs4 import BeautifulSoup
import requests
import re
import pprint as p


# Scrape one chapter of the Synapse Admin API docs and print every headline
# together with the links found directly inside headline tags.

# Tag names treated as headlines; an anchor is only reported when its direct
# parent is one of these.
HEADER_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

chapter = 'https://element-hq.github.io/synapse/develop/admin_api/rooms.html'

# Bound the wait and fail loudly on HTTP errors, so the script neither hangs
# forever nor parses an error page as if it were documentation.
response = requests.get(chapter, timeout=30)
response.raise_for_status()
apidoc = response.text
soup = BeautifulSoup(apidoc, 'html.parser')

# A single query for headlines and anchors together, so that the loop below
# sees them interleaved rather than as two separate lists.
elements = soup.find_all([*HEADER_TAGS, 'a'])

# Debug aid: uncomment to dump the raw list of matched tags.
# p.pprint(elements)
for e in elements:
    if e.name in HEADER_TAGS:
        print(f'HEADLINE {e.name}: {e.text}')
    elif e.name == 'a' and e.parent.name in HEADER_TAGS:
        # Anchors without an href (e.g. <a name="...">) carry no link to
        # report; .get() avoids the KeyError that e['href'] would raise.
        link = e.get('href')
        if link is None:
            continue
        print(f'{e.text} {link}')
        print()

# Three empty lines to terminate the output (same as three bare print()s).
print('\n\n')

#print(soup.prettify())

0 comments on commit f85224a

Please sign in to comment.