Add a scraper gathering Admin API docs links

A first draft: collects anchors, but only those whose direct parent is a heading tag (h1–h6).
JOJ0 · Aug 5, 2024 · f85224a · f85224a
1 parent 7deb008
commit f85224a
Showing 1 changed file with 31 additions and 0 deletions.
diff --git a/scrape_docs.py b/scrape_docs.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+from bs4 import BeautifulSoup
+import requests
+import re
+import pprint as p
+
+
+chapter = 'https://element-hq.github.io/synapse/develop/admin_api/rooms.html'
+apidoc = requests.get(chapter).text
+soup = BeautifulSoup(apidoc, 'html.parser')
+
+elements = soup.find_all(
+        ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a'],
+)
+
+#p.pprint(elements)
+for e in elements:
+    if e.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+        print(f'HEADLINE {e.name}: {e.text}')
+    if e.name == 'a':
+        if e.parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+            link = e['href']
+            print(f'{e.text} {link}')
+            print()
+
+print()
+print()
+print()
+
+#print(soup.prettify())