Skip to content

Commit

Permalink
Build a minimalistic Click cli around scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
JOJ0 committed Aug 6, 2024
1 parent f85224a commit c549c2c
Showing 1 changed file with 33 additions and 19 deletions.
52 changes: 33 additions & 19 deletions scrape_docs.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,45 @@
#!/usr/bin/env python

import click
from bs4 import BeautifulSoup
import requests
import re
import pprint as p

@click.command()
@click.option(
    '--output', '-o', default='default', type=click.Choice(['default', 'csv']),
    show_choices=True,
    help=('Output format "default" prints human readable\n'
          'on shell, "csv" is a two-column comma separated value format.'))
@click.argument('URL')
def scrape(output, url):
    '''Scrape one chapter of Admin API docs and spit out in various formats.

    URL is the address of the Synapse Admin API docs chapter. For example:
    https://element-hq.github.io/synapse/develop/admin_api/rooms.html'''
    # NOTE: the docstring above doubles as the command's --help text, so it
    # is kept user-facing; developer notes live in comments.
    # NOTE(review): `output` is parsed and validated by Click, but nothing
    # below reads it yet -- every run prints the human readable format.
    heading_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    try:
        # A timeout keeps the command from hanging on a stalled server.
        response = requests.get(url, timeout=30)
        # Fail on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
    except requests.RequestException as err:
        # Report fetch problems as a short CLI error rather than a traceback.
        raise click.ClickException(f'Could not fetch {url}: {err}') from err

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect headings and anchors in a single pass, in document order.
    elements = soup.find_all(heading_tags + ['a'])

    for e in elements:
        if e.name in heading_tags:
            print(f'HEADLINE {e.name}: {e.text}')
        elif e.name == 'a' and e.parent.name in heading_tags:
            # Only anchors that sit directly inside a heading are reported.
            # .get() avoids a KeyError for anchors without an href attribute.
            link = e.get('href', '')
            print(f'{e.text} {link}')
            # Blank line after each reported heading link.
            print()

    # Three trailing blank lines, as before (two explicit + print's own).
    print('\n\n')


if __name__ == '__main__':
    # Click reads sys.argv and supplies the `output` and `url` arguments.
    scrape()

0 comments on commit c549c2c

Please sign in to comment.