# Origin: MLS_scraper_test_clear_spans.py (introduced as a new file by the
# diff this text was extracted from).
from urllib.request import urlopen
from bs4 import BeautifulSoup
import time
import csv

# this version is able to get rid of the spans inside age/birthday
# and birthplace -- see lines 63-67 (numbering of the original file)

# Site root; every href collected below is appended to this prefix.
BASE_URL = "http://www.mlssoccer.com"

# The CSV file and writer are left open at module level so that code
# further down the script can keep writing rows through `c`.
csvfile = open("mls_players_test.csv", 'w', newline='', encoding='utf-8')
c = csv.writer(csvfile)
# write the header row for CSV file
c.writerow(['title', 'team', 'position', 'birthday', 'birthplace', 'twitter'])

html = urlopen(BASE_URL + "/players")
bsObj = BeautifulSoup(html, "html.parser")
player_list = []


# player links are on multiple pages -- get the next page URL
def get_next_page(html, bsObj):
    """Follow the "Go to next page" link of a listing page, if it has one.

    Args:
        html: response object of the current page; it is not read here and
            is only rebound to the next page's response.
        bsObj: parsed BeautifulSoup document of the current listing page.

    When a next-page anchor with an ``href`` exists, that page is fetched,
    parsed, and handed to ``get_player_pages``. Otherwise a completion
    message is printed.
    """
    next_page = bsObj.find("a", {"title": "Go to next page"})
    if next_page and ('href' in next_page.attrs):
        partial = str(next_page.attrs['href'])
        new_url = BASE_URL + partial
        # Close the HTTP response as soon as it has been parsed instead of
        # leaking the connection; BeautifulSoup reads the whole body up front.
        with urlopen(new_url) as html:
            bsObj = BeautifulSoup(html, "html.parser")
        get_player_pages(html, bsObj)
    else:
        print("Done collecting URLs ...")


# run this on each page to get player detail page links
def get_player_pages(html, bsObj):
    """Append the href of every ``a.row_link`` anchor to ``player_list``.

    Args:
        html: response object of the page; unused here, kept so the
            signature mirrors ``get_next_page``.
        bsObj: parsed BeautifulSoup document of a listing page.
    """
    global player_list
    # find_all is the current name of the legacy findAll alias.
    tag_list = bsObj.find_all("a", {"class": "row_link"})
    for tag in tag_list:
        if 'href' in tag.attrs:
            player_list.append(str(tag.attrs['href']))
    # time.sleep(1) # decided I don't need this delay
    # get_next_page(html, bsObj)


def get_player_details(player_list):
    """Fetch each player's detail page and locate its profile elements.

    Args:
        player_list: iterable of href strings that are appended to
            ``BASE_URL`` to form each detail-page URL.
    """
    counter = 0
    for player in player_list:
        new_url = BASE_URL + player
        html = urlopen(new_url)
        bsObj = BeautifulSoup(html, "html.parser")
        # Drop the first <span> in the document; guard against pages that
        # contain no <span> at all, where bsObj.span is None.
        first_span = bsObj.span
        if first_span is not None:
            first_span.decompose()
        player_details = []
        title = bsObj.find("div", {"class": "title"})
        team = bsObj.find("div", {"class": "club"})
        position = bsObj.find("span", {"class": "position"})
        # had div for position - should be span - oops
        birthday = bsObj.find("div", {"class": "age"})
        #