-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerriamScraper.py
124 lines (95 loc) · 3 KB
/
merriamScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#Luke De Vos
'''
Scrapes Merriam-Webster's online (collegiate) dictionary for the syllable structure and first listed pronunciation of a requested word (or of each word in a text file). If a word's first listed pronunciation depends on its part of speech, one line is returned for each pronunciation together with its associated part of speech.
'''
import re
import sys
import urllib.parse
import urllib.request
def findSylls(raw):
    '''
    Returns the text of the first "word-syllables" element found in raw, with
    the syllable-break markers removed.

    raw: string of HTML to search.
    Returns "n/a" when no such element is found, when its text is empty, or
    when raw is not a string.
    '''
    try:
        # Text between the closing '>' of the "word-syllables" tag and the next '<'.
        sylMatch = re.search("\"word-syllables\">(.*?)<", raw)
    except TypeError:
        # raw was not a str (e.g. None); report it like any other miss.
        return "n/a"
    if sylMatch is None or not sylMatch.group(1):
        return "n/a"
    # NOTE(review): the marker literal appears to be a middle dot followed by
    # an invisible zero-width space -- keep it exactly as written.
    return sylMatch.group(1).replace("·​", "")
def findPhos(raw):
    '''
    Returns the first pronunciation found in raw, i.e. the text of the first
    span with class "pr".

    raw: string of HTML to search.
    Returns "n/a" when no such span is found or when raw is not a string.
    '''
    try:
        # First 'span class="pr">...</' stretch, then the '>...<' piece inside it.
        prSpan = re.findall("span class=\"pr\">.+?</", raw)[0]
        phoMatch = re.findall(">.+?<", prSpan)[0]
    except (IndexError, TypeError):
        # IndexError: nothing matched. TypeError: raw was not a str.
        return "n/a"
    # Drop the leading '>' and the one character after it (presumably the
    # padding space before the transcription -- TODO confirm), plus the
    # trailing '<'.
    return phoMatch[2:-1]
def findPartOfSpeech(raw):
    '''
    Returns the part of speech named by the first "important-blue-link" anchor
    in raw. The value is taken from the anchor's href (the part that follows
    "/dictionary/"), not from the link text.

    raw: string of HTML to search.
    Returns "n/a" when no such anchor is found or when raw is not a string.
    '''
    # Everything in the anchor that precedes the part-of-speech name.
    linkPrefix = "<a class=\"important-blue-link\" href=\"/dictionary/"
    try:
        linkMatch = re.search(linkPrefix + ".+?\">.+?</a>", raw).group(0)
        # Trim the fixed prefix, leaving the href tail at the start.
        hrefTail = linkMatch[len(linkPrefix):]
        partMatch = re.search(".+?\"", hrefTail).group(0)
    except (AttributeError, TypeError):
        # AttributeError: re.search returned None. TypeError: raw was not a str.
        return "n/a"
    return partMatch[:-1] #trim trailing quote
'''
#MAIN ===============================================================
'''
#requested word(s): every line of a .txt file, or the single word given on the command line
if len(sys.argv) < 2:
    print("Usage: " + sys.argv[0] + " <word | words.txt>")
    sys.exit(1)

requestL = []
if sys.argv[1].endswith('.txt'):
    try:
        with open(sys.argv[1]) as file:
            for line in file:
                #strip only the newline, so a last line without one keeps its final letter
                requestL.append(line.rstrip('\n'))
    except OSError:
        print("File not found")
        sys.exit()
else:
    requestL = [sys.argv[1].lower()]

for request in requestL:
    #percent-encode the word (spaces, apostrophes, non-ASCII letters) for use in the URL path
    wordForURL = urllib.parse.quote(request)
    try:
        fp = urllib.request.urlopen("https://www.merriam-webster.com/dictionary/" + wordForURL)
    except Exception:
        #any failure to open the page is reported and skipped; Ctrl-C still propagates
        print('\'' + request + '\' INVALID URL')
        continue

    #extract html; the with-block closes the response even if read/decode fails
    with fp:
        htmlText = fp.read().decode("utf8")

    #split all htmlText by 'hword'
    #for each resulting text block, take first pr and word-syllables
    htmlText = htmlText.replace('\n', '')
    posDict = {} #part of speech -> [syllables, pronunciation]
    phoL = [] #keeps track of recorded pronunciations to avoid redundant entries
    for textBlock in htmlText.split("class=\"hword\"")[1:]:
        #ensure headword matches requested word. sometimes the url for words like "placed" may provide the page for the word "place"
        hwordMatch = re.search(">.+?<", textBlock)
        if hwordMatch is None or hwordMatch.group(0)[1:-1] != request:
            #an unreadable headword is treated like a mismatch
            break

        phoMatch = findPhos(textBlock)
        if phoMatch == 'n/a' or phoMatch in phoL or 'span class' in phoMatch:
            continue
        phoL.append(phoMatch)

        if '-' in phoMatch: #thus >1 syllables
            syllMatch = findSylls(textBlock)
        else:
            syllMatch = request

        #keep only the first entry recorded for each part of speech
        partMatch = findPartOfSpeech(textBlock)
        if partMatch != 'n/a' and partMatch not in posDict:
            posDict[partMatch] = [syllMatch, phoMatch]

    if not posDict:
        print(request + '\tN/A')
        continue

    #the part of speech is only shown when there is more than one entry to tell apart
    showPartOfSpeech = len(posDict) > 1
    for key, value in posDict.items():
        partOfSpeech = key if showPartOfSpeech else '_'
        print(request + '\t' + value[0] + '\t' + value[1] + '\t' + partOfSpeech)