-
Notifications
You must be signed in to change notification settings - Fork 26
/
scrapeGita.py
89 lines (65 loc) · 2.79 KB
/
scrapeGita.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import bs4
import unicodedata
import codecs
import string
import re
import csv
def _find_paragraph(soup, element_id, url):
    """Return the first <p> inside the element with the given id.

    Raises:
        AttributeError: if the element or its <p> child is absent. This is the
            same exception type a bare ``soup.find(...).p.contents`` chain
            would raise, but with a message that names the element and page.
    """
    container = soup.find(id=element_id)
    if container is None:
        raise AttributeError("no element with id '" + element_id + "' on " + url)
    paragraph = container.p
    if paragraph is None:
        raise AttributeError("element '" + element_id + "' has no <p> on " + url)
    return paragraph


def getGita(chap, verse):
    """Download one verse page and extract its Sanskrit and English text.

    Args:
        chap: Chapter number; inserted into the page URL via ``str()``.
        verse: Verse number; inserted into the page URL via ``str()``.

    Returns:
        A tuple ``(mId, SA, EN)``:
        ``mId`` is the string ``"c:<chap>v<verse>"``;
        ``SA`` is the text of the first <p> under ``id="originalVerse"`` with
        ``<digits>||`` markers and all remaining digits removed;
        ``EN`` is the text of the first <p> under ``id="translation"`` with
        newlines removed.

    Raises:
        Exception: any error raised while fetching the page is re-raised
            unchanged after the failing URL has been printed.
        AttributeError: if the page lacks the expected elements.
    """
    quote_page = "https://www.holy-bhagavad-gita.org/chapter/" + str(chap) + "/verse/" + str(verse)
    # Send a browser-like User-Agent (presumably the site rejects urllib's
    # default one -- TODO confirm).
    req = Request(quote_page, headers={'User-Agent': 'Mozilla/5.0'})

    # Fetch the raw HTML; the `with` guarantees the HTTP response is closed.
    try:
        with urlopen(req) as response:
            page = response.read()
    except Exception:
        print("page that failed was: " + quote_page)
        raise  # bare raise keeps the original traceback intact

    soup = BeautifulSoup(page, 'html.parser')

    # ---- Sanskrit -----------------------------------------------------------
    sa_paragraph = _find_paragraph(soup, "originalVerse", quote_page)
    # Keep only plain text nodes. The exact-type test is deliberate:
    # isinstance() would also admit NavigableString subclasses such as HTML
    # comments, which were never part of the output.
    sa_strings = [
        str(node)
        for node in sa_paragraph.contents
        if type(node) is bs4.element.NavigableString
    ]
    # Drop "<digits>||" verse-number markers first, then any digits left over.
    SA = ''.join(re.sub(r"\d+\|\|", "", text) for text in sa_strings)
    SA = ''.join(ch for ch in SA if not ch.isdigit())

    # ---- English ------------------------------------------------------------
    en_paragraph = _find_paragraph(soup, "translation", quote_page)
    en_parts = []
    for node in en_paragraph.contents:
        markup = str(node)
        if type(node) is bs4.element.NavigableString:
            en_parts.append(markup)
        elif type(node) is bs4.element.Tag and "<i>" in markup:
            # Tags containing italics are kept with the <i> markers stripped;
            # every other tag is skipped.
            en_parts.append(markup.replace("</i>", "").replace("<i>", ""))
    EN = ''.join(part.replace('\n', '') for part in en_parts)

    mId = "c:" + str(chap) + "v" + str(verse)
    return (mId, SA, EN)
if __name__ == '__main__':
    # Number of verses in each chapter; entry k belongs to chapter k + 1.
    versesInChap = [46,72,43,42,29,47,30,28,34,42,55,20,35,27,20,24,28,78]
    # newline='' is what the csv module requires of its file objects (otherwise
    # blank rows appear on Windows); an explicit UTF-8 encoding keeps the
    # non-ASCII verse text from failing under a non-UTF-8 locale default.
    with open('data.csv', 'w', newline='', encoding='utf-8') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(["id","SA","EN"])
        for i, verse_count in enumerate(versesInChap, start=1):  # chapter
            for j in range(1, verse_count + 1):  # verse
                print("chap: " +str(i)+"verse: " +str(j))
                csv_out.writerow(getGita(i,j))