1
+ {
2
+ "nbformat" : 4 ,
3
+ "nbformat_minor" : 0 ,
4
+ "metadata" : {
5
+ "colab" : {
6
+ "private_outputs" : true ,
7
+ "provenance" : [],
8
+ "collapsed_sections" : []
9
+ },
10
+ "kernelspec" : {
11
+ "name" : " python3" ,
12
+ "display_name" : " Python 3"
13
+ },
14
+ "language_info" : {
15
+ "name" : " python"
16
+ }
17
+ },
18
+ "cells" : [
19
+ {
20
+ "cell_type" : " markdown" ,
21
+ "source" : [
22
+ " install selenium\n " ,
23
+ " and\n " ,
24
+ " chrome driver (also define path)"
25
+ ],
26
+ "metadata" : {
27
+ "id" : " -lYAwjJwe4Oa"
28
+ }
29
+ },
30
+ {
31
+ "cell_type" : " code" ,
32
+ "source" : [
33
+ " !pip install selenium\n " ,
34
+ " !apt-get update # to update ubuntu to correctly run apt install\n " ,
35
+ " !apt install chromium-chromedriver\n " ,
36
+ " !cp /usr/lib/chromium-browser/chromedriver /usr/bin\n " ,
37
+ " import sys\n " ,
38
+ " sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')"
39
+ ],
40
+ "metadata" : {
41
+ "id" : " rjyqtRrYewxu"
42
+ },
43
+ "execution_count" : null ,
44
+ "outputs" : []
45
+ },
46
+ {
47
+ "cell_type" : " markdown" ,
48
+ "source" : [
49
+ " install tesseract"
50
+ ],
51
+ "metadata" : {
52
+ "id" : " FFYAh381exh4"
53
+ }
54
+ },
55
+ {
56
+ "cell_type" : " code" ,
57
+ "source" : [
58
+ " !sudo apt install tesseract-ocr\n " ,
59
+ " !pip install pytesseract"
60
+ ],
61
+ "metadata" : {
62
+ "id" : " 2_AwFU5SQ5lT"
63
+ },
64
+ "execution_count" : null ,
65
+ "outputs" : []
66
+ },
67
+ {
68
+ "cell_type" : " markdown" ,
69
+ "source" : [
70
+ " restart runtime"
71
+ ],
72
+ "metadata" : {
73
+ "id" : " MBT77b1SBIWq"
74
+ }
75
+ },
76
+ {
77
+ "cell_type" : " code" ,
78
+ "source" : [
79
+ " import os\n " ,
80
+ " os.kill(os.getpid(), 9)\n " ,
81
+ " #-----------OR-----------\n " ,
82
+ " # quit()\n " ,
83
+ " #-----------OR-----------\n " ,
84
+ " # exit()"
85
+ ],
86
+ "metadata" : {
87
+ "id" : " LQu1QserAnKc"
88
+ },
89
+ "execution_count" : null ,
90
+ "outputs" : []
91
+ },
92
+ {
93
+ "cell_type" : " markdown" ,
94
+ "source" : [
95
+ " import dependencies"
96
+ ],
97
+ "metadata" : {
98
+ "id" : " QQovnVXAfNbx"
99
+ }
100
+ },
101
+ {
102
+ "cell_type" : " code" ,
103
+ "source" : [
104
+ " from selenium import webdriver\n " ,
105
+ " from selenium.webdriver.common.by import By\n " ,
106
+ " from selenium.webdriver.common.keys import Keys\n " ,
107
+ " from selenium.webdriver.support.select import Select\n " ,
108
+ " from selenium.webdriver.chrome.service import Service\n " ,
109
+ " \n " ,
110
+ " import cv2\n " ,
111
+ " from PIL import Image, ImageCms, ImageFilter\n " ,
112
+ " import pytesseract\n " ,
113
+ " \n " ,
114
+ " import pandas as pd\n " ,
115
+ " import warnings\n " ,
116
+ " warnings.filterwarnings('ignore')"
117
+ ],
118
+ "metadata" : {
119
+ "id" : " RhvflwoJfMZZ"
120
+ },
121
+ "execution_count" : null ,
122
+ "outputs" : []
123
+ },
124
+ {
125
+ "cell_type" : " markdown" ,
126
+ "source" : [
127
+ " helper functions"
128
+ ],
129
+ "metadata" : {
130
+ "id" : " T2jcfxOqfdvk"
131
+ }
132
+ },
133
+ {
134
+ "cell_type" : " code" ,
135
+ "source" : [
136
+ " def step1():\n " ,
137
+ " # open webpage\n " ,
138
+ " driver.get(URL)\n " ,
139
+ " \n " ,
140
+ " # save captcha\n " ,
141
+ " imdata = driver.find_element(By.ID,\" imgCaptcha\" )\n " ,
142
+ " with open(path, 'wb') as file:\n " ,
143
+ " file.write(imdata.screenshot_as_png)\n " ,
144
+ " \n " ,
145
+ " def step2():\n " ,
146
+ " # convert to inverted mask and save img_temp\n " ,
147
+ " im = cv2.imread(path)\n " ,
148
+ " gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n " ,
149
+ " thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n " ,
150
+ " horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n " ,
151
+ " Mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n " ,
152
+ " #Mask = cv2.bitwise_not(Mask)\n " ,
153
+ " cv2.imwrite(\" old.png\" , Mask)\n " ,
154
+ " \n " ,
155
+ " # open img_temp and make the mask's black pixels transparent\n " ,
156
+ " img = Image.open(\" old.png\" )\n " ,
157
+ " img = img.convert(\" RGBA\" )\n " ,
158
+ " datas = img.getdata()\n " ,
159
+ " newData = []\n " ,
160
+ " for item in datas:\n " ,
161
+ " if item[0] == 0 and item[1] == 0 and item[2] == 0:\n " ,
162
+ " newData.append((255, 255, 255, 0))\n " ,
163
+ " else:\n " ,
164
+ " newData.append(item)\n " ,
165
+ " img.putdata(newData)\n " ,
166
+ " \n " ,
167
+ " # paste mask on img and save new_temp_img\n " ,
168
+ " background = Image.open(path)\n " ,
169
+ " background = background.convert(\" RGBA\" )\n " ,
170
+ " background.paste(img,mask=img)\n " ,
171
+ " background.save(\" new.png\" ,\" PNG\" )\n " ,
172
+ " \n " ,
173
+ " def step3(im): # solve captcha\n " ,
174
+ " im = Image.open(im) # open last saved img\n " ,
175
+ " im = im.crop((5,5,115,35)) # crop it\n " ,
176
+ " # convert image to extractable form elements (they differ between captcha styles)\n " ,
177
+ " rgb = ImageCms.createProfile(colorSpace='sRGB')\n " ,
178
+ " lab = ImageCms.createProfile(colorSpace='LAB')\n " ,
179
+ " transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n " ,
180
+ " lab_im = ImageCms.applyTransform(im=im, transform=transform)\n " ,
181
+ " l, a, b = lab_im.split()\n " ,
182
+ " im=l # select an element which is most extractable\n " ,
183
+ " im = im.filter(ImageFilter.MinFilter(3)) # filter it\n " ,
184
+ " result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n " ,
185
+ " l=[]\n " ,
186
+ " l.append(result.strip())\n " ,
187
+ " if l[0]==\" \" or l[0]==\"\" : # if the result is empty, repeat the steps above until a result is obtained\n " ,
188
+ " step1()\n " ,
189
+ " step2()\n " ,
190
+ " l[0]=step3(\" new.png\" )\n " ,
191
+ " return l[0] # return final result (maybe right or wrong)\n " ,
192
+ " \n " ,
193
+ " def step4(enroll,ans): # return data\n " ,
194
+ " # site automation \n " ,
195
+ " sel = Select (driver.find_element(By.ID,\" ddlbatch\" )) # focus on select element\n " ,
196
+ " sel.select_by_value(exam) # select the option by its value (specific to an exam)\n " ,
197
+ " enr = driver.find_element(By.ID,\" txtenroll\" ) # get enrollment no. text box\n " ,
198
+ " captex = driver.find_element(By.ID,\" CodeNumberTextBox\" ) # get captcha text box\n " ,
199
+ " enr.send_keys(enroll) # send (type) given enrollment number to text box\n " ,
200
+ " captex.send_keys(ans) # send (type) extracted captcha text to text box\n " ,
201
+ " captex.send_keys(Keys.RETURN) # return (ENTER)\n " ,
202
+ " \n " ,
203
+ " ere = driver.find_element(By.ID,\" lblmsg\" ).text\n " ,
204
+ " if ere == \" ERROR: Incorrect captcha code, try again.\" : \n " ,
205
+ " return \" err\"\n " ,
206
+ " if ere == \" Your request count is reached to maximum limit, Please try again later.\" : \n " ,
207
+ " return \" reqover\"\n " ,
208
+ " if ere == \" Oppssss! Data not available.\" : \n " ,
209
+ " return \" nodata\"\n " ,
210
+ " \n " ,
211
+ " name = driver.find_element(By.ID,\" lblName\" ).text\n " ,
212
+ " sess = driver.find_element(By.ID,\" lblSession\" ).text\n " ,
213
+ " dd = driver.find_element(By.ID,\" lblDeclaredOn\" ).text\n " ,
214
+ " bra = driver.find_element(By.ID,\" lblBranchName\" ).text\n " ,
215
+ " cs = driver.find_element(By.ID,\" lblExamName\" ).text\n " ,
216
+ " csb = driver.find_element(By.ID,\" lblCUPBack\" ).text\n " ,
217
+ " tb = driver.find_element(By.ID,\" lblTotalBack\" ).text\n " ,
218
+ " spi = driver.find_element(By.ID,\" lblSPI\" ).text\n " ,
219
+ " cpi = driver.find_element(By.ID,\" lblCPI\" ).text\n " ,
220
+ " cgpa = driver.find_element(By.ID,\" lblCGPA\" ).text\n " ,
221
+ " cp = driver.find_element(By.ID,\" pt100Curr\" ).text\n " ,
222
+ " cup = driver.find_element(By.ID,\" pt100Cuml\" ).text\n " ,
223
+ " return [enroll,name,sess,cs,dd,bra,int(csb),int(tb),float(spi),float(cpi),float(cgpa),int(cp),int(cup),ere]\n " ,
224
+ " \n " ,
225
+ " def loop():\n " ,
226
+ " # just a loop through different enrollment numbers\n " ,
227
+ " global counter\n " ,
228
+ " mynewlist = []\n " ,
229
+ " for i in mylist :\n " ,
230
+ " enroll = \" {}\" .format(i)\n " ,
231
+ " step1()\n " ,
232
+ " step2()\n " ,
233
+ " ans=step3(\" new.png\" )\n " ,
234
+ " nr=step4(enroll,ans)\n " ,
235
+ " if nr == \" err\" :\n " ,
236
+ " mynewlist.append(enroll)\n " ,
237
+ " elif nr == \" reqover\" :\n " ,
238
+ " print(\" Change the SERVER!\" )\n " ,
239
+ " break\n " ,
240
+ " elif nr == \" nodata\" :\n " ,
241
+ " df.loc[len(df)] = [enroll,\" nodata\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ,\" -\" ]\n " ,
242
+ " counter += 1\n " ,
243
+ " print(f\" {counter}/{tc} {int(counter*100/tc)}%\" )\n " ,
244
+ " else :\n " ,
245
+ " df.loc[len(df)] = nr\n " ,
246
+ " counter += 1\n " ,
247
+ " print(f\" {counter}/{tc} {int(counter*100/tc)}%\" )\n " ,
248
+ " return mynewlist"
249
+ ],
250
+ "metadata" : {
251
+ "id" : " t9H8OdFrRLBY"
252
+ },
253
+ "execution_count" : null ,
254
+ "outputs" : []
255
+ },
256
+ {
257
+ "cell_type" : " markdown" ,
258
+ "source" : [
259
+ " main function"
260
+ ],
261
+ "metadata" : {
262
+ "id" : " 5QIu2udZgKKq"
263
+ }
264
+ },
265
+ {
266
+ "cell_type" : " code" ,
267
+ "execution_count" : null ,
268
+ "metadata" : {
269
+ "id" : " E_BfboOxLMJd"
270
+ },
271
+ "outputs" : [],
272
+ "source" : [
273
+ " try:\n " ,
274
+ " \t # initiate webdriver and configure options\n " ,
275
+ " \t chrome_options = webdriver.ChromeOptions()\n " ,
276
+ " \t chrome_options.add_argument('--headless')\n " ,
277
+ " \t chrome_options.add_argument('--no-sandbox')\n " ,
278
+ " \t chrome_options.add_argument('--disable-dev-shm-usage')\n " ,
279
+ " \t chrome_options.add_argument(\" --incognito\" )\n " ,
280
+ " \n " ,
281
+ " \t ser = Service(\" chromedriver\" )\n " ,
282
+ " \t driver = webdriver.Chrome(service=ser,options=chrome_options)\n " ,
283
+ " \n " ,
284
+ " \t # define url and filename for download captcha_temp\n " ,
285
+ " \t URL = \" https://www.gturesults.in/\"\n " ,
286
+ " \t exam = \" 3361$S2022$2022-08-25$current$0\"\n " ,
287
+ " \t path=\" cap.jpg\"\n " ,
288
+ " \n " ,
289
+ " \t # create empty dataframe for filling output data with same labels that input file has\n " ,
290
+ " \t df = pd.read_json('{\" Enrollment No.\" :{},\" Name\" :{},\" Session\" :{},\" Exam\" :{},\" Declared On\" :{},\" Branch\" :{},\" Current Sem. Backlog\" :{},\" Total Backlog\" :{},\" SPI\" :{},\" CPI\" :{},\" CGPA\" :{},\" Current Points\" :{},\" Cumulative points\" :{},\" Message\" :{}}')\n " ,
291
+ " \n " ,
292
+ " \t mylist = range(190280111001,190280111010+1) # give the range of enrollment numbers (here I have given our batch's range)\n " ,
293
+ " \t\n " ,
294
+ " \t counter = 0\n " ,
295
+ " \t tc = len(mylist)\n " ,
296
+ " \n " ,
297
+ " \t # main driver program\n " ,
298
+ " \t # the loop runs until all data has been scraped, unless a server error occurs\n " ,
299
+ " \t while 1:\n " ,
300
+ " \t\t mynewlist=loop()\n " ,
301
+ " \t\t if len(mynewlist) != 0:\n " ,
302
+ " \t\t\t mylist = mynewlist\n " ,
303
+ " \t\t else:\n " ,
304
+ " \t\t\t break\n " ,
305
+ " \n " ,
306
+ " \t # save dataframe to excel file\n " ,
307
+ " \t df.to_excel(\" out.xlsx\" )\n " ,
308
+ " \n " ,
309
+ " finally:\n " ,
310
+ " \t driver.close() # close the window\n " ,
311
+ " \t driver.quit() # stop the driver\n " ,
312
+ " \t # remove unnecessary files\n " ,
313
+ " \t import os\n " ,
314
+ " \t os.remove(\" cap.jpg\" )\n " ,
315
+ " \t os.remove(\" old.png\" )\n " ,
316
+ " \t os.remove(\" new.png\" )"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type" : " markdown" ,
321
+ "source" : [
322
+ " 👈 download <font color='yellow'>out.xlsx</font> from the left sidebar by double-clicking it"
323
+ ],
324
+ "metadata" : {
325
+ "id" : " qgkULANw3ZQ2"
326
+ }
327
+ }
328
+ ]
329
+ }
0 commit comments