Skip to content

Commit 8791f5f

Browse files
committed
converted version to just run and download output (On Colab)
1 parent 3340658 commit 8791f5f

File tree

12 files changed

+670
-344
lines changed

12 files changed

+670
-344
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,9 @@ By using selenium and pytesseract ocr module we can scrap any Text Captcha Human
1313

1414
Used Pandas for managing data in tabular format.
1515

16-
Here i provided source codes for scrapping data for analytics.
16+
Here I have provided the source code for scraping the data, which is then used for analytics.
1717

18-
### youtube :
18+
### youtube (version 1) :
1919

2020
https://youtu.be/2nPUuaq4RRI (gturesults.in)
2121

gturesults.in/README.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
1-
# Note :
1+
# Result-Data-Analyzer
22

3-
To run this in your system
3+
## How to run :
44

5-
1> you have to install tesseract-ocr in your system
5+
1) Open the notebook file in Colab
6+
2) Hit CTRL+F9 (Run All)
7+
3) After the runtime restarts, select the first cell below the restart cell and resume with CTRL+F10 (Run After)
8+
4) Download the output file (out.xlsx)
69

7-
2> you have to download your browser's driver file
10+
#### Scrape, Analyze & Enjoy!
811

9-
here i use chrome browser in incognito mode so i downloaded chromedriver.exe
10-
11-
the driver is version specific
12-
13-
you have to define path for this driverfile in program
12+
#### Thank You!

gturesults.in/g.xlsx

-9.3 KB
Binary file not shown.

gturesults.in/out.xlsx

-20.4 KB
Binary file not shown.
Lines changed: 329 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,329 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"private_outputs": true,
7+
"provenance": [],
8+
"collapsed_sections": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
},
14+
"language_info": {
15+
"name": "python"
16+
}
17+
},
18+
"cells": [
19+
{
20+
"cell_type": "markdown",
21+
"source": [
22+
"install selenium\n",
23+
"and\n",
24+
"chrome driver (also define path)"
25+
],
26+
"metadata": {
27+
"id": "-lYAwjJwe4Oa"
28+
}
29+
},
30+
{
31+
"cell_type": "code",
32+
"source": [
33+
"!pip install selenium\n",
34+
"!apt-get update # to update ubuntu to correctly run apt install\n",
35+
"!apt install chromium-chromedriver\n",
36+
"!cp /usr/lib/chromium-browser/chromedriver /usr/bin\n",
37+
"import sys\n",
38+
"sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')"
39+
],
40+
"metadata": {
41+
"id": "rjyqtRrYewxu"
42+
},
43+
"execution_count": null,
44+
"outputs": []
45+
},
46+
{
47+
"cell_type": "markdown",
48+
"source": [
49+
"install tesseract"
50+
],
51+
"metadata": {
52+
"id": "FFYAh381exh4"
53+
}
54+
},
55+
{
56+
"cell_type": "code",
57+
"source": [
58+
"!sudo apt install tesseract-ocr\n",
59+
"!pip install pytesseract"
60+
],
61+
"metadata": {
62+
"id": "2_AwFU5SQ5lT"
63+
},
64+
"execution_count": null,
65+
"outputs": []
66+
},
67+
{
68+
"cell_type": "markdown",
69+
"source": [
70+
"restart runtime"
71+
],
72+
"metadata": {
73+
"id": "MBT77b1SBIWq"
74+
}
75+
},
76+
{
77+
"cell_type": "code",
78+
"source": [
79+
"import os\n",
80+
"os.kill(os.getpid(), 9)\n",
81+
"#-----------OR-----------\n",
82+
"# quit()\n",
83+
"#-----------OR-----------\n",
84+
"# exit()"
85+
],
86+
"metadata": {
87+
"id": "LQu1QserAnKc"
88+
},
89+
"execution_count": null,
90+
"outputs": []
91+
},
92+
{
93+
"cell_type": "markdown",
94+
"source": [
95+
"import dependencies"
96+
],
97+
"metadata": {
98+
"id": "QQovnVXAfNbx"
99+
}
100+
},
101+
{
102+
"cell_type": "code",
103+
"source": [
104+
"from selenium import webdriver\n",
105+
"from selenium.webdriver.common.by import By\n",
106+
"from selenium.webdriver.common.keys import Keys\n",
107+
"from selenium.webdriver.support.select import Select\n",
108+
"from selenium.webdriver.chrome.service import Service\n",
109+
"\n",
110+
"import cv2\n",
111+
"from PIL import Image, ImageCms, ImageFilter\n",
112+
"import pytesseract\n",
113+
"\n",
114+
"import pandas as pd\n",
115+
"import warnings\n",
116+
"warnings.filterwarnings('ignore')"
117+
],
118+
"metadata": {
119+
"id": "RhvflwoJfMZZ"
120+
},
121+
"execution_count": null,
122+
"outputs": []
123+
},
124+
{
125+
"cell_type": "markdown",
126+
"source": [
127+
"helper functions"
128+
],
129+
"metadata": {
130+
"id": "T2jcfxOqfdvk"
131+
}
132+
},
133+
{
134+
"cell_type": "code",
135+
"source": [
136+
"def step1():\n",
137+
" # open webpage\n",
138+
" driver.get(URL)\n",
139+
"\n",
140+
" # save captcha\n",
141+
" imdata = driver.find_element(By.ID,\"imgCaptcha\")\n",
142+
" with open(path, 'wb') as file:\n",
143+
" file.write(imdata.screenshot_as_png)\n",
144+
"\n",
145+
"def step2():\n",
146+
" # convert to inverted mask and save img_temp\n",
147+
" im = cv2.imread(path)\n",
148+
" gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)\n",
149+
" thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]\n",
150+
" horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 1))\n",
151+
" Mask = cv2.morphologyEx(thresh, cv2.MORPH_OPEN,horizontal_kernel, iterations=2)\n",
152+
" #Mask = cv2.bitwise_not(Mask)\n",
153+
" cv2.imwrite(\"old.png\", Mask)\n",
154+
"\n",
155+
" # open img_temp and reinvert mask\n",
156+
" img = Image.open(\"old.png\")\n",
157+
" img = img.convert(\"RGBA\")\n",
158+
" datas = img.getdata()\n",
159+
" newData = []\n",
160+
" for item in datas:\n",
161+
" if item[0] == 0 and item[1] == 0 and item[2] == 0:\n",
162+
" newData.append((255, 255, 255, 0))\n",
163+
" else:\n",
164+
" newData.append(item)\n",
165+
" img.putdata(newData)\n",
166+
"\n",
167+
" # paste mask on img and save new_temp_img\n",
168+
" background = Image.open(path)\n",
169+
" background = background.convert(\"RGBA\")\n",
170+
" background.paste(img,mask=img)\n",
171+
" background.save(\"new.png\",\"PNG\")\n",
172+
"\n",
173+
"def step3(im): # solve captcha\n",
174+
" im = Image.open(im) # open last saved img\n",
175+
" im = im.crop((5,5,115,35)) # crop it\n",
176+
"  # convert the image to LAB channels so a readable one can be picked (captcha styles differ)\n",
177+
" rgb = ImageCms.createProfile(colorSpace='sRGB')\n",
178+
" lab = ImageCms.createProfile(colorSpace='LAB')\n",
179+
" transform = ImageCms.buildTransform(inputProfile=rgb, outputProfile=lab, inMode='RGB', outMode='LAB')\n",
180+
" lab_im = ImageCms.applyTransform(im=im, transform=transform)\n",
181+
" l, a, b = lab_im.split()\n",
182+
" im=l # select an element which is most extractable\n",
183+
" im = im.filter(ImageFilter.MinFilter(3)) # filter it\n",
184+
" result = pytesseract.image_to_string(im) # send it to ocr and save results to a variable\n",
185+
" l=[]\n",
186+
" l.append(result.strip())\n",
187+
" if l[0]==\" \" or l[0]==\"\" : # if result will be empty then it will do above steps again untill it gets the result\n",
188+
" step1()\n",
189+
" step2()\n",
190+
" l[0]=step3(\"new.png\")\n",
191+
" return l[0] # return final result (maybe right or wrong)\n",
192+
"\n",
193+
"def step4(enroll,ans): # return data\n",
194+
" # site automation \n",
195+
" sel = Select (driver.find_element(By.ID,\"ddlbatch\")) # focus on select element\n",
196+
" sel.select_by_value(exam) # select element by giving id (specific for a exam)\n",
197+
" enr = driver.find_element(By.ID,\"txtenroll\") # get enrollment no. text box\n",
198+
" captex = driver.find_element(By.ID,\"CodeNumberTextBox\") # get captcha text box\n",
199+
" enr.send_keys(enroll) # send (type) given enrollment number to text box\n",
200+
" captex.send_keys(ans) # send (type) extracted captcha text to text box\n",
201+
" captex.send_keys(Keys.RETURN) # return (ENTER)\n",
202+
"\n",
203+
" ere = driver.find_element(By.ID,\"lblmsg\").text\n",
204+
" if ere == \"ERROR: Incorrect captcha code, try again.\" : \n",
205+
" return \"err\"\n",
206+
" if ere == \"Your request count is reached to maximum limit, Please try again later.\" : \n",
207+
" return \"reqover\"\n",
208+
" if ere == \"Oppssss! Data not available.\" : \n",
209+
" return \"nodata\"\n",
210+
"\n",
211+
" name = driver.find_element(By.ID,\"lblName\").text\n",
212+
" sess = driver.find_element(By.ID,\"lblSession\").text\n",
213+
" dd = driver.find_element(By.ID,\"lblDeclaredOn\").text\n",
214+
" bra = driver.find_element(By.ID,\"lblBranchName\").text\n",
215+
" cs = driver.find_element(By.ID,\"lblExamName\").text\n",
216+
" csb = driver.find_element(By.ID,\"lblCUPBack\").text\n",
217+
" tb = driver.find_element(By.ID,\"lblTotalBack\").text\n",
218+
" spi = driver.find_element(By.ID,\"lblSPI\").text\n",
219+
" cpi = driver.find_element(By.ID,\"lblCPI\").text\n",
220+
" cgpa = driver.find_element(By.ID,\"lblCGPA\").text\n",
221+
" cp = driver.find_element(By.ID,\"pt100Curr\").text\n",
222+
" cup = driver.find_element(By.ID,\"pt100Cuml\").text\n",
223+
" return [enroll,name,sess,cs,dd,bra,int(csb),int(tb),float(spi),float(cpi),float(cgpa),int(cp),int(cup),ere]\n",
224+
"\n",
225+
"def loop():\n",
226+
" # just a loop through different enrollment numbers\n",
227+
" global counter\n",
228+
" mynewlist = []\n",
229+
" for i in mylist :\n",
230+
" enroll = \"{}\".format(i)\n",
231+
" step1()\n",
232+
" step2()\n",
233+
" ans=step3(\"new.png\")\n",
234+
" nr=step4(enroll,ans)\n",
235+
" if nr == \"err\" :\n",
236+
" mynewlist.append(enroll)\n",
237+
" elif nr == \"reqover\" :\n",
238+
" print(\"Change the SERVER!\")\n",
239+
" break\n",
240+
" elif nr == \"nodata\" :\n",
241+
" df.loc[len(df)] = [enroll,\"nodata\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\",\"-\"]\n",
242+
" counter += 1\n",
243+
" print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
244+
" else :\n",
245+
" df.loc[len(df)] = nr\n",
246+
" counter += 1\n",
247+
" print(f\"{counter}/{tc} {int(counter*100/tc)}%\")\n",
248+
" return mynewlist"
249+
],
250+
"metadata": {
251+
"id": "t9H8OdFrRLBY"
252+
},
253+
"execution_count": null,
254+
"outputs": []
255+
},
256+
{
257+
"cell_type": "markdown",
258+
"source": [
259+
"main function"
260+
],
261+
"metadata": {
262+
"id": "5QIu2udZgKKq"
263+
}
264+
},
265+
{
266+
"cell_type": "code",
267+
"execution_count": null,
268+
"metadata": {
269+
"id": "E_BfboOxLMJd"
270+
},
271+
"outputs": [],
272+
"source": [
273+
"try:\n",
274+
"\t# initiate webdriver and configure options\n",
275+
"\tchrome_options = webdriver.ChromeOptions()\n",
276+
"\tchrome_options.add_argument('--headless')\n",
277+
"\tchrome_options.add_argument('--no-sandbox')\n",
278+
"\tchrome_options.add_argument('--disable-dev-shm-usage')\n",
279+
"\tchrome_options.add_argument(\"--incognito\")\n",
280+
"\n",
281+
"\tser = Service(\"chromedriver\")\n",
282+
"\tdriver = webdriver.Chrome(service=ser,options=chrome_options)\n",
283+
"\n",
284+
"\t# define url and filename for download captcha_temp\n",
285+
"\tURL = \"https://www.gturesults.in/\"\n",
286+
"\texam = \"3361$S2022$2022-08-25$current$0\"\n",
287+
"\tpath=\"cap.jpg\"\n",
288+
" \n",
289+
"\t# create empty dataframe for filling output data with same labels that input file has\n",
290+
"\tdf = pd.read_json('{\"Enrollment No.\":{},\"Name\":{},\"Session\":{},\"Exam\":{},\"Declared On\":{},\"Branch\":{},\"Current Sem. Backlog\":{},\"Total Backlog\":{},\"SPI\":{},\"CPI\":{},\"CGPA\":{},\"Current Points\":{},\"Cumulative points\":{},\"Message\":{}}')\n",
291+
"\n",
292+
"\tmylist = range(190280111001,190280111010+1) # give range of enrollment no. (here i given our batch's range)\n",
293+
"\t\n",
294+
"\tcounter = 0\n",
295+
"\ttc = len(mylist)\n",
296+
" \n",
297+
"\t# main driver program\n",
298+
"\t# re-run loop() on the enrollment numbers it returns for retry until none remain\n",
299+
"\twhile 1:\n",
300+
"\t\tmynewlist=loop()\n",
301+
"\t\tif len(mynewlist) != 0:\n",
302+
"\t\t\tmylist = mynewlist\n",
303+
"\t\telse:\n",
304+
"\t\t\tbreak\n",
305+
"\n",
306+
"\t# save dataframe to excel file\n",
307+
"\tdf.to_excel(\"out.xlsx\")\n",
308+
"\n",
309+
"finally:\n",
310+
"\tdriver.close() # close the window\n",
311+
"\tdriver.quit() # stop the driver\n",
312+
"\t# remove unnecessary files\n",
313+
"\timport os\n",
314+
"\tos.remove(\"cap.jpg\")\n",
315+
"\tos.remove(\"old.png\")\n",
316+
"\tos.remove(\"new.png\")"
317+
]
318+
},
319+
{
320+
"cell_type": "markdown",
321+
"source": [
322+
"👈 download <font color='yellow'>out.xlsx</font> from left side bar by double clicking it"
323+
],
324+
"metadata": {
325+
"id": "qgkULANw3ZQ2"
326+
}
327+
}
328+
]
329+
}

0 commit comments

Comments
 (0)