1
import json
import os
import urllib.parse
import urllib.request
4
+
5
# Directory containing this script, with a trailing separator; used as the
# prefix for cache files written by get_json(mode='save').
path = os.path.dirname(__file__) + '/'

GCS_KEY = 'xxx'            # Google Custom Search API key (placeholder)
GCS_CX = 'zzz'             # Custom Search Engine identifier (cx, placeholder)
gcsMaxResults = 30         # upper bound on fetched results; keep < 100 and a multiple of 10
filename = 'gcsRes.json'   # default file name for the cached raw JSON response
11
+
12
def create_url(gcs_req, start=1):
    """Build a Google Custom Search API request URL.

    args:
        gcs_req - raw (unescaped) search request string
        start - 1-based index of the first result to request
    return:
        full request URL string
    """
    # Percent-encode the whole query, not just spaces: the original only
    # replaced ' ' with '%20', so characters such as '&', '#' or '+' in the
    # request would corrupt the query string. quote() still maps ' ' to '%20'.
    escaped = urllib.parse.quote(gcs_req, safe='')
    return ('https://www.googleapis.com/customsearch/v1?key=' + GCS_KEY +
            '&cx=' + GCS_CX + '&num=10&start=' + str(start) + '&q=' + escaped)
15
+
16
def get_url(url):
    """Fetch *url* and return the response body as bytes.

    On failure, returns one of the sentinel strings 'connect error'
    (HTTP-level error) or 'url error' (unreachable host / malformed URL);
    callers compare against these strings explicitly.
    """
    s = 'error'
    try:
        # Context manager closes the response even if read() raises — the
        # original never closed it, leaking the underlying socket.
        with urllib.request.urlopen(url) as f:
            s = f.read()
    except urllib.error.HTTPError:
        s = 'connect error'
    except urllib.error.URLError:
        s = 'url error'
    return s
26
+
27
def get_json(gcs_req, mode='url', start=1, filename='gcsRes.json'):
    """Get a Google Custom Search response from the web or from a cached file.

    args:
        gcs_req - search request
        mode - 'url' - fetch from the API;
               'save' - fetch from the API and cache the raw bytes to *filename*;
               'file' - load a previously saved response from *filename*
        start - 1-based index of the first result to request
        filename - cache file name, stored under the script directory *path*
    return:
        'url'/'save' modes: raw JSON response as bytes;
        'file' mode: the parsed response as a dict
        (NOTE(review): the asymmetric return types are preserved for
        backward compatibility with existing callers)
    """
    if mode == 'url' or mode == 'save':
        req_url = create_url(gcs_req, start)
        page = get_url(req_url)
        if page == 'connect error' or page == 'url error':
            # Wrap the error sentinel in a minimal JSON document so callers
            # can still json.loads() the result and inspect its 'kind' field.
            page = ('{ "kind" : "' + page + '" }').encode('utf-8')
        if mode == 'save':
            with open(path + filename, 'wb') as f:
                f.write(page)
        return page
    elif mode == 'file':
        # Read from the same directory that mode='save' writes to; the
        # original opened bare *filename* relative to the current working
        # directory, so a saved cache could not be read back reliably.
        with open(path + filename, 'rb') as f:
            return json.loads(f.read())
48
+
49
def is_next_page(dict):
    """Return True when the response advertises a 'nextPage' query block."""
    return dict['queries'].get('nextPage') is not None
52
+
53
def total_results(dict):
    """Total number of hits reported by the API, converted to int."""
    count = dict['searchInformation']['totalResults']
    return int(count)
55
+
56
def next_page_start(dict):
    """1-based index of the first result on the next page."""
    next_page = dict['queries']['nextPage']
    return next_page[0]['startIndex']
58
+
59
def search_request(dict):
    """Search terms echoed back by the API for this request."""
    request = dict['queries']['request']
    return request[0]['searchTerms']
61
+
62
def page_links(dict):
    """Collect the 'link' field of every result item on this page."""
    return [result['link'] for result in dict['items']]
67
+
68
def all_links(gcs_req, show_info=False):
    """Collect result links for *gcs_req* across pages, up to gcsMaxResults.

    args:
        gcs_req - search request string
        show_info - when True, print the echoed request and running link count
    return:
        list of result URLs (possibly empty on error or when there are no hits)
    """
    links = []
    res_start = 1
    while True:
        gcs_dict = json.loads(get_json(gcs_req, 'save', res_start))
        # get_url() reports failures via sentinel 'kind' values. Stop paging
        # on either of them: the original only checked 'connect error', so a
        # 'url error' response fell through and raised KeyError on the
        # missing 'searchInformation'/'queries' fields.
        if gcs_dict['kind'] in ('connect error', 'url error'):
            break
        if total_results(gcs_dict) > 0:
            links += page_links(gcs_dict)
        if show_info == True:
            print('Request: ' + search_request(gcs_dict) + ' Results: ' + str(len(links)))
        # Stop when there is no further page or the next page would start
        # beyond the configured result cap.
        if not is_next_page(gcs_dict) or next_page_start(gcs_dict) >= gcsMaxResults:
            break
        res_start = next_page_start(gcs_dict)
    return links
0 commit comments