@@ -148,71 +148,77 @@ def gather_stage(self, harvest_job):

        log.debug('In DCATRDFHarvester gather_stage')

-        # Get file contents
-        url = harvest_job.source.url
+        rdf_format = None
+        if harvest_job.source.config:
+            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

-        for harvester in p.PluginImplementations(IDCATRDFHarvester):
-            url, before_download_errors = harvester.before_download(url, harvest_job)
+        # Get file contents of first page
+        next_page_url = harvest_job.source.url

-            for error_msg in before_download_errors:
-                self._save_gather_error(error_msg, harvest_job)
+        guids_in_source = []
+        object_ids = []

-        if not url:
-            return False
+        while next_page_url:
+            for harvester in p.PluginImplementations(IDCATRDFHarvester):
+                next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)

-        rdf_format = None
-        if harvest_job.source.config:
-            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")
-        content, rdf_format = self._get_content_and_type(url, harvest_job, 1, content_type=rdf_format)
+                for error_msg in before_download_errors:
+                    self._save_gather_error(error_msg, harvest_job)

-        # TODO: store content?
-        for harvester in p.PluginImplementations(IDCATRDFHarvester):
-            content, after_download_errors = harvester.after_download(content, harvest_job)
+            if not next_page_url:
+                return []

-            for error_msg in after_download_errors:
-                self._save_gather_error(error_msg, harvest_job)
+            content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)

-        if not content:
-            return False
+            # TODO: store content?
+            for harvester in p.PluginImplementations(IDCATRDFHarvester):
+                content, after_download_errors = harvester.after_download(content, harvest_job)

-        # TODO: profiles conf
-        parser = RDFParser()
+                for error_msg in after_download_errors:
+                    self._save_gather_error(error_msg, harvest_job)

-        try:
-            parser.parse(content, _format=rdf_format)
-        except RDFParserException, e:
-            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
-            return False
+            if not content:
+                return []

-        guids_in_source = []
-        object_ids = []
-        for dataset in parser.datasets():
-            if not dataset.get('name'):
-                dataset['name'] = self._gen_new_name(dataset['title'])
+            # TODO: profiles conf
+            parser = RDFParser()

-            # Unless already set by the parser, get the owner organization (if any)
-            # from the harvest source dataset
-            if not dataset.get('owner_org'):
-                source_dataset = model.Package.get(harvest_job.source.id)
-                if source_dataset.owner_org:
-                    dataset['owner_org'] = source_dataset.owner_org
+            try:
+                parser.parse(content, _format=rdf_format)
+            except RDFParserException, e:
+                self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
+                return []

-            # Try to get a unique identifier for the harvested dataset
-            guid = self._get_guid(dataset)
+            for dataset in parser.datasets():
+                if not dataset.get('name'):
+                    dataset['name'] = self._gen_new_name(dataset['title'])

-            if not guid:
-                self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
-                                        harvest_job)
-                continue
+                # Unless already set by the parser, get the owner organization (if any)
+                # from the harvest source dataset
+                if not dataset.get('owner_org'):
+                    source_dataset = model.Package.get(harvest_job.source.id)
+                    if source_dataset.owner_org:
+                        dataset['owner_org'] = source_dataset.owner_org

-            dataset['extras'].append({'key': 'guid', 'value': guid})
-            guids_in_source.append(guid)
+                # Try to get a unique identifier for the harvested dataset
+                guid = self._get_guid(dataset)

-            obj = HarvestObject(guid=guid, job=harvest_job,
-                                content=json.dumps(dataset))
+                if not guid:
+                    self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
+                                            harvest_job)
+                    continue

-            obj.save()
-            object_ids.append(obj.id)
+                dataset['extras'].append({'key': 'guid', 'value': guid})
+                guids_in_source.append(guid)
+
+                obj = HarvestObject(guid=guid, job=harvest_job,
+                                    content=json.dumps(dataset))
+
+                obj.save()
+                object_ids.append(obj.id)
+
+            # get the next page
+            next_page_url = parser.next_page()

        # Check if some datasets need to be deleted
        object_ids_to_delete = self._mark_datasets_for_deletion(guids_in_source, harvest_job)
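
The core change: gathering now runs in a `while next_page_url:` loop, parsing one page per iteration and asking the parser for the next URL at the bottom of the loop. The early exits also now return [] instead of False, so gather_stage keeps a list return type on every path. parser.next_page() itself is not part of this hunk; the sketch below is only an illustration of how such a method could work, assuming the remote catalog advertises pagination with the Hydra vocabulary (hydra:PagedCollection / hydra:nextPage). The class name PagedRDFParser and the method body are assumptions, not necessarily the shipped implementation.

from rdflib import Graph, Namespace
from rdflib.namespace import RDF

HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')


class PagedRDFParser(object):
    # Bare-bones stand-in for the RDFParser used in the hunk above.

    def __init__(self):
        self.g = Graph()

    def parse(self, content, _format=None):
        self.g.parse(data=content, format=_format or 'xml')

    def next_page(self):
        # Find a hydra:PagedCollection node in the current page's graph
        # and return its hydra:nextPage URL; returning None ends the
        # while loop in gather_stage.
        for collection in self.g.subjects(RDF.type, HYDRA.PagedCollection):
            for next_url in self.g.objects(collection, HYDRA.nextPage):
                return str(next_url)
        return None

Since gather_stage calls next_page() after every page, a catalog that simply omits hydra:nextPage on its last page terminates the loop cleanly.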
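Also moved up in this hunk: the optional rdf_format key is now read from the harvest source config once, before the loop, and passed to _get_content_and_type() for every page instead of just the first. The source config is a JSON string; a hypothetical config pinning the serialization to Turtle would be parsed exactly the way the hunk does it:

import json

# Hypothetical harvest source configuration, as it would be stored in
# harvest_job.source.config (a JSON string); "rdf_format" is the only
# key this hunk reads from it.
config = '{"rdf_format": "text/turtle"}'

rdf_format = None
if config:
    rdf_format = json.loads(config).get("rdf_format")

print(rdf_format)  # text/turtle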
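Because the IDCATRDFHarvester hooks now sit inside the while loop, before_download and after_download fire once per page rather than once per job. A minimal sketch of a plugin implementing the interface, assuming it is importable from ckanext.dcat.interfaces (the import path is an assumption; the (value, errors) return shape matches how this hunk consumes the hooks):

import ckan.plugins as p
from ckanext.dcat.interfaces import IDCATRDFHarvester  # assumed path


class ExampleDCATHarvesterPlugin(p.SingletonPlugin):
    p.implements(IDCATRDFHarvester)

    def before_download(self, url, harvest_job):
        # Return a (possibly rewritten) URL plus a list of error messages.
        # Returning a falsy URL makes gather_stage bail out with [].
        if url and not url.startswith('https://'):
            return None, ['Refusing non-HTTPS source: {0}'.format(url)]
        return url, []

    def after_download(self, content, harvest_job):
        # Return the (possibly transformed) page content plus error
        # messages; empty content likewise makes gather_stage return [].
        return content, []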