This repository was archived by the owner on Apr 24, 2025. It is now read-only.

Commit 0ab9e85

Merge pull request #1 from ogdch/62-pagination-rdf-harvester
62 pagination rdf harvester
2 parents e5e727e + 1b3a30c commit 0ab9e85

File tree: 4 files changed, +134 -54 lines


ckanext/dcat/harvesters/base.py

+5 -5

@@ -96,17 +96,17 @@ def _get_content_and_type(self, url, harvest_job, page=1, content_type=None):
                 # We want to catch these ones later on
                 raise
 
-            msg = 'Could not get content. Server responded with %s %s' % (
-                error.response.status_code, error.response.reason)
+            msg = 'Could not get content from %s. Server responded with %s %s' % (
+                url, error.response.status_code, error.response.reason)
             self._save_gather_error(msg, harvest_job)
             return None, None
         except requests.exceptions.ConnectionError, error:
-            msg = '''Could not get content because a
-                connection error occurred. %s''' % error
+            msg = '''Could not get content from %s because a
+                connection error occurred. %s''' % (url, error)
             self._save_gather_error(msg, harvest_job)
             return None, None
         except requests.exceptions.Timeout, error:
-            msg = 'Could not get content because the connection timed out.'
+            msg = 'Could not get content from %s because the connection timed out.' % url
             self._save_gather_error(msg, harvest_job)
             return None, None
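
For context, all three error messages now interpolate the URL that failed. A quick illustration of the resulting text, with made-up values:

    # Hypothetical values, only to show the new message format
    url = 'http://example.com/catalog.xml'
    status_code, reason = 404, 'Not Found'
    msg = 'Could not get content from %s. Server responded with %s %s' % (
        url, status_code, reason)
    # -> Could not get content from http://example.com/catalog.xml. Server responded with 404 Not Found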

ckanext/dcat/harvesters/rdf.py

+55 -49

@@ -148,71 +148,77 @@ def gather_stage(self, harvest_job):
 
         log.debug('In DCATRDFHarvester gather_stage')
 
-        # Get file contents
-        url = harvest_job.source.url
+        rdf_format = None
+        if harvest_job.source.config:
+            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")
 
-        for harvester in p.PluginImplementations(IDCATRDFHarvester):
-            url, before_download_errors = harvester.before_download(url, harvest_job)
+        # Get file contents of first page
+        next_page_url = harvest_job.source.url
 
-            for error_msg in before_download_errors:
-                self._save_gather_error(error_msg, harvest_job)
+        guids_in_source = []
+        object_ids = []
 
-        if not url:
-            return False
+        while next_page_url:
+            for harvester in p.PluginImplementations(IDCATRDFHarvester):
+                next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)
 
-        rdf_format = None
-        if harvest_job.source.config:
-            rdf_format = json.loads(harvest_job.source.config).get("rdf_format")
-        content, rdf_format = self._get_content_and_type(url, harvest_job, 1, content_type=rdf_format)
+                for error_msg in before_download_errors:
+                    self._save_gather_error(error_msg, harvest_job)
 
-        # TODO: store content?
-        for harvester in p.PluginImplementations(IDCATRDFHarvester):
-            content, after_download_errors = harvester.after_download(content, harvest_job)
+            if not next_page_url:
+                return []
 
-            for error_msg in after_download_errors:
-                self._save_gather_error(error_msg, harvest_job)
+            content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)
 
-        if not content:
-            return False
+            # TODO: store content?
+            for harvester in p.PluginImplementations(IDCATRDFHarvester):
+                content, after_download_errors = harvester.after_download(content, harvest_job)
 
-        # TODO: profiles conf
-        parser = RDFParser()
+                for error_msg in after_download_errors:
+                    self._save_gather_error(error_msg, harvest_job)
 
-        try:
-            parser.parse(content, _format=rdf_format)
-        except RDFParserException, e:
-            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
-            return False
+            if not content:
+                return []
 
-        guids_in_source = []
-        object_ids = []
-        for dataset in parser.datasets():
-            if not dataset.get('name'):
-                dataset['name'] = self._gen_new_name(dataset['title'])
+            # TODO: profiles conf
+            parser = RDFParser()
 
-            # Unless already set by the parser, get the owner organization (if any)
-            # from the harvest source dataset
-            if not dataset.get('owner_org'):
-                source_dataset = model.Package.get(harvest_job.source.id)
-                if source_dataset.owner_org:
-                    dataset['owner_org'] = source_dataset.owner_org
+            try:
+                parser.parse(content, _format=rdf_format)
+            except RDFParserException, e:
+                self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
+                return []
 
-            # Try to get a unique identifier for the harvested dataset
-            guid = self._get_guid(dataset)
+            for dataset in parser.datasets():
+                if not dataset.get('name'):
+                    dataset['name'] = self._gen_new_name(dataset['title'])
 
-            if not guid:
-                self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
-                                        harvest_job)
-                continue
+                # Unless already set by the parser, get the owner organization (if any)
+                # from the harvest source dataset
+                if not dataset.get('owner_org'):
+                    source_dataset = model.Package.get(harvest_job.source.id)
+                    if source_dataset.owner_org:
+                        dataset['owner_org'] = source_dataset.owner_org
 
-            dataset['extras'].append({'key': 'guid', 'value': guid})
-            guids_in_source.append(guid)
+                # Try to get a unique identifier for the harvested dataset
+                guid = self._get_guid(dataset)
 
-            obj = HarvestObject(guid=guid, job=harvest_job,
-                                content=json.dumps(dataset))
+                if not guid:
+                    self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset),
+                                            harvest_job)
+                    continue
 
-            obj.save()
-            object_ids.append(obj.id)
+                dataset['extras'].append({'key': 'guid', 'value': guid})
+                guids_in_source.append(guid)
+
+                obj = HarvestObject(guid=guid, job=harvest_job,
+                                    content=json.dumps(dataset))
+
+                obj.save()
+                object_ids.append(obj.id)
+
+            # get the next page
+            next_page_url = parser.next_page()
 
         # Check if some datasets need to be deleted
         object_ids_to_delete = self._mark_datasets_for_deletion(guids_in_source, harvest_job)
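
Taken together, this hunk turns the single download-and-parse pass into a loop over hydra-paginated catalogs. A stripped-down sketch of the new control flow (plugin hooks, error handling, and HarvestObject bookkeeping omitted; fetch is a stand-in for self._get_content_and_type):

    from ckanext.dcat.processors import RDFParser

    def gather_all_pages(source_url, fetch, rdf_format=None):
        # Follow hydra:nextPage links until the last page is reached
        datasets = []
        next_page_url = source_url
        while next_page_url:
            content, rdf_format = fetch(next_page_url, rdf_format)
            if not content:
                break
            parser = RDFParser()
            parser.parse(content, _format=rdf_format)
            datasets.extend(parser.datasets())
            # next_page() returns None on the last page, ending the loop
            next_page_url = parser.next_page()
        return datasets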

ckanext/dcat/processors.py

+10

@@ -114,6 +114,16 @@ def _datasets(self):
         for dataset in self.g.subjects(RDF.type, DCAT.Dataset):
             yield dataset
 
+    def next_page(self):
+        '''
+        Returns the URL of the next page or None if there is no next page
+        '''
+        for pagination_node in self.g.subjects(RDF.type, HYDRA.PagedCollection):
+            for o in self.g.objects(pagination_node, HYDRA.nextPage):
+                return unicode(o)
+        return None
+
+
     def parse(self, data, _format=None):
         '''
         Parses and RDF graph serialization and into the class graph
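
The HYDRA constant used by next_page() is not defined in this hunk; it is presumably an rdflib Namespace bound to http://www.w3.org/ns/hydra/core#, matching the xmlns:hydra prefix in the tests below. A self-contained sketch of the same lookup with plain rdflib (Python 2, like the rest of the codebase):

    from rdflib import Namespace
    from rdflib.namespace import RDF

    # Assumed definition of the HYDRA namespace used in processors.py
    HYDRA = Namespace('http://www.w3.org/ns/hydra/core#')

    def next_page(g):
        # Return the hydra:nextPage of the first hydra:PagedCollection
        # node in the graph, or None if there is no pagination info
        for pagination_node in g.subjects(RDF.type, HYDRA.PagedCollection):
            for o in g.objects(pagination_node, HYDRA.nextPage):
                return unicode(o)
        return None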

ckanext/dcat/tests/test_base_parser.py

+64

@@ -139,6 +139,70 @@ def test_parse_data(self):
 
         eq_(len(p.g), 2)
 
+    def test_parse_pagination_next_page(self):
+
+        data = '''<?xml version="1.0" encoding="utf-8" ?>
+        <rdf:RDF
+         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
+        <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=1">
+            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
+            <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
+            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
+            <hydra:nextPage>http://example.com/catalog.xml?page=2</hydra:nextPage>
+            <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
+        </hydra:PagedCollection>
+        </rdf:RDF>
+        '''
+
+        p = RDFParser()
+
+        p.parse(data)
+
+        eq_(p.next_page(), 'http://example.com/catalog.xml?page=2')
+
+    def test_parse_without_pagination(self):
+
+        data = '''<?xml version="1.0" encoding="utf-8" ?>
+        <rdf:RDF
+         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#">
+        <rdfs:SomeClass rdf:about="http://example.org">
+            <rdfs:label>Some label</rdfs:label>
+        </rdfs:SomeClass>
+        </rdf:RDF>
+        '''
+
+        p = RDFParser()
+
+        p.parse(data)
+
+        eq_(p.next_page(), None)
+
+    def test_parse_pagination_last_page(self):
+
+        data = '''<?xml version="1.0" encoding="utf-8" ?>
+        <rdf:RDF
+         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
+         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
+        <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3">
+            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems>
+            <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage>
+            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage>
+            <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage>
+            <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage>
+        </hydra:PagedCollection>
+        </rdf:RDF>
+        '''
+
+        p = RDFParser()
+
+        p.parse(data)
+
+        eq_(p.next_page(), None)
+
     def test_parse_data_different_format(self):
 
         data = '''
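
The three tests cover the full pagination matrix: a page with a hydra:nextPage link, a document with no hydra markup at all, and a last page whose PagedCollection carries no nextPage. eq_ here is nose.tools.eq_, as elsewhere in this module, so in a ckanext-dcat development install the file can presumably be run with something like nosetests ckanext/dcat/tests/test_base_parser.py.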

Comments (0)