-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathmapper.py
1727 lines (1357 loc) · 76.5 KB
/
mapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# -*- coding: utf-8 -*-
'''
#########
Mapper
#########
* This module contains all the core methods/functions that are used for RDF-triple generation.
* This module mainly consists of the mapping functions that are used by the extractors based on \
rules present in ``setting.json``, ``mapping_rules.py`` and ``custom_mappers.json``.
* This module also contains the extractors and some other helper functions that are used by the mapping \
functions to generate triples.
'''
import urllib2
import json
import re
import rdflib
import utilities
import sys
import time
from mapping_rules import *
# Namespaces used to build the subjects, predicates and objects of the extracted triples.
# NOTE: ``dbr`` is rebound by ``select_mapping`` to the language-specific DBpedia resource
# namespace whenever a non-English resource is processed.
dbo = rdflib.Namespace("http://dbpedia.org/ontology/")
dbr = rdflib.Namespace("http://dbpedia.org/resource/")
rdf = rdflib.Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
mapped_domains = [] # domains already processed by select_mapping; used to prevent duplicate mappings
resource_class = ""  # class of the resource most recently handled by select_mapping
# Mapping rules (resource class -> list of domains) and the user-defined mapper settings used by the
# mapper functions. Both start empty and are loaded through ``utilities`` on the first call to select_mapping.
MAPPING = dict()
CUSTOM_MAPPERS = dict()
def select_mapping(resDict, res, lang, res_class, g):
    ''' Calls mapping functions for each matching section of the resource, thus constructing the associated RDF graph.

    * Loads ``MAPPING`` and ``CUSTOM_MAPPERS`` through ``utilities`` the first time it is called.
    * Selects the domains to apply from ``MAPPING`` based on the resource class.
    * For every domain not processed yet, the section-title keywords are taken either from the module-level
      rule dictionary named after the domain (pre-defined mappers) or from
      ``CUSTOM_MAPPERS[domain]["headers"]`` (user-defined mappers).
    * Every section whose title matches a keyword (case-insensitive regex search) is handed to
      ``map_<domain>`` or to ``map_user_defined_mappings``; a section is mapped at most once per domain.

    The process is terminated with ``sys.exit(1)`` when a pre-defined domain has no keywords for ``lang`` or
    when a domain has neither pre-defined rules nor a custom mapper.

    :param resDict: dictionary representing current resource (section title -> list elements).
    :param res: current resource name.
    :param lang: resource language.
    :param res_class: resource class/type (e.g. ``Writer``).
    :param g: RDF graph to be created.

    :return: number of list elements actually mapped in the graph (``0`` when the class has no mapping).
    '''
    # module-level state shared between calls
    global mapped_domains
    global resource_class
    global MAPPING
    global CUSTOM_MAPPERS
    global dbr

    if not MAPPING:  # load initial configuration
        MAPPING = utilities.load_settings()
        CUSTOM_MAPPERS = utilities.load_custom_mappers()

    # NOTE(review): the second test compares the whole domain *list* with the recorded domain names,
    # so it is presumably never true; kept unchanged to preserve behaviour.
    if res_class not in MAPPING or MAPPING[res_class] in mapped_domains:
        return 0

    if lang != 'en':  # correct dbpedia resource domain for non-english language
        dbr = rdflib.Namespace("http://" + lang + ".dbpedia.org/resource/")
    # only byte strings need decoding; decoding unicode text fails on non-ASCII names
    res_title = res.decode('utf-8') if isinstance(res, bytes) else res
    db_res = rdflib.URIRef(dbr + res_title)

    resource_class = res_class
    res_elems = 0  # number of list elements extracted so far

    # Rule dictionaries and map_* functions are resolved by name in the module namespace instead of
    # eval(), so a value coming from the settings file can never be executed as an expression.
    module_names = globals()

    for domain in MAPPING[res_class]:  # e.g. ['BIBLIOGRAPHY', 'FILMOGRAPHY']
        if domain in mapped_domains:
            continue

        if domain in module_names:  # pre-defined mapper rules
            is_custom_map_fn = False
            header_rules = module_names[domain]
            if lang not in header_rules:
                print("The language provided is not available yet for this mapping!")
                sys.exit(1)
            domain_keys = header_rules[lang]  # e.g. ['bibliography', 'works', ..]
        elif domain in CUSTOM_MAPPERS:  # user-defined mapper
            is_custom_map_fn = True
            domain_keys = CUSTOM_MAPPERS[domain]["headers"][lang]
        else:
            print("Cannot find the domain's mapper function!!")
            print('You can add a mapper function for this mapping using rulesGenerator.py and try again...\n')
            sys.exit(1)

        mapped_domains.append(domain)  # this domain won't be used again for mapping

        for res_key in resDict:  # iterate on resource dictionary keys
            for dk in domain_keys:  # search for resource keys related to the selected domain
                if isinstance(dk, bytes):
                    dk = dk.decode('utf-8')  # make sure utf-8 mismatches don't skip sections
                if not re.search(dk, res_key, re.IGNORECASE):
                    continue
                try:
                    if is_custom_map_fn:
                        res_elems += map_user_defined_mappings(domain, resDict[res_key], res_key,
                                                               db_res, lang, g, 0)
                    else:
                        mapper_fn = module_names["map_" + domain.lower()]
                        res_elems += mapper_fn(resDict[res_key], res_key, db_res, lang, g, 0)
                except Exception as err:
                    # best effort: a failing section must not abort the whole resource; the next
                    # keyword is still tried, exactly as before
                    print('exception occured in resDict, skipping.... (%r)' % (err,))
                else:
                    break  # prevents the same section to be mapped again

    return res_elems
def map_user_defined_mappings(mapper_fn_name, elem_list, sect_name, res, lang, g, elems):
    ''' **This is the main function that runs all user-defined mapper functions.**

    * It uses the ``CUSTOM_MAPPERS`` dict to find the settings associated with ``mapper_fn_name``.
    * For every list element it picks the ontology property whose key occurs in the section name, falling
      back to the ``default`` entry unless that entry is the string ``"None"``.
    * The extractors enabled in the settings are tried in order until one yields a resource name
      (``1`` italic, ``2`` reference, ``3`` quote, ``4`` general) and the resulting triple is added to the graph.

    :param mapper_fn_name: Name of the custom mapper function from which settings (eg. section headers,
        ontology classes, extractors to be used etc.) are loaded.
    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted (``0`` if the mapper settings are missing).
    '''
    global CUSTOM_MAPPERS

    # determine if the custom mapper function exists; if yes, load it.
    try:
        mapper_settings = CUSTOM_MAPPERS[mapper_fn_name]
    except KeyError:  # not found, skip the rest of the process
        print('Did not find %s in CUSTOM_MAPPERS!' % (mapper_fn_name,))
        return 0

    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_user_defined_mappings(mapper_fn_name, elem, sect_name, res, lang, g, elems)
            continue

        elem = elem.encode('utf-8')  # apply utf-8 encoding

        years = []
        if mapper_settings["years"] == "Yes":  # extract years related to the resource.
            years = month_year_mapper(elem)

        # find the ontology key matching the section name
        ontology_rules = mapper_settings["ontology"][lang]
        ontology_class = None
        for class_type in ontology_rules:
            try:
                if class_type.decode('utf-8').lower() in sect_name.decode('utf-8').lower():
                    ontology_class = class_type
            except UnicodeEncodeError:
                break

        if ontology_class is None:  # no possible mapping found; try default mapping
            if ontology_rules["default"] == "None":
                return elems  # default wasn't allowed: nothing can be mapped in this section
            ontology_class = "default"

        # final ontology class/property for the current element
        p = ontology_rules[ontology_class]

        # selection of the extractors to be used in the triple generation
        extractor_choices = mapper_settings["extractors"]

        # uri and resource name of the triple (triple form: <uri dbo:p res>)
        uri = None
        res_name = None

        if 1 in extractor_choices:  # italics mapper was chosen
            res_name = italic_mapper(elem)
            if res_name:
                elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
                res_name = urllib2.quote(res_name.replace(' ', '_'))  # quoting res_name in proper format
                uri = dbr + res_name.decode('utf-8', errors='ignore')

        if res_name is None and 2 in extractor_choices:  # reference mapper was chosen
            res_name = reference_mapper(elem)
            if res_name:  # current element contains a reference
                uri = wikidataAPI_call(res_name, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    res_name = list_elem_clean(res_name)
                    elem = elem.replace(res_name, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(res_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        if res_name is None and 3 in extractor_choices:  # quote mapper was chosen
            res_name = quote_mapper(elem)
            if res_name:
                elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
                res_name = urllib2.quote(res_name.replace(' ', '_'))
                uri = dbr + res_name.decode('utf-8', errors='ignore')

        if res_name is None and 4 in extractor_choices:  # general mapper was chosen
            res_name = general_mapper(elem)
            if res_name and res_name != res:
                res_name = urllib2.quote(res_name.replace(' ', '_'))
                uri = dbr + res_name.decode('utf-8', errors='ignore')

        # if successfully found a triple, add that to the existing graph
        if uri:
            g.add((rdflib.URIRef(uri), dbo[p], res))
            elems += 1
            if years:
                add_years_to_graph(g, uri, years)

    if elems == 0:
        print('Could not extract any elements. Try adding more extractors....')

    return elems
def map_discography(elem_list, sect_name, res, lang, g, elems):
    ''' Handles albums list present inside a section containing a match with ``DISCOGRAPHY``.

    Adds RDF statements typing each extracted resource as ``dbo:Album``, linking it to its artist (the
    current resource) and, when years are found in the element, recording them as release years.

    The album name is looked up with the italic and quote extractors first, then through a reference
    (reconciled with Wikidata/DBpedia when possible) and finally with the general mapper.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_discography(elem, sect_name, res, lang, g, elems)
            continue

        year = month_year_mapper(elem)  # map years present in the list
        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name is None:
            res_name = quote_mapper(elem)

        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            album = rdflib.URIRef(uri)
            g.add((album, rdf.type, dbo.Album))
            g.add((album, dbo.musicalArtist, res))
            elems += 1
            if year:
                add_years_to_graph(g, uri, year, {'activeYear':'releaseYear'})

    return elems
def map_concert_tours(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists of concerts present inside a section containing a match with ``CONCERT_TOURS``.

    Adds RDF statements typing each extracted resource as ``dbo:concertTour``, linking it to its artist
    (the current resource) and recording the years found in the element, if any.

    The tour name is looked up with the italic and quote extractors first, then through a reference
    (reconciled with Wikidata/DBpedia when possible) and finally with the general mapper.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_concert_tours(elem, sect_name, res, lang, g, elems)
            continue

        year = month_year_mapper(elem)
        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name is None:
            res_name = quote_mapper(elem)

        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            tour = rdflib.URIRef(uri)
            g.add((tour, rdf.type, dbo.concertTour))
            g.add((tour, dbo.musicalArtist, res))
            elems += 1
            if year:
                add_years_to_graph(g, uri, year)

    return elems
def map_alumni(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists of alumni members present inside a section containing a match with ``ALUMNI``.

    Adds an RDF statement about the person and its association with the resource (organisation) and,
    when ``alumni_profession_mapper`` returns something for the element, a ``dbo:notableWork`` string literal.

    The person is looked up with the italic extractor first, then through a reference (reconciled with
    Wikidata/DBpedia when possible) and finally with the general mapper.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_alumni(elem, sect_name, res, lang, g, elems)
            continue

        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            person = rdflib.URIRef(uri)
            g.add((person, dbo.alumni, res))
            elems += 1
            # whatever is left of the element after removing the name is searched for a profession
            work = alumni_profession_mapper(elem)
            if work:
                g.add((person, dbo.notableWork, rdflib.Literal(work, datatype=rdflib.XSD.string)))

    return elems
def map_programs_offered(elem_list, sect_name, res, lang, g, elems):
    ''' Handles list present inside a section containing a match with ``PROGRAMS_OFFERED``.

    Adds an RDF statement (``dbo:academicDiscipline``) for each program offered by the resource
    (organisation).

    The program is looked up with the italic extractor first, then through a reference (reconciled with
    Wikidata/DBpedia when possible) and finally with the general mapper.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_programs_offered(elem, sect_name, res, lang, g, elems)
            continue

        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            g.add((rdflib.URIRef(uri), dbo.academicDiscipline, res))
            elems += 1

    return elems
def map_honors(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists related to awards and honors given to people inside a section containing \
    a match with ``HONORS``.

    Adds RDF statements about the award, its status (winner/nominated), the recipient (the current
    resource) and, if present in the element, the year, the reason ("for ...") and the awarding
    entity ("from ...").

    The award status is taken from the section name when it carries one; otherwise it is determined for
    each element separately and defaults to ``Winner``.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name, also searched for the award status.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    # status announced by the section title, if any; applies to every element of the section
    section_status = award_status_mapper(sect_name, lang)

    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_honors(elem, sect_name, res, lang, g, elems)
            continue

        uri = None

        # find out the status of the award, i.e. winner or nominated; it is resolved per element
        # so that the status found in one element does not leak into the following ones
        award_status = section_status
        if award_status is None:
            award_status = award_status_mapper(elem, lang)
        if award_status is None:
            award_status = "Winner"  # if no information is found, assume winner.

        elem = elem.encode('utf-8')  # apply utf-8 encoding

        # remove status from the element
        elem = elem.replace("Winner","").replace("Won","").replace("Nominated","").replace("Nominee","")

        for_entity = sentence_splitter(elem,"for",lang)  # the resource for which the award was given
        from_entity = sentence_splitter(elem,"from",lang)  # the entity providing the award
        year = month_year_mapper(elem)

        ref = reference_mapper(elem)  # look for resource references
        if ref:  # current element contains a reference
            uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
            if uri:
                dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                    uri = dbpedia_uri
            else:  # take the reference name anyway if you can't reconcile it
                ref = list_elem_clean(ref)
                elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                uri_name = urllib2.quote(ref.replace(' ', '_'))
                uri = dbr + uri_name.decode('utf-8', errors='ignore')
        else:
            uri_name = quote_mapper(elem)  # try finding awards in quotes
            if uri_name is None:
                uri_name = general_mapper(elem)  # no reference found, try general mapping (less accurate)
            if uri_name and uri_name != res:
                uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            award = rdflib.URIRef(uri)
            g.add((award, dbo.awardedTo, res))
            g.add((award, dbo.awardStatus, dbo + rdflib.URIRef(award_status)))
            if year:
                add_years_to_graph(g, uri, year)
            if for_entity:
                g.add((award, dbo.AwardedFor, dbr + rdflib.URIRef(for_entity)))
            if from_entity:
                # NOTE(review): the subject here is the award *status*, not the award itself;
                # kept as in the original - confirm whether the award URI was intended.
                g.add((dbo + rdflib.URIRef(award_status), dbo.AwardedBy, dbr + rdflib.URIRef(from_entity)))
            elems += 1

    return elems
def map_staff(elem_list, sect_name, res, lang, g, elems):
    ''' Handles list present inside a section containing a match with ``STAFF``.

    Adds an RDF statement (``dbo:staff``) about each staff member and the institution, unless the same
    resource is already linked to the institution as alumni or as academic discipline.

    The member is looked up with the italic extractor first, then through a reference (reconciled with
    Wikidata/DBpedia when possible) and finally with the general mapper.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_staff(elem, sect_name, res, lang, g, elems)
            continue

        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            member = rdflib.URIRef(uri)
            # skip resources already mapped for this institution by the alumni / programs mappers
            if (member, dbo.alumni, res) not in g and (member, dbo.academicDiscipline, res) not in g:
                g.add((member, dbo.staff, res))
                elems += 1  # count the extracted element, as every other mapper does

    return elems
def map_other_person_details(elem_list, sect_name, res, lang, g, elems):
    ''' Handles list present inside a section containing a match with ``OTHER_PERSON_DETAILS``.

    Adds RDF statements about the otherwise unclassified sections in the ``Person`` domain. The property
    used is ``PERSON_DETAILS[lang][key]`` where ``key`` is the ``OTHER_PERSON_DETAILS[lang]`` entry found
    in the section name; when no entry matches, nothing is mapped.

    The related resource is looked up with the extractors in decreasing order of accuracy - italic, quote,
    reference (reconciled with Wikidata/DBpedia when possible), general - and each one is only tried when
    the previous ones did not yield a URI.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted (the counter is returned unchanged when the section
        name matches no known detail type).
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_other_person_details(elem, sect_name, res, lang, g, elems)
            continue

        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding
        res_name = italic_mapper(elem)

        # find the detail type announced by the section name (the last matching entry wins)
        other_details = None
        for other_type in OTHER_PERSON_DETAILS[lang]:
            if other_type.decode('utf-8').lower() in sect_name.decode('utf-8').lower():
                other_details = other_type

        if other_details is None:  # no possible mapping found; leave the element
            return elems

        p = PERSON_DETAILS[lang][other_details]

        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')

        if uri is None:  # nothing in italics, try quoted text
            uri_name = quote_mapper(elem)
            if uri_name and uri_name != res:
                uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                uri = dbr + uri_name.decode('utf-8', errors='ignore')

        if uri is None:
            # Only fall back when nothing was found so far: previously the general mapper also ran
            # after a successful quote match and could overwrite it with a less accurate result.
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            g.add((rdflib.URIRef(uri), dbo[p], res))
            elems += 1

    return elems
def map_career(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists related to the career of a person inside a section containing a \
    match with ``CAREER``.

    Adds RDF statements linking each extracted career item to the current resource and recording the
    years found in the element, if any. The property used is ``PERSON_DETAILS[lang][key]`` where ``key``
    is the ``CAREER[lang]`` entry found in the section name; when no entry matches, nothing is mapped.

    The career item is looked up with the quote extractor and, failing that, with the general mapper.

    :param elem_list: list of elements to be mapped (may contain nested lists).
    :param sect_name: section name, used to select the ontology property.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted (the counter is returned unchanged when the section
        name matches no known career type).
    '''
    for elem in elem_list:
        if isinstance(elem, list):
            # nested list: recurse and carry the running counter through, so that elements
            # extracted from sub-lists are actually counted
            elems = map_career(elem, sect_name, res, lang, g, elems)
            continue

        year = month_year_mapper(elem)
        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        # find the career type announced by the section name (the last matching entry wins)
        other_details = None
        for other_type in CAREER[lang]:
            if other_type.encode('utf-8').lower() in sect_name.encode('utf-8').lower():
                other_details = other_type

        if other_details is None:  # no possible mapping found; leave the element
            return elems

        # ontology property/class to be used for this resource
        p = PERSON_DETAILS[lang][other_details]

        uri_name = quote_mapper(elem)
        if uri_name is None or uri_name == res:
            uri_name = general_mapper(elem)

        if uri_name and uri_name != res:
            uri_name = urllib2.quote(uri_name.replace(' ', '_'))
            uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            g.add((rdflib.URIRef(uri), dbo[p], res))
            elems += 1
            if year:
                add_years_to_graph(g, uri, year)

    return elems
def map_filmography(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists related to filmography inside a section containing a match with ``FILMOGRAPHY``.

    For every plain list element a title is looked up (``italic_mapper`` first, then
    ``quote_mapper`` and finally ``general_mapper``) and turned into a resource URI under ``dbr``.
    The new resource is typed with the value returned by ``filmtype_mapper`` for the section,
    gets the value returned by ``month_year_mapper`` attached through ``add_years_to_graph``
    (with ``activeYear`` mapped to ``releaseYear``) and, when ``filmpart_mapper`` returns a
    property for the section, is linked to the current resource through it.
    Nested lists are processed recursively and the elements extracted from them are added to
    the counter.

    :param elem_list: list of elements to be mapped.
    :param sect_name: section name, used to reconcile the filmography type and the participation.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    # both values depend only on the section, so they are the same for every list element
    film_particip = filmpart_mapper(sect_name, lang)
    filmography_type = filmtype_mapper(sect_name, lang)

    for elem in elem_list:
        if isinstance(elem, list):  # nested list: recurse
            # ints are immutable, so the nested call cannot bump our counter in place;
            # its return value must be taken over or the nested extractions go uncounted.
            elems = map_filmography(elem, sect_name, res, lang, g, elems)
            continue

        year = month_year_mapper(elem)
        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)  # Try to extract italic formatted text (more precise)
        if res_name:
            elem = elem.replace(res_name, "")  # delete occurrence of matched text for further extraction
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:  # if unsuccessful, apply general mapping (lower accuracy)
            uri_name = quote_mapper(elem)  # try finding names in quotes
            if uri_name is None:
                uri_name = general_mapper(elem)
            if uri_name and uri_name != res:
                uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            g.add((rdflib.URIRef(uri), rdf.type, dbo + rdflib.URIRef(filmography_type)))
            if year:
                add_years_to_graph(g, uri, year, {'activeYear':'releaseYear'})
            if film_particip:
                g.add((rdflib.URIRef(uri), dbo + rdflib.URIRef(film_particip), res))
            elems += 1

    return elems
def map_bibliography(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists related to bibliography inside a section containing a match with ``BIBLIOGRAPHY``.

    Adds RDF statements about the work, its author (the current resource), the value returned by
    ``month_year_mapper``, the ISBN code and the literary genre.
    The work is identified, in order of preference, by: text returned by ``italic_mapper``;
    a reference returned by ``reference_mapper`` (reconciled through ``wikidataAPI_call`` and
    ``find_DBpedia_uri``, or used by name when the Wikidata call returns nothing);
    the result of ``quote_mapper``; the result of ``general_mapper``.
    Nested lists are processed recursively and the elements extracted from them are added to
    the counter.

    :param elem_list: list of elements to be mapped.
    :param sect_name: section name, used to reconcile literary genre.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    # literary genre depends on the name of the section, so it is the same for every element of the list
    lit_genre = litgenre_mapper(sect_name, lang)

    for elem in elem_list:
        if isinstance(elem, list):  # nested list: recurse
            # ints are immutable, so the nested call cannot bump our counter in place;
            # its return value must be taken over or the nested extractions go uncounted.
            elems = map_bibliography(elem, sect_name, res, lang, g, elems)
            continue

        uri = None
        year = month_year_mapper(elem)
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # Take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part from list element, to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try quotes and then general mapping (less accurate)
                uri_name = quote_mapper(elem)
                if uri_name is None or uri_name == res:
                    uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            g.add((rdflib.URIRef(uri), dbo.author, res))
            elems += 1
            isbn = isbn_mapper(elem)
            if isbn:
                g.add((rdflib.URIRef(uri), dbo.isbn, rdflib.Literal(isbn, datatype=rdflib.XSD.string)))
                elem = elem.replace(isbn, "")
            if year:
                add_years_to_graph(g, uri, year)
            if lit_genre:
                g.add((rdflib.URIRef(uri), dbo.literaryGenre, dbo + rdflib.URIRef(lit_genre)))

    return elems
def map_band_members(elem_list, sect_name, res, lang, g, elems):
    ''' Handles lists related to members inside a section containing a match with ``BAND_MEMBERS``.

    Adds one ``dbo.bandMember`` statement per list element, between the resource found in the
    element and the current resource.
    The element's resource is identified, in order of preference, by: text returned by
    ``italic_mapper``; a reference returned by ``reference_mapper`` (reconciled through
    ``wikidataAPI_call`` and ``find_DBpedia_uri``, or used by name when the Wikidata call
    returns nothing); the result of ``general_mapper``.
    Nested lists are processed recursively and the elements extracted from them are added to
    the counter.

    :param elem_list: list of elements to be mapped.
    :param sect_name: section name.
    :param res: current resource.
    :param lang: resource language.
    :param g: RDF graph to be constructed.
    :param elems: a counter to keep track of the number of list elements extracted.

    :return: number of list elements extracted.
    '''
    for elem in elem_list:
        if isinstance(elem, list):  # nested list: recurse
            # The recursion has to target this very function (it used to call ``map_members``).
            # ints are immutable, so the nested call cannot bump our counter in place;
            # its return value must be taken over or the nested extractions go uncounted.
            elems = map_band_members(elem, sect_name, res, lang, g, elems)
            continue

        uri = None
        elem = elem.encode('utf-8')  # apply utf-8 encoding

        res_name = italic_mapper(elem)
        if res_name:
            elem = elem.replace(res_name, "")  # delete resource name found from element for further mapping
            res_name = urllib2.quote(res_name.replace(' ', '_'))
            uri = dbr + res_name.decode('utf-8', errors='ignore')
        else:
            ref = reference_mapper(elem)  # look for resource references
            if ref:  # current element contains a reference
                uri = wikidataAPI_call(ref, lang)  # try to reconcile resource with Wikidata API
                if uri:
                    dbpedia_uri = find_DBpedia_uri(uri, lang)  # try to find equivalent DBpedia resource
                    if dbpedia_uri:  # if you can find a DBpedia res, use it as the statement subject
                        uri = dbpedia_uri
                else:  # Take the reference name anyway if you can't reconcile it
                    ref = list_elem_clean(ref)
                    elem = elem.replace(ref, "")  # subtract reference part from list element to facilitate further parsing
                    uri_name = urllib2.quote(ref.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')
            else:  # no reference found, try general mapping (less accurate)
                uri_name = general_mapper(elem)
                if uri_name and uri_name != res:
                    uri_name = urllib2.quote(uri_name.replace(' ', '_'))
                    uri = dbr + uri_name.decode('utf-8', errors='ignore')

        # add successfully extracted triples into the graph
        if uri:
            g.add((rdflib.URIRef(uri), dbo.bandMember, res))
            elems += 1

    return elems
def map_contributors(elem_list, sect_name, res, lang, g, elems):
''' Handles lists related to contributions made by a person inside a section containing a \
match with ``CONTRIBUTORS``.
Adds RDF statement about the person and his contributions.
:param elem_list: list of elements to be mapped.
:param sect_name: section name.
:param res: current resource.
:param lang: resource language.
:param g: RDF graph to be constructed.
:param elems: a counter to keep track of the number of list elements extracted.
:return: number of list elements extracted.
'''
for elem in elem_list:
if type(elem) == list: # for nested lists (recursively call this function)
elems += 1
map_contributors(elem, sect_name, res, lang, g, elems)
else:
contrib_type, subsection = None, None
search_str = sect_name
#Find if the contribution has a predefined type(editor, writer etc.)
try: subsection = sect_name.split('-')[1].strip()
except: pass