csv2omeka.py
"""Uploads a CSV file to an Omeka server as a series of items and/or collections, one per line"""
from omekaclient import OmekaClient
from omekautils import get_omeka_config
from omekautils import create_stream_logger
from sys import stdout, stdin
import argparse
import json
import httplib2
import urlparse
import os
import csv
import shelve
import copy
import logging
from pcdmlite.pcdmlite import Item
from csv2pcdmlite.csv2pcdmlite import CSVData

def download_and_upload_files(item):
    """Handle any downloads, cache them as local files, then upload all files"""
    http = httplib2.Http()
    files = []
    for url_field in item.URLs:
        download_this = True  # Decide per URL whether a fresh download is needed
        url = url_field.value
        filename = urlparse.urlsplit(url).path.split("/")[-1]
        new_path = os.path.join(data_dir, str(item.id))
        if not os.path.exists(new_path):
            os.makedirs(new_path)
        file_path = os.path.join(new_path, filename)
        logger.info("Local filename: %s", file_path)
        # Check if we already have a download of the same size
        if os.path.exists(file_path):
            response, content = http.request(url, "HEAD")
            download_size = int(response['content-length']) if 'content-length' in response else -1
            file_size = os.path.getsize(file_path)
            if download_size == file_size:
                logger.info("Already have a download of the same size: %d", file_size)
                download_this = False
        if download_this:
            try:
                response, content = http.request(url, "GET")
                with open(file_path, 'wb') as fh:
                    fh.write(content)
                logger.info(response)
            except Exception:
                logger.warning("Some kind of download error happened fetching %s - pressing on", url)
        files.append(file_path)
    for f in item.files:
        files.append(os.path.join(data_dir, f.value))
    for fyle in files:
        logger.info("Uploading %s", fyle)
        try:
            omeka_client.post_file_from_filename(fyle, item.omeka_id)
            logger.info("Uploaded %s", fyle)
        except Exception:
            logger.warning("Some kind of error happened uploading %s - pressing on", fyle)

def load(shelf):
    """Read the CSV, create or update collections and items in Omeka, and upload their files"""
    csv_data = CSVData(inputfile)
    csv_data.get_items()
    for collection in csv_data.collections:
        id = collection.id
        title = collection.title
        if id is not None:
            collection_id = omeka_client.get_collection_id_by_dc_identifier(id, name=title, create=args['create_collections'], public=args["public"])
    uploaded_item_ids = []
    total_rows = len(csv_data.items)
    row_num = 0
    for item in csv_data.items:
        row_num += 1
        print "Processing %s/%s" % (row_num, total_rows)
        id = item.id
        if id is not None:
            if item.title is None or item.title == "":
                item.title = "Untitled %s" % str(id)
            if item.in_collection is None and args["in_collection"] is not None:
                item.in_collection = args["in_collection"]
                # TODO - make this a parameter with a default
            previous_item = get_old_item(shelf, id)
            logger.info("Processing item with Dublin Core ID %s", id)
            omekaize(item)
            jsonstr = json.dumps(item.omeka_data)
            # Re-upload
            if previous_item is not None and previous_item.id is not None:
                previous_id = previous_item.omeka_id
                logger.info("Re-uploading %s", previous_id)
                response, content = omeka_client.put("items", previous_id, jsonstr)
                # Looks like the ID wasn't actually there, so get Omeka to mint a new one
                if response['status'] == '404':
                    logger.info("retrying")
                    response, content = omeka_client.post("items", jsonstr)
            else:  # Or upload a new one
                logger.info("Uploading new item")
                response, content = omeka_client.post("items", jsonstr)
            # Should have a new (or updated) item now
            new_item = json.loads(content)
            if 'id' in new_item:
                item.omeka_id = str(new_item['id'])
                save_item(shelf, item, item.id)
                download_and_upload_files(item)
                if args['relate']:
                    # Relate to other items
                    for r in item.relations:
                        property_id = omeka_client.getRelationPropertyIdByLocalPart(r.namespace.prefix, r.field_name)
                        object_item = get_old_item(shelf, r.value)
                        object_id = object_item.omeka_id if object_item is not None else None
                        if object_id is not None and property_id is not None:
                            logger.info("Relating this item %s to another. Property %s, target %s", item.omeka_id, property_id, object_id)
                            omeka_client.addItemRelation(item.omeka_id, property_id, object_id)
            else:
                logger.error("Uploading failed for item %s %s", id, content)

def omekaize(item):
    """Build the Omeka JSON payload (item.omeka_data) for an item from its Dublin Core fields"""
    dc_set_id = omeka_client.getSetId("Dublin Core",
                                      create=args['create_elements'])
    item_type = item.type if item.type is not None else args["item_type"]
    item_type_id = omeka_client.getItemTypeId(item_type, create=args['create_item_types'])
    item.omeka_data = {"public": args["public"],
                       "item_type": {"id": item_type_id}}
    if item.in_collection is not None:
        collection_id = omeka_client.get_collection_id_by_dc_identifier(item.in_collection,
                                                                        name=item.in_collection,
                                                                        create=args['create_collections'],
                                                                        public=args["public"])
        if collection_id is not None:
            item.omeka_data["collection"] = {"id": collection_id}
    # Let's deal with DC fields to start with and worry about other namespaces later
    element_texts = []
    for f in item.text_fields:
        if f.namespace.prefix == "dcterms":
            element_id = omeka_client.getElementId(dc_set_id, f.field_name, create=args['create_elements'])
            element_text = {"html": False, "text": unicode(f.value)}
            element_text["element"] = {"id": element_id}
            element_texts.append(element_text)
    item.omeka_data["element_texts"] = element_texts

def get_old_item(shelf, id):
    """Return the cached item for this Dublin Core identifier, or None if we haven't seen it"""
    return shelf[id] if id in shelf else None

def save_item(shelf, item, id):
    """Cache a deep copy of the item in the shelf, keyed by Dublin Core identifier"""
    new_item = copy.deepcopy(item)
    shelf[id] = new_item
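
# Cache round trip (illustrative; the identifier "doc-001" is hypothetical):
# save_item(shelf, item, "doc-001") stores a deep copy alongside its minted
# Omeka ID, so a later run's get_old_item(shelf, "doc-001") finds it and the
# item is updated with PUT instead of being re-created with POST.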

logger = create_stream_logger('csv2omeka', stdout)

# Define and parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('inputfile', type=argparse.FileType('rb'), default=stdin, help='Name of input CSV file')
parser.add_argument('-k', '--key', default=None, help='Omeka API key')
parser.add_argument('-u', '--api_url', default=None, help='Omeka API endpoint URL (hint: it ends in /api)')
parser.add_argument('-d', '--download_cache', default="./data", help='Path to a directory in which to cache downloads (defaults to ./data)')
parser.add_argument('-p', '--public', action='store_true', help='Make items public')
parser.add_argument('-c', '--create_collections', action='store_true', help='Auto-create missing collections')
parser.add_argument('-e', '--create_elements', action='store_true', help='Auto-create missing element types')
parser.add_argument('-y', '--create_item_types', action='store_true', help='Auto-create missing item types')
parser.add_argument('-q', '--quietly', action='store_true', help='Only log errors and warnings, not the constant stream of info')
parser.add_argument('-r', '--relate', action='store_true', help='Relate items to each other')
parser.add_argument('-i', '--item_type', default="Text", help='Item type to use if there is no dcterms:type in the input row (defaults to Text)')
parser.add_argument('-n', '--in_collection', default=None, help='Collection to use if there is no collection specified in the input row (defaults to None)')
args = vars(parser.parse_args())
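
# Example invocation (the endpoint, key and file name below are hypothetical):
#   python csv2omeka.py items.csv -u http://example.org/omeka/api -k 0123456789abcdef -p -c -e -y -r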

if not args['api_url'] or not args['key']:
    config = get_omeka_config()
endpoint = args['api_url'] if args['api_url'] else config['api_url']
apikey = args['key'] if args['key'] else config['key']
omeka_client = OmekaClient(endpoint.encode("utf-8"), logger, apikey)
inputfile = args['inputfile']
data_dir = args['download_cache']

# Because we can't efficiently query Omeka via the API, we need to cache
# which items have been uploaded and what IDs they were given
shelf_file = "%s_item_cache" % endpoint.replace("/", "_").replace(":", ".")
shelf = shelve.open(shelf_file)

if args["quietly"]:
    logger.setLevel(logging.WARNING)

load(shelf)
shelf.close()