-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCopyJWMedia.py
322 lines (258 loc) · 13.7 KB
/
CopyJWMedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
starting_year = 2019
study_publication = "rr" ## This needs to be the 2 characer shortcode of the study publication, currently the "Pure Worship" book
language_suffix = "_E" ## This needs to be the language suffix for your area e.g. _E = English, _T = Portuguese
study_publication_full = study_publication + language_suffix
jwlibrary_package = "WatchtowerBibleandTractSo.45909CDBADF3C_5rz59y55nfz3e"
## TODO:
## * Split code into functions rather than one big script
## * Allow user to copy only Watchtower or only Meeting Workbooks using command line switches
## * Filter out all characters that aren't valid in Windows filename- see line 227, currently using replace(). Could possibly use regex for this
import os, calendar, shutil, time, sqlite3, logging, re
from datetime import date, timedelta
def get_filtered_folders(search_string, array):
return list(filter(lambda x: search_string in x, array))
def get_year_month_from_folder(folder_name):
year = folder_name[-6:-2]
month = folder_name[-2:]
return (year, month)
def get_db_connection(dbpath):
conn = sqlite3.connect(dbpath)
conn.row_factory = sqlite3.Row # allows accessing columns using column name - see https://docs.python.org/2/library/sqlite3.html#row-objects
return conn
def get_documents(conn):
c = conn.cursor()
return c.execute("SELECT * FROM Document ORDER BY DocumentId")
def get_documents_by_meps_document_id(conn, documentid):
c = conn.cursor()
return c.execute("SELECT * FROM Document WHERE MepsDocumentId = ?", (str(documentid),) )
def get_document_multimedia(conn, documentid):
c = conn.cursor()
return c.execute("SELECT MultimediaId FROM DocumentMultimedia WHERE DocumentId = ?", (str(documentid),) )
def get_document_multimedia_info(conn, documentid):
c = conn.cursor()
return c.execute("SELECT DocumentMultimedia.MultimediaId, Label, Filepath FROM DocumentMultimedia JOIN Multimedia ON DocumentMultimedia.MultimediaId = Multimedia.MultimediaId WHERE CategoryType = 8 AND DocumentId = ?", (str(documentid),)) #need the extra comma as we pass in a tuple
def get_media_keys(media_conn, issuetag, track):
c = media_conn.cursor()
return c.execute("SELECT * FROM MediaKey WHERE IssueTagNumber = ? AND Track = ?", (issuetag, track))
def get_media_key(media_conn, issuetagnumber, track):
c = media_conn.cursor()
c.execute("SELECT MediaKeyId FROM MediaKey WHERE IssueTagNumber = ? AND Track = ?", (issuetagnumber, track))
row = c.fetchone()
if row is None:
return
return row['MediaKeyId']
def get_first_date_wt(conn):
c = conn.cursor()
c.execute("SELECT FirstDatedTextDateOffset FROM Publication")
row = c.fetchone()
return int2date(row[0])
def get_meps_document_ids(conn, documentid):
c = conn.cursor()
c.execute("select RefMepsDocumentId from Extract inner join RefPublication on Extract.RefPublicationId = RefPublication.RefPublicationId inner join DocumentExtract on Extract.ExtractId = DocumentExtract.ExtractId where DocumentExtract.DocumentId = ? and RefPublication.RootSymbol = ?", (str(documentid), study_publication))
return c
def get_multimedia_tag(conn, multimedia_id):
c = conn.cursor()
c.execute("SELECT KeySymbol, IssueTagNumber, Track FROM Multimedia WHERE MultimediaId = ?", (str(multimedia_id),) )
row = c.fetchone()
return (row['KeySymbol'], row['IssueTagNumber'], row['Track'])
def get_video_details(media_conn, media_key):
c = media_conn.cursor()
c.execute("SELECT Title, Filepath FROM Video WHERE MediaKeyId = ?", (media_key,) )
return c.fetchone()
def copyfile_nooverwrite(source, target):
if not os.path.exists(target):
shutil.copy2(source, target)
def sanitise_filename(filename):
## This function is no longer used as we are using p.sub (regexp) instead. Could possibly change this function to use regexp?
for char in "<>:\"/\\|?*":
filename = filename.replace(char, '')
return filename
def int2date(argdate: int) -> date:
"""
If you have date as an integer, use this method to obtain a datetime.date object.
Parameters
----------
argdate : int
Date as a regular integer value (example: 20160618)
Returns
-------
dateandtime.date
A date object which corresponds to the given value `argdate`.
"""
year = int(argdate / 10000)
month = int((argdate % 10000) / 100)
day = int(argdate % 100)
return date(year, month, day)
def month_to_number(month_name):
month_dict = {
"January": "01",
"February": "02",
"March": "03",
"April": "04",
"May": "05",
"June": "06",
"July": "07",
"August": "08",
"September": "09",
"October": "10",
"November": "11",
"December": "12"
}
return month_dict[month_name]
## Main program ##
print("Copying images from JW Library Meeting Workbooks to Soundbox...")
meeting_parts = {
'10': '3', # Living as Christians
'21': '1', # Treasures from God's Word
'107': '3' # Living as Christians
}
start = time.time()
## Compile regex
p = re.compile(r'[^\w\s,.]') ## matches all non-alphanumeric characters except space, comma and full-stop
## setup the logger
logging.basicConfig(filename='JWMediaCopier.log', level=logging.WARNING, format='%(asctime)s %(levelname)s %(message)s', filemode='w') ##overwrites the log each time
## Open Media Catalog
media_catalog_path = os.path.join(os.getenv("LOCALAPPDATA"), "packages", jwlibrary_package, "LocalState", "Data", "mediaCollection.db")
media_conn = get_db_connection(media_catalog_path)
# By using os.path.join() instead of backslashes we make this script cross-platform compatible. You know for when we get JWlibrary and Soundbox for Linux and Mac! ;) teehee...
targetpath_base = os.path.join(os.getenv("ProgramData"), "SoundBox", "MediaCalendar")
path = os.path.join(os.getenv("LOCALAPPDATA"), "packages", jwlibrary_package, "LocalState", "Publications")
## Open Congregation Bible Study catalog
book_study_path = os.path.join(path, study_publication_full)
book_study_database = os.path.join(book_study_path, study_publication_full + ".db")
book_study_conn = get_db_connection(book_study_database)
array = os.listdir(path)
print("\r\nMeeting Workbooks:")
filtered_folders = get_filtered_folders("mwb"+language_suffix+"_", array)
for source_folder in filtered_folders:
year, month = get_year_month_from_folder(source_folder)
## Ignore years before 2017
if int(year) < starting_year:
continue
print("Copying " + source_folder)
source_path = os.path.join(path, source_folder)
dbpath = os.path.join(source_path, source_folder+".db")
conn = get_db_connection(dbpath)
c = conn.cursor()
for row in c.execute("SELECT * FROM Document ORDER BY DocumentId"):
row_class = row['Class']
if row_class == '106':
# This is a new week
documentid = row['DocumentId']
title = row['Title']
# week = re.sub("[^0-9-–]", "", row['Title']) # Remove non-numeric characters (excluding the hyphen) from the string. E.g. to change "January 4-11" or "4-11 de Janeiro" to "4-11"
week = title
split_week = week.split('-') #splits into 'from' and 'to' sections
from_date = split_week[0]
split_from_date = from_date.split(" ")
from_date_month = split_from_date[0]
from_month_number = month_to_number(from_date_month)
from_date_day = split_from_date[1]
# New month
if len(split_week) == 1:
## Need to use long hyphen instead
split_week = week.split('–') #splits into 'from' and 'to' sections
to_date = split_week[1]
split_to_date = to_date.split(" ")
to_date_month = split_to_date[0]
to_month_number = month_to_number(to_date_month)
to_date_day = split_to_date[1]
else:
to_month_number = from_month_number
to_date_day = split_week[1]
target_folder = year + "-" + from_month_number + "-" + str(from_date_day).zfill(2)
targetpath = os.path.join(targetpath_base, year, target_folder)
print("Writing files to", targetpath)
if not os.path.exists(targetpath):
os.makedirs(targetpath)
## Get the Congregation Bible Study images
counter=0
document_ids = get_meps_document_ids(conn, documentid)
for row in document_ids:
## Get jy.db -> Document -> DocumentMultimedia -> Multimedia etc
for book_study_doc in get_documents_by_meps_document_id(book_study_conn, row['RefMepsDocumentId']):
for media in get_document_multimedia_info(book_study_conn, book_study_doc["DocumentId"]):
counter += 10
book_study_source_file = media['Filepath']
target_file_name = "M3-" + str(counter).zfill(3) + " " + p.sub('', media['Label']) ## p.sub strips non-alpha
target_file_path = os.path.join(targetpath, target_file_name)[:255] + ".jpg"
if not os.path.exists(target_file_path):
source_file_path = os.path.join(book_study_path, book_study_source_file)
shutil.copyfile(source_file_path, target_file_path)
## Get the Videos!!!
document_multimedia_records = get_document_multimedia(conn, documentid)
for document_multimedia in document_multimedia_records:
multimedia_id = document_multimedia['MultimediaId']
keysymbol, issuetag, track = get_multimedia_tag(conn, multimedia_id)
if keysymbol == 'nwtsv' or issuetag > 0:
media_key = get_media_key(media_conn, issuetag, track)
row = get_video_details(media_conn, media_key)
if row:
source_file_path = row['Filepath']
title = row['Title']
## valid_file_name = sanitise_filename(title)
valid_file_name = p.sub('', title) ## p.sub removes non-alpha
meeting_part = '1'
counter += 10
target_file_name = "M" + meeting_part + "-" + str(counter).zfill(3) + " " + valid_file_name + ".mp4"
target_file_path = os.path.join(targetpath, target_file_name)
if os.path.exists(source_file_path):
if not os.path.exists(target_file_path):
shutil.copyfile(source_file_path, target_file_path)
else:
warning_message = "File " + source_file_path + " was not found - skipped"
logging.warning(warning_message)
print("Warning: " + warning_message)
if row_class in ['21','107','10']:
# Treasures from God's word or Living as Christians
document_id = row['DocumentId']
# Get all of the multimedia records for this document
d = conn.cursor()
t_doc = (document_id, )
counter = 0
for row2 in d.execute("SELECT DocumentMultimedia.MultimediaId, Label, Filepath FROM DocumentMultimedia JOIN Multimedia ON DocumentMultimedia.MultimediaId = Multimedia.MultimediaId WHERE DocumentId = ? AND CategoryType = 8", t_doc):
counter += 10
sourcefile = row2['Filepath']
meeting_part = meeting_parts[row_class]
target_file_name = "M" + meeting_part + "-" + str(counter).zfill(3) + " " + p.sub('', row2['Label'])
target_file_path = os.path.join(targetpath, target_file_name)[:255] + ".jpg"
if not os.path.exists(target_file_path):
source_file_path = os.path.join(source_path, sourcefile)
shutil.copyfile(source_file_path, target_file_path)
conn.close()
## WatchTower
print("\r\nWatchtowers:")
filtered_folders = get_filtered_folders("w"+language_suffix+"_", array)
for source_folder in filtered_folders:
dbpath = os.path.join(path, source_folder, source_folder+".db")
conn = get_db_connection(dbpath)
study_date = get_first_date_wt(conn)
study_date += timedelta(days=6) # Change w/c date from Monday to Sunday
## Create target folder
year = str(study_date.year)
## Ignore years before 2017
if int(year) < starting_year:
continue
print("Copying "+source_folder)
for document in get_documents(conn):
document_class = document['Class']
if document_class == '40':
folder_name = str(study_date)
targetpath = os.path.join(targetpath_base, year, folder_name)
print("Writing files to", targetpath)
if not os.path.exists(targetpath):
os.makedirs(targetpath)
counter = 0
images = get_document_multimedia_info(conn, document['DocumentId'])
for image in images:
counter += 10
source_file_path = os.path.join(path, source_folder, image['Filepath'])
target_file_name = "W2-" + str(counter).zfill(3) + " " + p.sub('',image['Label']) ## p.sub strips non-alpha
target_file_path = os.path.join(targetpath, target_file_name)[:255] + ".jpg"
copyfile_nooverwrite(source_file_path, target_file_path)
study_date += timedelta(days=7) #Increment the week
conn.close()
media_conn.close()
elapsed = str(time.time() - start)
print("\r\nFinished")
print("Time taken: " + elapsed + " seconds")