4
4
import os
5
5
import importlib
6
6
import inspect
7
+ import argparse
7
8
8
9
from sqlalchemy import (
9
10
create_engine ,
19
20
make_transient
20
21
)
21
22
22
- from .. .db import Base , LaTeXMLBase , session_factory , _classic_engine as classic_engine
23
- from .. .db .models import (
23
+ from arxiv .db import Base , LaTeXMLBase , session_factory , _classic_engine as classic_engine
24
+ from arxiv .db .models import (
24
25
DBLaTeXMLDocuments ,
25
26
DBLaTeXMLSubmissions ,
26
27
TapirUser ,
@@ -43,6 +44,7 @@ class Edge:
43
44
to_table : str
44
45
to_column : str
45
46
47
+
46
48
47
49
def generate_relationship_graph (models : List [Type ]):
48
50
adjacency_list = {}
@@ -142,13 +144,15 @@ def _write_subquery (table: Any, subq: Subquery, classic_session: Session, new_s
142
144
new_session .commit ()
143
145
new_session .commit ()
144
146
147
+
145
148
def _insert_latexml_tables (query_map : Dict [str , Subquery ], classic_session : Session , new_session : Session ):
146
149
documents = classic_session .execute (select (query_map ['arXiv_metadata' ])).all ()
147
150
ids = [(x [2 ], x [- 4 ]) for x in documents ]
148
- for i in range (0 , len (ids ), 500 ):
151
+ n_docs = 10000 # This was 500, and not sure of this magic number.
152
+ for i in range (0 , len (ids ), n_docs ):
149
153
latexml_docs = classic_session .execute (
150
154
select (DBLaTeXMLDocuments )
151
- .filter (tuple_ (DBLaTeXMLDocuments .paper_id , DBLaTeXMLDocuments .document_version ).in_ (ids [i : min (len (ids ), i + 500 )]))
155
+ .filter (tuple_ (DBLaTeXMLDocuments .paper_id , DBLaTeXMLDocuments .document_version ).in_ (ids [i : min (len (ids ), i + n_docs )]))
152
156
).scalars ().all ()
153
157
for row in latexml_docs :
154
158
make_transient (row )
@@ -157,10 +161,10 @@ def _insert_latexml_tables (query_map: Dict[str, Subquery], classic_session: Ses
157
161
158
162
submissions = classic_session .execute (select (query_map ['arXiv_submissions' ])).all ()
159
163
sub_ids = [x [0 ] for x in submissions ]
160
- for i in range (0 , len (sub_ids ), 500 ):
164
+ for i in range (0 , len (sub_ids ), n_docs ):
161
165
latexml_subs = classic_session .execute (
162
166
select (DBLaTeXMLSubmissions )
163
- .filter (DBLaTeXMLSubmissions .submission_id .in_ (sub_ids [i : min (len (sub_ids ), i + 500 )]))
167
+ .filter (DBLaTeXMLSubmissions .submission_id .in_ (sub_ids [i : min (len (sub_ids ), i + n_docs )]))
164
168
).scalars ().all ()
165
169
for row in latexml_subs :
166
170
make_transient (row )
@@ -182,9 +186,13 @@ def _invert_db_graph_edges (db_graph: Dict[str, List[Edge]]) -> Dict[str, List[E
182
186
inverted_db_graph [next .to_table ] = [reversed_edge ]
183
187
return inverted_db_graph
184
188
185
- def _make_subset (db_graph : Dict [str , List [Edge ]],
186
- special_cases : Dict [str , SpecialCase ],
187
- size : int ):
189
+ def _make_subset (
190
+ db_graph : Dict [str , List [Edge ]],
191
+ special_cases : Dict [str , SpecialCase ],
192
+ size : int ,
193
+ create_arxiv_db_schema : bool ,
194
+ create_latexml_db_schema : bool ,
195
+ ):
188
196
"""
189
197
algorithm:
190
198
@@ -198,11 +206,18 @@ def _make_subset (db_graph: Dict[str, List[Edge]],
198
206
classic_session = session_factory ()
199
207
new_session = NewSessionLocal ()
200
208
201
- Base .metadata .drop_all (new_engine )
202
- Base .metadata .create_all (new_engine )
203
- LaTeXMLBase .metadata .drop_all (new_engine )
204
- LaTeXMLBase .metadata .create_all (new_engine )
205
-
209
+ if create_arxiv_db_schema :
210
+ Base .metadata .drop_all (new_engine )
211
+ Base .metadata .create_all (new_engine )
212
+
213
+ if create_latexml_db_schema :
214
+ LaTeXMLBase .metadata .drop_all (new_engine )
215
+ LaTeXMLBase .metadata .create_all (new_engine )
216
+
217
+ # check db connections
218
+ _any_tapir_user = classic_session .execute (select (TapirUser ).limit (1 )).scalars ().all ()
219
+ _any_latexml_doc = classic_session .execute (select (DBLaTeXMLDocuments ).limit (1 )).scalars ().all ()
220
+
206
221
### Do algorithm ###
207
222
table_lookup = { i .__tablename__ : i for i in get_tables () }
208
223
processing_order = topological_sort ({ k : list (map (lambda x : x .to_table , v )) for k ,v in db_graph .items () })
@@ -244,7 +259,8 @@ def _make_subset (db_graph: Dict[str, List[Edge]],
244
259
new_session .commit ()
245
260
new_session .close ()
246
261
247
- def clone_db_subset (n_users : int , config_directory : Optional [str ] = None ):
262
+ def clone_db_subset (n_users : int , config_directory : Optional [str ] = None ,
263
+ create_arxiv_db_schema : bool = True , create_latexml_db_schema : bool = True ,):
248
264
config_directory = config_directory or \
249
265
os .path .abspath (
250
266
os .path .join (
@@ -255,4 +271,32 @@ def clone_db_subset (n_users: int, config_directory: Optional[str] = None):
255
271
graph = json .loads (open (os .path .join (config_directory , 'graph.json' )).read ())
256
272
special_cases = json .loads (open (os .path .join (config_directory , 'special_cases.json' )).read ())
257
273
graph_with_edges = { k : list (map (lambda x : Edge (** x ), v )) for k ,v in graph .items () }
258
- _make_subset (graph_with_edges , special_cases , n_users )
274
+ _make_subset (graph_with_edges , special_cases , n_users , create_arxiv_db_schema , create_latexml_db_schema )
275
+
276
+
277
def main():
    """Command-line entry point for cloning a subset of the classic DB.

    Builds the argument parser, reads each option (falling back to the
    matching environment variable, then to a built-in default) and hands
    the parsed values to ``clone_db_subset``.

    Options:
        --n-users: number of users to copy; N_USERS or 2000.
        --config-directory: configuration directory; CONFIG_DIRECTORY or None.
        --create-arxiv-db-schema: 'true' / anything else;
            CREATE_ARXIV_DB_SCHEMA or true.
        --create-latexml-db-schema: 'true' / anything else;
            CREATE_LATEXML_DB_SCHEMA or true.
    """

    def as_flag(text):
        # Only the word "true" (any letter case) enables a flag; every
        # other value, including "1" or "yes", is treated as disabled.
        return text.lower() == 'true'

    env = os.environ
    cli = argparse.ArgumentParser(description="Clone a subset of the classic DB to a new DB.")

    # N_USERS arrives from the environment as a string; argparse runs string
    # defaults through ``type``, so the parsed attribute is still an int.
    cli.add_argument(
        '--n-users',
        type=int,
        default=env.get('N_USERS', 2000),
        help='Number of users to copy (default: N_USERS environment variable or 2000)',
    )
    cli.add_argument(
        '--config-directory',
        type=str,
        default=env.get('CONFIG_DIRECTORY'),
        help='Configuration directory (default: CONFIG_DIRECTORY environment variable)',
    )

    # The two schema switches share the same parsing rule for both the
    # command-line value and the environment fallback.
    cli.add_argument(
        '--create-arxiv-db-schema',
        type=as_flag,
        default=as_flag(env.get('CREATE_ARXIV_DB_SCHEMA', 'true')),
        help='Whether to create the arXiv DB schema (default: CREATE_ARXIV_DB_SCHEMA environment variable or true)',
    )
    cli.add_argument(
        '--create-latexml-db-schema',
        type=as_flag,
        default=as_flag(env.get('CREATE_LATEXML_DB_SCHEMA', 'true')),
        help='Whether to create the LaTeXML DB schema (default: CREATE_LATEXML_DB_SCHEMA environment variable or true)',
    )

    options = cli.parse_args()

    clone_db_subset(
        options.n_users,
        options.config_directory,
        options.create_arxiv_db_schema,
        options.create_latexml_db_schema,
    )
299
+
300
+
301
# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
0 commit comments