@@ -42,11 +42,14 @@ class DataLocation:
     """
     Represents the location of a file within a tar archive.

+    :param resource_id: The unique identifier for the resource.
+    :type resource_id: int
     :param tar_file: The name of the tar file containing the data.
     :type tar_file: str
     :param filename: The name of the file within the tar archive.
     :type filename: str
     """
+    resource_id: int
     tar_file: str
     filename: str

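For readers skimming the diff, the documented fields imply `DataLocation` is a small dataclass carrying the new `resource_id` alongside the archive coordinates. A minimal sketch of that shape, with illustrative values only (the real class may carry extra behaviour):

```python
from dataclasses import dataclass


@dataclass
class DataLocation:
    # Mirrors the fields documented above; field order and types follow the diff.
    resource_id: int
    tar_file: str
    filename: str


# Hypothetical example: one resource stored as a single member of a tar archive.
loc = DataLocation(resource_id=42, tar_file='images/0042.tar', filename='0042/42.webp')
print(loc.resource_id, loc.tar_file, loc.filename)
```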
@@ -56,7 +59,8 @@ def _n_path(path):
     Normalize a file path.

     This function takes a file path and normalizes it by joining it with the root directory
-    and then normalizing the resulting path.
+    and then normalizing the resulting path. It's useful for ensuring consistent path formats
+    across different operating systems.

     :param path: The file path to normalize.
     :type path: str
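The updated wording describes joining the path with a root and then normalizing it. A minimal sketch of that behaviour, assuming the root is simply `'/'` (the actual root used by the module is not shown in this diff):

```python
import os


def _n_path(path: str) -> str:
    # Sketch: anchor the path at a root and collapse '.', '..' and duplicate
    # separators so equivalent in-archive paths compare equal.
    return os.path.normpath(os.path.join('/', path))


# On POSIX these print '/a/c' and '/a/b' respectively.
print(_n_path('a/b/../c'))
print(_n_path('./a//b'))
```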
@@ -73,20 +77,29 @@ def _n_path(path):
 class InvalidResourceDataError(Exception):
     """
     Base exception for invalid resource data.
+
+    This exception is raised when there's an issue with the resource data that prevents
+    it from being processed or used correctly.
     """
     pass


 class ResourceNotFoundError(InvalidResourceDataError):
     """
     Exception raised when a requested resource is not found.
+
+    This exception is typically raised when attempting to access or download a resource
+    that does not exist in the data pool.
     """
     pass


 class FileUnrecognizableError(Exception):
     """
     Exception raised when a file cannot be recognized or processed.
+
+    This exception is used when the system encounters a file that it cannot parse or
+    interpret according to the expected format or structure.
     """
     pass

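Since `ResourceNotFoundError` derives from `InvalidResourceDataError`, callers can catch the broad class to cover both cases, or order handlers from specific to general. A small usage sketch, assuming the exception classes and a pool instance from this module are in scope:

```python
def try_mock(pool, resource_id) -> bool:
    """Return True if the resource could be mocked, False if it is missing or unusable."""
    try:
        with pool.mock_resource(resource_id, None) as (path, info):
            print(f'Resource {resource_id!r} available at {path}')
            return True
    except ResourceNotFoundError:
        # The specific subclass must be handled before its base class.
        print(f'Resource {resource_id!r} does not exist in the pool.')
        return False
    except InvalidResourceDataError:
        print(f'Resource {resource_id!r} exists but its data is invalid.')
        return False
```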
@@ -96,7 +109,10 @@ class DataPool:
     Abstract base class for data pool operations.

     This class defines the interface for data pool operations and provides a method
-    for batch downloading resources to a directory.
+    for batch downloading resources to a directory. Subclasses should implement the
+    `mock_resource` method to provide specific functionality for different types of data pools.
+
+    The DataPool class is designed to be extended for various data sources and storage mechanisms.
     """

     @contextmanager
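To illustrate the subclassing contract spelled out above, here is a hypothetical `DataPool` subclass whose `mock_resource` copies a file from a local directory into a temporary one; the class name, file naming scheme, and error handling are invented for the sketch, and it assumes `DataPool` and `ResourceNotFoundError` from this module are importable:

```python
import os
import shutil
from contextlib import contextmanager
from tempfile import TemporaryDirectory


class LocalDirDataPool(DataPool):
    """Toy pool that serves resources from a plain local directory."""

    def __init__(self, root_dir: str):
        self.root_dir = root_dir

    @contextmanager
    def mock_resource(self, resource_id, resource_info):
        # Yield (temporary directory, resource_info), matching the documented contract.
        src = os.path.join(self.root_dir, f'{resource_id}.bin')
        if not os.path.exists(src):
            raise ResourceNotFoundError(f'Resource {resource_id!r} not found.')
        with TemporaryDirectory() as td:
            shutil.copyfile(src, os.path.join(td, os.path.basename(src)))
            yield td, resource_info
```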
@@ -105,7 +121,8 @@ def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str,
         Context manager to mock a resource.

         This method should be implemented by subclasses to provide a way to temporarily
-        access a resource.
+        access a resource. It's typically used to download or generate a temporary copy
+        of the resource for processing.

         :param resource_id: The ID of the resource to mock.
         :param resource_info: Additional information about the resource.
@@ -120,6 +137,7 @@ def batch_download_to_directory(self, resource_ids, dst_dir: str, max_workers: i
         Download multiple resources to a directory.

         This method downloads a batch of resources to a specified directory, optionally saving metadata for each resource.
+        It uses a thread pool to parallelize downloads for improved performance.

         :param resource_ids: List of resource IDs or tuples of (resource_id, resource_info) to download.
         :type resource_ids: Iterable[Union[str, Tuple[str, Any]]]
@@ -133,6 +151,10 @@ def batch_download_to_directory(self, resource_ids, dst_dir: str, max_workers: i
         :type metainfo_fmt: str

         :raises OSError: If there's an issue creating the destination directory or copying files.
+
+        :example:
+            >>> data_pool = SomeDataPoolImplementation()
+            >>> data_pool.batch_download_to_directory(['resource1', 'resource2'], '/path/to/destination')
         """
         pg_res = tqdm(resource_ids, desc='Batch Downloading')
         pg_downloaded = tqdm(desc='Files Downloaded')
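The added sentence about thread-pool parallelism matches a common pattern: run `mock_resource` for each ID on a worker thread and copy the result into the destination directory. A condensed sketch of that idea (progress bars, metadata saving, and error handling omitted; this is not the method's actual body):

```python
import os
import shutil
from concurrent.futures import ThreadPoolExecutor


def batch_download_sketch(pool, resource_ids, dst_dir: str, max_workers: int = 12):
    os.makedirs(dst_dir, exist_ok=True)

    def _download_one(resource_id):
        # mock_resource yields a temporary directory; copy its files out
        # before the context manager cleans it up.
        with pool.mock_resource(resource_id, None) as (td, _):
            for name in os.listdir(td):
                shutil.copyfile(os.path.join(td, name), os.path.join(dst_dir, name))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # list() forces evaluation so worker exceptions propagate here.
        list(executor.map(_download_one, resource_ids))
```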
@@ -181,6 +203,7 @@ class HfBasedDataPool(DataPool):
     Implementation of DataPool for Hugging Face datasets.

     This class provides methods to interact with and download resources from Hugging Face datasets.
+    It handles the complexities of working with Hugging Face's repository structure and file organization.

     :param data_repo_id: The ID of the Hugging Face dataset repository.
     :type data_repo_id: str
@@ -190,6 +213,14 @@ class HfBasedDataPool(DataPool):
     :type idx_repo_id: str
     :param idx_revision: The revision of the index to use.
     :type idx_revision: str
+    :param hf_token: Optional Hugging Face authentication token.
+    :type hf_token: Optional[str]
+
+    :example:
+        >>> data_pool = HfBasedDataPool('username/dataset', data_revision='main')
+        >>> with data_pool.mock_resource('resource1', None) as (path, info):
+        ...     # Work with the resource at 'path'
+        ...     pass
     """

     def __init__(self, data_repo_id: str, data_revision: str = 'main',
@@ -223,13 +254,17 @@ def _make_tar_info(self, tar_file: str, force: bool = False):
         Create or retrieve information about a tar file.

         This method lists the files in a tar archive and maps them to resource IDs.
+        It caches the information to avoid repeated API calls.

         :param tar_file: The name of the tar file.
         :type tar_file: str
         :param force: Whether to force a refresh of the information.
         :type force: bool
         :return: A dictionary mapping resource IDs to lists of file paths.
         :rtype: dict
+
+        :raises EntryNotFoundError: If the specified tar file is not found in the repository.
+        :raises RepositoryNotFoundError: If the specified repository is not found.
         """
         key = _n_path(tar_file)
         if force or key not in self._tar_infos:
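Together with the caching check visible above, the `:return:` description (resource IDs mapped to lists of member paths) suggests roughly the following shape. The `list_archive_members` helper and the way a resource ID is derived from a member path are placeholders, not taken from this diff:

```python
def _make_tar_info_sketch(self, tar_file: str, force: bool = False) -> dict:
    # Cache by normalized path so repeated lookups hit the dict, not the Hub API.
    key = _n_path(tar_file)
    if force or key not in self._tar_infos:
        mapping = {}
        for member in list_archive_members(tar_file):  # placeholder for the real listing call
            resource_id = os.path.splitext(os.path.basename(member))[0]  # assumed ID scheme
            mapping.setdefault(resource_id, []).append(member)
        self._tar_infos[key] = mapping
    return self._tar_infos[key]
```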
@@ -279,6 +314,11 @@ def _request_resource_by_id(self, resource_id) -> List[DataLocation]:
         :param resource_id: The ID of the resource to request.
         :return: A list of DataLocation objects representing the resource's locations.
         :raises ResourceNotFoundError: If the resource is not found in any archive.
+
+        :example:
+            >>> data_pool = HfBasedDataPool('username/dataset')
+            >>> locations = data_pool._request_resource_by_id('resource1')
+            >>> print(locations[0].tar_file, locations[0].filename)
         """
         for archive_file in self._request_possible_archives(resource_id):
             try:
@@ -289,12 +329,25 @@ def _request_resource_by_id(self, resource_id) -> List[DataLocation]:

             if resource_id in info:
                 return [
-                    DataLocation(tar_file=archive_file, filename=file)
+                    DataLocation(resource_id=resource_id, tar_file=archive_file, filename=file)
                     for file in info[resource_id]
                 ]
         else:
             raise ResourceNotFoundError(f'Resource {resource_id!r} not found.')

+    def _get_dst_filename(self, location: DataLocation):
+        """
+        Get the destination filename for a given DataLocation.
+
+        This method determines the filename to use when saving a resource locally.
+
+        :param location: The DataLocation object containing information about the resource.
+        :type location: DataLocation
+        :return: The filename to use for the local copy of the resource.
+        :rtype: str
+        """
+        return os.path.basename(location.filename)
+
     @contextmanager
     def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str, Any]]:
         """
@@ -306,10 +359,17 @@ def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str,
         :param resource_info: Additional information about the resource.
         :return: A tuple containing the path to the temporary directory and the resource info.
         :raises ResourceNotFoundError: If the resource cannot be found or downloaded.
+
+        :example:
+            >>> data_pool = HfBasedDataPool('username/dataset')
+            >>> with data_pool.mock_resource('resource1', {'metadata': 'value'}) as (path, info):
+            ...     # Work with the resource at 'path'
+            ...     print(f"Resource path: {path}")
+            ...     print(f"Resource info: {info}")
         """
         with TemporaryDirectory() as td:
             for location in self._request_resource_by_id(resource_id):
-                dst_filename = os.path.join(td, os.path.basename(location.filename))
+                dst_filename = os.path.join(td, self._get_dst_filename(location))
                 hf_tar_file_download(
                     repo_id=self.data_repo_id,
                     repo_type='dataset',
@@ -330,7 +390,8 @@ def id_modulo_cut(id_text: str):
     """
     Cut an ID string into segments of 3 characters each, starting from the end.

-    This function is used to create a hierarchical structure for IDs.
+    This function is used to create a hierarchical structure for IDs, which can be useful
+    for organizing files in a directory structure based on their IDs.

     :param id_text: The ID string to cut.
     :type id_text: str
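Going by the description (3-character segments, cut from the end), a plausible standalone implementation and the kind of hierarchical path it enables are sketched below; the real function's return type and segment order may differ:

```python
from typing import List


def id_modulo_cut(id_text: str) -> List[str]:
    # Walk backwards in steps of 3 so the leftmost segment absorbs any remainder.
    segments = []
    while id_text:
        segments.append(id_text[-3:])
        id_text = id_text[:-3]
    return segments


print(id_modulo_cut('1234567'))            # ['567', '234', '1']
print('/'.join(id_modulo_cut('1234567')))  # '567/234/1' -- a shallow, balanced directory layout
```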