Skip to content

Commit cf4985b

Browse files
authored
Merge pull request #6 from deepghs/dev/table
dev(narugo): add table-based data pool support
2 parents 2f7ed52 + c23e500 commit cf4985b

20 files changed

+588
-22
lines changed

cheesechaser/datapool/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,15 @@
22
from .bangumibase import BangumiBaseDataPool
33
from .base import DataLocation, DataPool, HfBasedDataPool, IncrementIDDataPool, InvalidResourceDataError, FileUnrecognizableError, ResourceNotFoundError
44
from .civitai import CivitaiDataPool
5-
from .danbooru import DanbooruDataPool, DanbooruStableDataPool, DanbooruNewestDataPool, DanbooruWebpDataPool, DanbooruNewestWebpDataPool
5+
from .danbooru import DanbooruDataPool, DanbooruStableDataPool, DanbooruNewestDataPool, DanbooruWebpDataPool, DanbooruNewestWebpDataPool
66
from .fancaps import FancapsDataPool
77
from .gelbooru import GelbooruDataPool, GelbooruWebpDataPool
88
from .hentaicosplay import HentaiCosplayDataPool
99
from .konachan import KonachanDataPool
1010
from .nhentai import NHentaiImagesDataPool, NHentaiMangaDataPool
1111
from .nozomi import NozomiDataPool
1212
from .realbooru import RealbooruDataPool
13+
from .table import TableBasedHfDataPool, SimpleTableHfDataPool
1314
from .threedbooru import ThreedbooruDataPool
1415
from .yande import YandeDataPool
1516
from .zerochan import ZerochanWebpDataPool, ZerochanDataPool

cheesechaser/datapool/anime_pictures.py

+13
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
It defines a class `AnimePicturesDataPool` which inherits from `IncrementIDDataPool`.
55
This class is designed to manage and access a repository of anime pictures,
66
utilizing an incremental ID system for efficient data retrieval.
7+
8+
.. note::
9+
The dataset `deepghs/anime_pictures_full <https://huggingface.co/datasets/deepghs/anime_pictures_full>`_
10+
is gated; you must be granted access to it before using this module.
711
"""
12+
813
from typing import Optional
914

1015
from .base import IncrementIDDataPool
@@ -22,10 +27,13 @@ class AnimePicturesDataPool(IncrementIDDataPool):
2227
2328
:param revision: The revision of the data to use, defaults to 'main'.
2429
:type revision: str
30+
:param hf_token: Optional Hugging Face token for authentication, defaults to None.
31+
:type hf_token: Optional[str]
2532
2633
Usage:
2734
>>> pool = AnimePicturesDataPool()
2835
>>> pool = AnimePicturesDataPool(revision='v1.0')
36+
>>> pool = AnimePicturesDataPool(revision='main', hf_token='your_hf_token')
2937
3038
.. note::
3139
The class uses the same repository for both data and index storage.
@@ -35,8 +43,13 @@ def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
3543
"""
3644
Initialize the AnimePicturesDataPool.
3745
46+
This method sets up the data pool by calling the parent class constructor
47+
with specific parameters for the anime pictures repository.
48+
3849
:param revision: The revision of the data to use, defaults to 'main'.
3950
:type revision: str
51+
:param hf_token: Optional Hugging Face token for authentication, defaults to None.
52+
:type hf_token: Optional[str]
4053
"""
4154
IncrementIDDataPool.__init__(
4255
self,

cheesechaser/datapool/bangumibase.py

+28
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
The BangumiBase dataset is a comprehensive collection of anime and manga information.
55
This module extends the IncrementIDDataPool to specifically handle the BangumiBase dataset,
66
providing an easy-to-use interface for accessing and managing this data.
7+
8+
.. note::
9+
The dataset `deepghs/bangumibase_full <https://huggingface.co/datasets/deepghs/bangumibase_full>`_
10+
is gated; you must be granted access to it before using this module.
711
"""
12+
813
from typing import Optional
914

1015
from .base import IncrementIDDataPool
@@ -20,18 +25,41 @@ class BangumiBaseDataPool(IncrementIDDataPool):
2025
for the BangumiBase dataset. It simplifies the process of initializing the
2126
data pool with the correct repository and revision information.
2227
28+
The BangumiBaseDataPool allows users to easily interact with the BangumiBase
29+
dataset, providing methods for retrieving, updating, and managing anime and
30+
manga information.
31+
2332
:param revision: The specific revision of the BangumiBase dataset to use.
2433
Defaults to 'main'.
2534
:type revision: str
35+
:param hf_token: An optional Hugging Face token for accessing private or gated repositories.
36+
Defaults to None.
37+
:type hf_token: Optional[str]
38+
39+
:example:
40+
41+
To create a BangumiBaseDataPool instance:
42+
43+
>>> pool = BangumiBaseDataPool()
44+
>>> # Or with a specific revision
45+
>>> pool = BangumiBaseDataPool(revision='v1.2.3')
46+
>>> # Or with a Hugging Face token
47+
>>> pool = BangumiBaseDataPool(hf_token='your_hf_token_here')
2648
"""
2749

2850
def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
2951
"""
3052
Initialize the BangumiBaseDataPool.
3153
54+
This constructor sets up the data pool with the BangumiBase dataset repository
55+
and the specified revision. It uses the same repository for both data and index.
56+
3257
:param revision: The specific revision of the BangumiBase dataset to use.
3358
Defaults to 'main'.
3459
:type revision: str
60+
:param hf_token: An optional Hugging Face token for accessing private or gated repositories.
61+
Defaults to None.
62+
:type hf_token: Optional[str]
3563
"""
3664
IncrementIDDataPool.__init__(
3765
self,

cheesechaser/datapool/base.py

+67-6
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,14 @@ class DataLocation:
4242
"""
4343
Represents the location of a file within a tar archive.
4444
45+
:param resource_id: The unique identifier for the resource.
46+
:type resource_id: int
4547
:param tar_file: The name of the tar file containing the data.
4648
:type tar_file: str
4749
:param filename: The name of the file within the tar archive.
4850
:type filename: str
4951
"""
52+
resource_id: int
5053
tar_file: str
5154
filename: str
5255

@@ -56,7 +59,8 @@ def _n_path(path):
5659
Normalize a file path.
5760
5861
This function takes a file path and normalizes it by joining it with the root directory
59-
and then normalizing the resulting path.
62+
and then normalizing the resulting path. It's useful for ensuring consistent path formats
63+
across different operating systems.
6064
6165
:param path: The file path to normalize.
6266
:type path: str
@@ -73,20 +77,29 @@ def _n_path(path):
7377
class InvalidResourceDataError(Exception):
7478
"""
7579
Base exception for invalid resource data.
80+
81+
This exception is raised when there's an issue with the resource data that prevents
82+
it from being processed or used correctly.
7683
"""
7784
pass
7885

7986

8087
class ResourceNotFoundError(InvalidResourceDataError):
8188
"""
8289
Exception raised when a requested resource is not found.
90+
91+
This exception is typically raised when attempting to access or download a resource
92+
that does not exist in the data pool.
8393
"""
8494
pass
8595

8696

8797
class FileUnrecognizableError(Exception):
8898
"""
8999
Exception raised when a file cannot be recognized or processed.
100+
101+
This exception is used when the system encounters a file that it cannot parse or
102+
interpret according to the expected format or structure.
90103
"""
91104
pass
92105

@@ -96,7 +109,10 @@ class DataPool:
96109
Abstract base class for data pool operations.
97110
98111
This class defines the interface for data pool operations and provides a method
99-
for batch downloading resources to a directory.
112+
for batch downloading resources to a directory. Subclasses should implement the
113+
`mock_resource` method to provide specific functionality for different types of data pools.
114+
115+
The DataPool class is designed to be extended for various data sources and storage mechanisms.
100116
"""
101117

102118
@contextmanager
@@ -105,7 +121,8 @@ def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str,
105121
Context manager to mock a resource.
106122
107123
This method should be implemented by subclasses to provide a way to temporarily
108-
access a resource.
124+
access a resource. It's typically used to download or generate a temporary copy
125+
of the resource for processing.
109126
110127
:param resource_id: The ID of the resource to mock.
111128
:param resource_info: Additional information about the resource.
@@ -120,6 +137,7 @@ def batch_download_to_directory(self, resource_ids, dst_dir: str, max_workers: i
120137
Download multiple resources to a directory.
121138
122139
This method downloads a batch of resources to a specified directory, optionally saving metadata for each resource.
140+
It uses a thread pool to parallelize downloads for improved performance.
123141
124142
:param resource_ids: List of resource IDs or tuples of (resource_id, resource_info) to download.
125143
:type resource_ids: Iterable[Union[str, Tuple[str, Any]]]
@@ -133,6 +151,10 @@ def batch_download_to_directory(self, resource_ids, dst_dir: str, max_workers: i
133151
:type metainfo_fmt: str
134152
135153
:raises OSError: If there's an issue creating the destination directory or copying files.
154+
155+
:example:
156+
>>> data_pool = SomeDataPoolImplementation()
157+
>>> data_pool.batch_download_to_directory(['resource1', 'resource2'], '/path/to/destination')
136158
"""
137159
pg_res = tqdm(resource_ids, desc='Batch Downloading')
138160
pg_downloaded = tqdm(desc='Files Downloaded')
@@ -181,6 +203,7 @@ class HfBasedDataPool(DataPool):
181203
Implementation of DataPool for Hugging Face datasets.
182204
183205
This class provides methods to interact with and download resources from Hugging Face datasets.
206+
It handles the complexities of working with Hugging Face's repository structure and file organization.
184207
185208
:param data_repo_id: The ID of the Hugging Face dataset repository.
186209
:type data_repo_id: str
@@ -190,6 +213,14 @@ class HfBasedDataPool(DataPool):
190213
:type idx_repo_id: str
191214
:param idx_revision: The revision of the index to use.
192215
:type idx_revision: str
216+
:param hf_token: Optional Hugging Face authentication token.
217+
:type hf_token: Optional[str]
218+
219+
:example:
220+
>>> data_pool = HfBasedDataPool('username/dataset', data_revision='main')
221+
>>> with data_pool.mock_resource('resource1', None) as (path, info):
222+
... # Work with the resource at 'path'
223+
... pass
193224
"""
194225

195226
def __init__(self, data_repo_id: str, data_revision: str = 'main',
@@ -223,13 +254,17 @@ def _make_tar_info(self, tar_file: str, force: bool = False):
223254
Create or retrieve information about a tar file.
224255
225256
This method lists the files in a tar archive and maps them to resource IDs.
257+
It caches the information to avoid repeated API calls.
226258
227259
:param tar_file: The name of the tar file.
228260
:type tar_file: str
229261
:param force: Whether to force a refresh of the information.
230262
:type force: bool
231263
:return: A dictionary mapping resource IDs to lists of file paths.
232264
:rtype: dict
265+
266+
:raises EntryNotFoundError: If the specified tar file is not found in the repository.
267+
:raises RepositoryNotFoundError: If the specified repository is not found.
233268
"""
234269
key = _n_path(tar_file)
235270
if force or key not in self._tar_infos:
@@ -279,6 +314,11 @@ def _request_resource_by_id(self, resource_id) -> List[DataLocation]:
279314
:param resource_id: The ID of the resource to request.
280315
:return: A list of DataLocation objects representing the resource's locations.
281316
:raises ResourceNotFoundError: If the resource is not found in any archive.
317+
318+
:example:
319+
>>> data_pool = HfBasedDataPool('username/dataset')
320+
>>> locations = data_pool._request_resource_by_id('resource1')
321+
>>> print(locations[0].tar_file, locations[0].filename)
282322
"""
283323
for archive_file in self._request_possible_archives(resource_id):
284324
try:
@@ -289,12 +329,25 @@ def _request_resource_by_id(self, resource_id) -> List[DataLocation]:
289329

290330
if resource_id in info:
291331
return [
292-
DataLocation(tar_file=archive_file, filename=file)
332+
DataLocation(resource_id=resource_id, tar_file=archive_file, filename=file)
293333
for file in info[resource_id]
294334
]
295335
else:
296336
raise ResourceNotFoundError(f'Resource {resource_id!r} not found.')
297337

338+
def _get_dst_filename(self, location: DataLocation):
339+
"""
340+
Get the destination filename for a given DataLocation.
341+
342+
This method determines the filename to use when saving a resource locally.
343+
344+
:param location: The DataLocation object containing information about the resource.
345+
:type location: DataLocation
346+
:return: The filename to use for the local copy of the resource.
347+
:rtype: str
348+
"""
349+
return os.path.basename(location.filename)
350+
298351
@contextmanager
299352
def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str, Any]]:
300353
"""
@@ -306,10 +359,17 @@ def mock_resource(self, resource_id, resource_info) -> ContextManager[Tuple[str,
306359
:param resource_info: Additional information about the resource.
307360
:return: A tuple containing the path to the temporary directory and the resource info.
308361
:raises ResourceNotFoundError: If the resource cannot be found or downloaded.
362+
363+
:example:
364+
>>> data_pool = HfBasedDataPool('username/dataset')
365+
>>> with data_pool.mock_resource('resource1', {'metadata': 'value'}) as (path, info):
366+
... # Work with the resource at 'path'
367+
... print(f"Resource path: {path}")
368+
... print(f"Resource info: {info}")
309369
"""
310370
with TemporaryDirectory() as td:
311371
for location in self._request_resource_by_id(resource_id):
312-
dst_filename = os.path.join(td, os.path.basename(location.filename))
372+
dst_filename = os.path.join(td, self._get_dst_filename(location))
313373
hf_tar_file_download(
314374
repo_id=self.data_repo_id,
315375
repo_type='dataset',
@@ -330,7 +390,8 @@ def id_modulo_cut(id_text: str):
330390
"""
331391
Cut an ID string into segments of 3 characters each, starting from the end.
332392
333-
This function is used to create a hierarchical structure for IDs.
393+
This function is used to create a hierarchical structure for IDs, which can be useful
394+
for organizing files in a directory structure based on their IDs.
334395
335396
:param id_text: The ID string to cut.
336397
:type id_text: str

cheesechaser/datapool/civitai.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,13 @@
33
44
The CivitaiDataPool class extends the IncrementIDDataPool to specifically handle data from Civitai.
55
It uses a predefined repository to store and retrieve data.
6+
7+
Classes:
8+
CivitaiDataPool: A data pool class for managing Civitai data.
9+
10+
.. note::
11+
The dataset `deepghs/civitai_full <https://huggingface.co/datasets/deepghs/civitai_full>`_
12+
is gated; you must be granted access to it before using this module.
613
"""
714

815
from typing import Optional
@@ -22,17 +29,31 @@ class CivitaiDataPool(IncrementIDDataPool):
2229
2330
:param revision: The specific revision of the data to use, defaults to 'main'.
2431
:type revision: str
32+
:param hf_token: An optional Hugging Face token for authentication, defaults to None.
33+
:type hf_token: Optional[str]
2534
2635
Usage:
2736
>>> civitai_pool = CivitaiDataPool()
37+
>>> civitai_pool_with_token = CivitaiDataPool(hf_token='your_token_here')
38+
>>> specific_revision_pool = CivitaiDataPool(revision='v1.0')
39+
40+
.. note::
41+
The CivitaiDataPool uses a predefined repository (_CIVITAI_REPO) for both data and index storage.
42+
This ensures consistency and ease of use when working with Civitai data.
2843
"""
2944

3045
def __init__(self, revision: str = 'main', hf_token: Optional[str] = None):
3146
"""
32-
Initialize the CivitaiDataPool with the specified revision.
47+
Initialize the CivitaiDataPool with the specified revision and optional Hugging Face token.
3348
3449
:param revision: The specific revision of the data to use, defaults to 'main'.
3550
:type revision: str
51+
:param hf_token: An optional Hugging Face token for authentication, defaults to None.
52+
:type hf_token: Optional[str]
53+
54+
This method sets up the CivitaiDataPool by initializing the parent IncrementIDDataPool
55+
with specific parameters tailored for Civitai data. It uses the same repository for
56+
both data and index storage, ensuring data consistency.
3657
"""
3758
IncrementIDDataPool.__init__(
3859
self,

0 commit comments

Comments
 (0)