Skip to content

Commit 398cca6

Browse files
authored
Merge pull request #64 from deepghs/dev/idxcache
dev(narugo): use lru cache for index caches
2 parents 5103e66 + 32d3c54 commit 398cca6

File tree

8 files changed

+171
-8
lines changed

8 files changed

+171
-8
lines changed

docs/source/api_doc/index/fetch.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,10 @@ hf_tar_file_download
6161

6262

6363

64+
hf_tar_cache_reset
65+
----------------------------------------------
66+
67+
.. autofunction:: hf_tar_cache_reset
68+
69+
70+

docs/source/api_doc/index/local_fetch.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,10 @@ tar_file_download
4949

5050

5151

52+
tar_cache_reset
53+
----------------------------------------------
54+
55+
.. autofunction:: tar_cache_reset
56+
57+
58+

hfutils/index/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
2-
ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info
3-
from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, tar_list_files
2+
ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info, \
3+
hf_tar_cache_reset
4+
from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, \
5+
tar_list_files, tar_cache_reset
46
from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory, \
57
tar_create_index_for_directory
68
from .validate import hf_tar_item_validate, hf_tar_validate

hfutils/index/fetch.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from collections import defaultdict
55
from typing import Optional, Dict, Union, List
66

7+
from cachetools import LRUCache
78
from huggingface_hub.file_download import http_get, hf_hub_url
89
from huggingface_hub.utils import build_hf_headers
910
from tqdm import tqdm
@@ -26,7 +27,7 @@ class ArchiveStandaloneFileHashNotMatch(Exception):
2627

2728

2829
_HF_TAR_IDX_LOCKS = defaultdict(threading.Lock)
29-
_HF_TAR_IDX_CACHE = {}
30+
_HF_TAR_IDX_CACHE = LRUCache(maxsize=192)
3031

3132

3233
def _hf_tar_get_cache_key(repo_id: str, archive_in_repo: str,
@@ -153,7 +154,7 @@ def hf_tar_get_index(repo_id: str, archive_in_repo: str,
153154
return idx_data
154155

155156

156-
_HF_TAR_IDX_PFILES_CACHE = {}
157+
_HF_TAR_IDX_PFILES_CACHE = LRUCache(maxsize=192)
157158

158159

159160
def _hf_tar_get_processed_files(repo_id: str, archive_in_repo: str,
@@ -718,3 +719,27 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st
718719
if os.path.exists(local_file):
719720
os.remove(local_file)
720721
raise
722+
723+
724+
def hf_tar_cache_reset(maxsize: Optional[int] = None):
    """
    Empty the module-level tar index caches and, if requested, change their capacity.

    Both ``_HF_TAR_IDX_CACHE`` and ``_HF_TAR_IDX_PFILES_CACHE`` are always cleared in place.
    When ``maxsize`` is given and differs from the current ``maxsize`` of
    ``_HF_TAR_IDX_CACHE``, both module globals are additionally rebound to new
    ``LRUCache`` instances created with that capacity.

    :param maxsize: Capacity for the rebuilt caches. With ``None`` (the default) the
        existing cache objects are kept and only cleared.
    :type maxsize: Optional[int]

    Example::

        >>> hf_tar_cache_reset()  # Clear caches
        >>> hf_tar_cache_reset(maxsize=256)  # Clear and resize caches
    """
    global _HF_TAR_IDX_CACHE, _HF_TAR_IDX_PFILES_CACHE

    # The existing cache objects are emptied in place before any rebinding happens.
    for cache in (_HF_TAR_IDX_CACHE, _HF_TAR_IDX_PFILES_CACHE):
        cache.clear()

    if maxsize is None:
        return

    # Only the index cache's capacity is compared; both caches are rebuilt together.
    if _HF_TAR_IDX_CACHE.maxsize != maxsize:
        _HF_TAR_IDX_CACHE = LRUCache(maxsize=maxsize)
        _HF_TAR_IDX_PFILES_CACHE = LRUCache(maxsize=maxsize)

hfutils/index/local_fetch.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@
1414
import os
1515
from typing import Optional, List
1616

17-
_TAR_IDX_CACHE = {}
17+
from cachetools import LRUCache
18+
19+
_TAR_IDX_CACHE = LRUCache(maxsize=192)
1820

1921

2022
def _tar_get_cache_key(archive_file: str, idx_file: Optional[str] = None):
@@ -77,7 +79,7 @@ def tar_get_index(archive_file: str, idx_file: Optional[str] = None, no_cache: b
7779
return idx_data
7880

7981

80-
_TAR_IDX_PFILES_CACHE = {}
82+
_TAR_IDX_PFILES_CACHE = LRUCache(maxsize=192)
8183

8284

8385
def _tar_get_processed_files(archive_file: str, idx_file: Optional[str] = None, no_cache: bool = False):
@@ -335,3 +337,26 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,
335337
if os.path.exists(local_file):
336338
os.remove(local_file)
337339
raise
340+
341+
342+
def tar_cache_reset(maxsize: Optional[int] = None):
    """
    Empty the module-level local tar index caches and, if requested, change their capacity.

    Both ``_TAR_IDX_CACHE`` and ``_TAR_IDX_PFILES_CACHE`` are always cleared in place.
    When ``maxsize`` is given and differs from the current ``maxsize`` of
    ``_TAR_IDX_CACHE``, both module globals are additionally rebound to new
    ``LRUCache`` instances created with that capacity.

    :param maxsize: Capacity for the rebuilt caches. With ``None`` (the default) the
        existing cache objects are kept and only cleared.
    :type maxsize: Optional[int]

    Example::

        >>> tar_cache_reset(maxsize=256)  # Reset and resize caches
        >>> tar_cache_reset()  # Just clear the existing caches
    """
    global _TAR_IDX_CACHE, _TAR_IDX_PFILES_CACHE

    # The existing cache objects are emptied in place before any rebinding happens.
    for cache in (_TAR_IDX_CACHE, _TAR_IDX_PFILES_CACHE):
        cache.clear()

    if maxsize is None:
        return

    # Only the index cache's capacity is compared; both caches are rebuilt together.
    if _TAR_IDX_CACHE.maxsize != maxsize:
        _TAR_IDX_CACHE = LRUCache(maxsize=maxsize)
        _TAR_IDX_PFILES_CACHE = LRUCache(maxsize=maxsize)

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ urlobject
99
fsspec>=2024; python_version > '3.8'
1010
fsspec>=2024,<=2025.3.0; python_version <= '3.8'
1111
random_user_agent
12+
cachetools

test/index/test_fetch.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import os.path
2+
from unittest.mock import patch, MagicMock
23

34
import pytest
45
from hbutils.testing import isolated_directory
56
from natsort import natsorted
67

78
from hfutils.index import hf_tar_list_files, hf_tar_file_exists, hf_tar_file_download, hf_tar_file_info, \
8-
hf_tar_file_size
9+
hf_tar_file_size, hf_tar_cache_reset
910
from test.testings import get_testfile, file_compare
1011

1112

@@ -201,3 +202,50 @@ def test_hf_tar_file_download_empty(self):
201202
local_file='2946001.',
202203
)
203204
assert os.path.getsize('2946001.') == 0
205+
206+
207+
@pytest.fixture
def mock_lru_cache():
    """
    Replace the two cache globals and the ``LRUCache`` name in ``hfutils.index.fetch`` with mocks.

    Yields a ``(index_cache_mock, pfiles_cache_mock, lru_class_mock)`` tuple. The patches
    are reverted when the fixture is torn down.
    """
    idx_cache_mock = MagicMock()
    pfiles_cache_mock = MagicMock()

    with patch('hfutils.index.fetch._HF_TAR_IDX_CACHE', idx_cache_mock):
        with patch('hfutils.index.fetch._HF_TAR_IDX_PFILES_CACHE', pfiles_cache_mock):
            with patch('hfutils.index.fetch.LRUCache') as lru_cls_mock:
                yield idx_cache_mock, pfiles_cache_mock, lru_cls_mock
217+
218+
219+
@pytest.mark.unittest
class TestHfTarCacheReset:
    """Unit tests for ``hf_tar_cache_reset`` executed against mocked caches."""

    def test_reset_without_maxsize(self, mock_lru_cache):
        """Without an argument, each cache is cleared exactly once."""
        idx_cache, pfiles_cache, _ = mock_lru_cache

        hf_tar_cache_reset()

        for cache in (idx_cache, pfiles_cache):
            cache.clear.assert_called_once()

    def test_reset_with_same_maxsize(self, mock_lru_cache):
        """With an unchanged maxsize, caches are cleared but no new cache is built."""
        idx_cache, pfiles_cache, lru_cls = mock_lru_cache
        idx_cache.maxsize = 100

        hf_tar_cache_reset(maxsize=100)

        for cache in (idx_cache, pfiles_cache):
            cache.clear.assert_called_once()
        lru_cls.assert_not_called()

    def test_reset_with_different_maxsize(self, mock_lru_cache):
        """With a new maxsize, caches are cleared and two new caches are built."""
        idx_cache, pfiles_cache, lru_cls = mock_lru_cache
        idx_cache.maxsize = 100

        hf_tar_cache_reset(maxsize=200)

        for cache in (idx_cache, pfiles_cache):
            cache.clear.assert_called_once()
        # One construction per cache global, each with the requested capacity.
        assert lru_cls.call_count == 2
        lru_cls.assert_called_with(maxsize=200)

test/index/test_local_fetch.py

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
import os.path
2+
from unittest.mock import MagicMock, patch
23

34
import pytest
45
from hbutils.testing import isolated_directory
56
from natsort import natsorted
67

78
from hfutils.index import tar_list_files, tar_file_exists, tar_file_download, tar_file_info, \
8-
tar_file_size
9+
tar_file_size, tar_cache_reset
910
from test.testings import get_testfile, file_compare
1011

1112

@@ -178,3 +179,50 @@ def test_tar_file_download_empty(self, local_narugo_test_cos5t_tars):
178179
local_file='empty_file',
179180
)
180181
assert os.path.getsize('empty_file') == 0
182+
183+
184+
@pytest.fixture
def mock_lru_cache():
    """
    Replace the two cache globals and the ``LRUCache`` name in ``hfutils.index.local_fetch`` with mocks.

    Yields a ``(index_cache_mock, pfiles_cache_mock, lru_class_mock)`` tuple. The patches
    are reverted when the fixture is torn down.
    """
    idx_cache_mock = MagicMock()
    pfiles_cache_mock = MagicMock()

    with patch('hfutils.index.local_fetch._TAR_IDX_CACHE', idx_cache_mock):
        with patch('hfutils.index.local_fetch._TAR_IDX_PFILES_CACHE', pfiles_cache_mock):
            with patch('hfutils.index.local_fetch.LRUCache') as lru_cls_mock:
                yield idx_cache_mock, pfiles_cache_mock, lru_cls_mock
194+
195+
196+
@pytest.mark.unittest
class TestTarCacheReset:
    """Unit tests for ``tar_cache_reset`` executed against mocked caches."""

    def test_reset_without_maxsize(self, mock_lru_cache):
        """Without an argument, each cache is cleared exactly once."""
        idx_cache, pfiles_cache, _ = mock_lru_cache

        tar_cache_reset()

        for cache in (idx_cache, pfiles_cache):
            cache.clear.assert_called_once()

    def test_reset_with_same_maxsize(self, mock_lru_cache):
        """With an unchanged maxsize, caches are cleared but no new cache is built."""
        idx_cache, pfiles_cache, lru_cls = mock_lru_cache
        idx_cache.maxsize = 100

        tar_cache_reset(maxsize=100)

        for cache in (idx_cache, pfiles_cache):
            cache.clear.assert_called_once()
        lru_cls.assert_not_called()

    def test_reset_with_different_maxsize(self, mock_lru_cache):
        """With a new maxsize, caches are cleared and two new caches are built."""
        idx_cache, pfiles_cache, lru_cls = mock_lru_cache
        idx_cache.maxsize = 100

        tar_cache_reset(maxsize=200)

        for cache in (idx_cache, pfiles_cache):
            cache.clear.assert_called_once()
        # One construction per cache global, each with the requested capacity.
        assert lru_cls.call_count == 2
        lru_cls.assert_called_with(maxsize=200)

0 commit comments

Comments
 (0)