Skip to content

Commit 0bbe2a5

Browse files
jeandetclaude
andcommitted
Don't cache HTTP error responses in the remote file cache
_cached_get_remote_file stored resp.bytes without checking the HTTP status (unlike its non-cached sibling _remote_open): a transient 502 Bad Gateway page got cached as the file content, permanently poisoning the cache for that URL — every later read handed the HTML error page to the consumer (e.g. netCDF4 failing with "Unknown file format" long after the server recovered). Raise IOError on non-200 responses before touching the cache, exactly like _remote_open does. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1 parent 377c58b commit 0bbe2a5

2 files changed

Lines changed: 22 additions & 1 deletion

File tree

speasy/core/any_files.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ def _cached_get_remote_file(url, timeout: int = http.DEFAULT_TIMEOUT, headers: d
8383
entry = get_item(url)
8484
if not isinstance(entry, CacheItem) or (not prefer_cache and _is_outdated(entry, url)):
8585
resp = http.urlopen(url=url, headers=headers, timeout=timeout)
86+
if resp.status != 200:
87+
# Never cache error responses: a transient 502 page stored as
88+
# the file content poisons the cache until manually purged.
89+
raise IOError(f"Could not open remote file {url}: HTTP {resp.status}")
8690
last_modified = resp.headers.get('last-modified', str(datetime.now()))
8791
if 'b' in mode:
8892
entry = CacheItem(data=resp.bytes, version=last_modified)

tests/test_file_access.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,9 @@
1111
from ddt import ddt, data, unpack
1212

1313
from speasy.core.any_files import any_loc_open, list_files
14-
from speasy.core.cache import drop_item
14+
from speasy.core.cache import drop_item, get_item
1515
from multiprocessing import Value, Process
16+
from unittest.mock import patch, MagicMock
1617

1718
_HERE_ = os.path.dirname(os.path.abspath(__file__))
1819

@@ -65,6 +66,22 @@ def test_simple_remote_bin_file_with_rewrite_rules(self):
6566
self.assertIsNotNone(f)
6667
self.assertIn(b'NSSDC Common Data Format', f.read(100))
6768

69+
def test_http_error_responses_raise_and_are_not_cached(self):
70+
# A transient 502 used to be stored in the cache as the file content,
71+
# permanently poisoning it: every later read handed the HTML error
72+
# page to the consumer (e.g. netCDF4 -> "Unknown file format").
73+
url = "https://test.invalid/some_file_behind_flaky_server.nc"
74+
drop_item(url)
75+
error_resp = MagicMock()
76+
error_resp.status = 502
77+
error_resp.bytes = b"<html>502 Bad Gateway</html>"
78+
error_resp.text = "<html>502 Bad Gateway</html>"
79+
error_resp.headers = {}
80+
with patch("speasy.core.any_files.http.urlopen", return_value=error_resp):
81+
with self.assertRaises(IOError):
82+
any_loc_open(url, mode='rb', cache_remote_files=True)
83+
self.assertIsNone(get_item(url), "HTTP error body must never be cached")
84+
6885
def test_cached_remote_bin_file(self):
6986
drop_item("https://hephaistos.lpp.polytechnique.fr/data/LFR/SW/LFR-FSW/3.0.0.0/fsw")
7087
start = datetime.now()

0 commit comments

Comments
 (0)