Skip to content

Commit ab525d4

Browse files
Chessing234claude and Claude committed
Shorten cache filenames to fit eCryptfs 143-byte NAME_MAX limit
url_to_filename() was appending the full trailing URL path component (e.g. tfidf_vectors_sparse.npz) to the hash-based filename, producing names up to 154 characters. This exceeds the 143-byte NAME_MAX on eCryptfs-encrypted filesystems, causing OSError: File name too long. Now only the file extension is preserved (e.g. .npz), keeping the worst-case filename (including .json sidecar) under 143 bytes. _find_existing_cache_file() matches both old and new filename formats for backward compatibility. Fixes #539 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent eacccd4 commit ab525d4

2 files changed

Lines changed: 55 additions & 2 deletions

File tree

scispacy/file_cache.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,15 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
5555
Convert `url` into a hashed filename in a repeatable way.
5656
If `etag` is specified, append its hash to the url's, delimited
5757
by a period.
58-
"""
5958
59+
Only the file extension from the original URL is preserved (not the
60+
full trailing path component) to keep filenames short enough for
61+
filesystems with a 143-byte NAME_MAX (e.g. eCryptfs).
62+
See: https://github.com/allenai/scispacy/issues/539
63+
"""
6064
last_part = url.split("/")[-1]
65+
_, ext = os.path.splitext(last_part)
66+
6167
url_bytes = url.encode("utf-8")
6268
url_hash = sha256(url_bytes)
6369
filename = url_hash.hexdigest()
@@ -67,7 +73,7 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
6773
etag_hash = sha256(etag_bytes)
6874
filename += "." + etag_hash.hexdigest()
6975

70-
filename += "." + last_part
76+
filename += ext
7177
return filename
7278

7379

@@ -106,6 +112,33 @@ def http_get(url: str, temp_file: IO) -> None:
106112
pbar.close()
107113

108114

115+
def _find_existing_cache_file(url: str, cache_dir: str) -> Optional[str]:
    """
    Return the path of an already-cached file for ``url``, or ``None``.

    Since the cache filename includes the etag hash (which we may not have
    without a network call), we look for any regular file in ``cache_dir``
    whose name starts with the sha256 hex digest of the URL.

    Supports both the old filename format
    (``<url_hash>.<etag_hash>.<last_part>``) and the new, shorter format
    (``<url_hash>.<etag_hash>.<ext>``) introduced to keep names under the
    143-byte NAME_MAX of eCryptfs-encrypted filesystems.
    See: https://github.com/allenai/scispacy/issues/539

    Parameters
    ----------
    url : str
        The URL whose cached copy we are looking for.
    cache_dir : str
        Directory to search for cache files.

    Returns
    -------
    Optional[str]
        Full path of a matching cache file, or ``None`` if none exists.
    """
    url_hash = sha256(url.encode("utf-8")).hexdigest()
    last_part = url.split("/")[-1]
    _, ext = os.path.splitext(last_part)

    # Sort the listing so that, if both an old-format and a new-format file
    # exist for the same URL, the returned path is deterministic across runs
    # (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(cache_dir)):
        # Skip .json metadata sidecars and anything that isn't a regular file.
        if filename.endswith(".json") or not os.path.isfile(
            os.path.join(cache_dir, filename)
        ):
            continue
        if not filename.startswith(url_hash):
            continue
        # Old format ends with the full trailing path component; new format
        # ends with just the extension.  NOTE: for a URL with no extension,
        # ``ext`` is "" and endswith("") is always True, so any file with the
        # right hash prefix matches -- acceptable, since the hash already
        # identifies the URL uniquely.
        if filename.endswith("." + last_part) or filename.endswith(ext):
            return os.path.join(cache_dir, filename)
    return None
139+
140+
141+
109142
def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
110143
"""
111144
Given a URL, look for the corresponding dataset in the local cache.

tests/test_file_cache.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,23 @@ def test_url_to_filename_with_etags_eliminates_quotes(self):
5959
back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
6060
assert back_to_url == url
6161
assert etag == "mytag"
62+
63+
def test_url_to_filename_length_under_ecryptfs_limit(self):
64+
"""Filenames (including .json sidecar) must stay under 143 bytes for eCryptfs.
65+
See: https://github.com/allenai/scispacy/issues/539
66+
"""
67+
# These are the actual URLs used by scispacy linkers
68+
urls = [
69+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin',
70+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib',
71+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz',
72+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json',
73+
]
74+
# Simulate a realistic 64-char hex ETag
75+
long_etag = '"d41d8cd98f00b204e9800998ecf8427ed41d8cd98f00b204e9800998ecf8427e"'
76+
for url in urls:
77+
filename = url_to_filename(url, etag=long_etag)
78+
meta_filename = filename + ".json"
79+
assert len(meta_filename) < 143, (
80+
f"Metadata filename too long for eCryptfs ({len(meta_filename)} >= 143): {meta_filename}"
81+
)

0 commit comments

Comments
 (0)