Skip to content

Commit ab525d4

Browse files
Chessing234claude and Claude committed
Shorten cache filenames to fit eCryptfs 143-byte NAME_MAX limit
url_to_filename() was appending the full trailing URL path component (e.g. tfidf_vectors_sparse.npz) to the hash-based filename, producing names up to 154 characters. This exceeds the 143-byte NAME_MAX on eCryptfs-encrypted filesystems, causing OSError: File name too long. Now only the file extension is preserved (e.g. .npz), keeping the worst-case filename (including .json sidecar) under 143 bytes. _find_existing_cache_file() matches both old and new filename formats for backward compatibility. Fixes #539 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent eacccd4 commit ab525d4

2 files changed

Lines changed: 55 additions & 2 deletions

File tree

scispacy/file_cache.py

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,15 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
5555
Convert `url` into a hashed filename in a repeatable way.
5656
If `etag` is specified, append its hash to the url's, delimited
5757
by a period.
58-
"""
5958
59+
Only the file extension from the original URL is preserved (not the
60+
full trailing path component) to keep filenames short enough for
61+
filesystems with a 143-byte NAME_MAX (e.g. eCryptfs).
62+
See: https://github.com/allenai/scispacy/issues/539
63+
"""
6064
last_part = url.split("/")[-1]
65+
_, ext = os.path.splitext(last_part)
66+
6167
url_bytes = url.encode("utf-8")
6268
url_hash = sha256(url_bytes)
6369
filename = url_hash.hexdigest()
@@ -67,7 +73,7 @@ def url_to_filename(url: str, etag: Optional[str] = None) -> str:
6773
etag_hash = sha256(etag_bytes)
6874
filename += "." + etag_hash.hexdigest()
6975

70-
filename += "." + last_part
76+
filename += ext
7177
return filename
7278

7379

@@ -106,6 +112,33 @@ def http_get(url: str, temp_file: IO) -> None:
106112
pbar.close()
107113

108114

115+
def _find_existing_cache_file(url: str, cache_dir: str) -> Optional[str]:
    """
    Return the path of an already-cached file for ``url``, or ``None``.

    Since the cache filename includes the etag hash (which we may not have
    without a network call), we look for any regular file in ``cache_dir``
    whose name starts with the sha256 hex digest of the URL.

    Supports both the old filename format
    (``<url_hash>.<etag_hash>.<last_part>``) and the new, shorter format
    (``<url_hash>.<etag_hash>.<ext>``) introduced to keep names under the
    143-byte NAME_MAX of eCryptfs-encrypted filesystems.
    See: https://github.com/allenai/scispacy/issues/539

    Parameters
    ----------
    url : str
        The URL whose cached copy we are looking for.
    cache_dir : str
        Directory to search for cache files.

    Returns
    -------
    Optional[str]
        Full path of a matching cache file, or ``None`` if none exists.
    """
    url_hash = sha256(url.encode("utf-8")).hexdigest()
    last_part = url.split("/")[-1]
    _, ext = os.path.splitext(last_part)

    # Sort the listing so that, if both an old-format and a new-format file
    # exist for the same URL, the returned path is deterministic across runs
    # (os.listdir order is arbitrary).
    for filename in sorted(os.listdir(cache_dir)):
        # Skip .json metadata sidecars and anything that isn't a regular file.
        if filename.endswith(".json") or not os.path.isfile(
            os.path.join(cache_dir, filename)
        ):
            continue
        if not filename.startswith(url_hash):
            continue
        # Old format ends with the full trailing path component; new format
        # ends with just the extension.  NOTE: for a URL with no extension,
        # ``ext`` is "" and endswith("") is always True, so any file with the
        # right hash prefix matches -- acceptable, since the hash already
        # identifies the URL uniquely.
        if filename.endswith("." + last_part) or filename.endswith(ext):
            return os.path.join(cache_dir, filename)
    return None
139+
140+
141+
109142
def get_from_cache(url: str, cache_dir: Optional[str] = None) -> str:
110143
"""
111144
Given a URL, look for the corresponding dataset in the local cache.

tests/test_file_cache.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,23 @@ def test_url_to_filename_with_etags_eliminates_quotes(self):
5959
back_to_url, etag = filename_to_url(filename, cache_dir=self.TEST_DIR)
6060
assert back_to_url == url
6161
assert etag == "mytag"
62+
63+
def test_url_to_filename_length_under_ecryptfs_limit(self):
64+
"""Filenames (including .json sidecar) must stay under 143 bytes for eCryptfs.
65+
See: https://github.com/allenai/scispacy/issues/539
66+
"""
67+
# These are the actual URLs used by scispacy linkers
68+
urls = [
69+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/nmslib_index.bin',
70+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectorizer.joblib',
71+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/tfidf_vectors_sparse.npz',
72+
'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2023-04-23/umls/concept_aliases.json',
73+
]
74+
# Simulate a realistic 64-char hex ETag
75+
long_etag = '"d41d8cd98f00b204e9800998ecf8427ed41d8cd98f00b204e9800998ecf8427e"'
76+
for url in urls:
77+
filename = url_to_filename(url, etag=long_etag)
78+
meta_filename = filename + ".json"
79+
assert len(meta_filename) < 143, (
80+
f"Metadata filename too long for eCryptfs ({len(meta_filename)} >= 143): {meta_filename}"
81+
)

0 commit comments

Comments
 (0)