Skip to content
3 changes: 3 additions & 0 deletions src/v/cloud_io/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -148,15 +148,18 @@ redpanda_cc_library(
"cache_probe.cc",
"cache_service.cc",
"recursive_directory_walker.cc",
"staging_file.cc",
],
hdrs = [
"access_time_tracker.h",
"cache_probe.h",
"cache_service.h",
"recursive_directory_walker.h",
"staging_file.h",
],
implementation_deps = [
":logger",
"//src/v/bytes:iobuf",
"//src/v/bytes:iobuf_parser",
"//src/v/container:chunked_vector",
"//src/v/serde",
Expand Down
9 changes: 9 additions & 0 deletions src/v/cloud_io/basic_cache_service_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,15 @@ std::ostream& operator<<(std::ostream& o, cache_element_status s) {
return o;
}

template<class Clock>
void basic_space_reservation_guard<Clock>::merge(
  basic_space_reservation_guard&& other) noexcept {
    // Absorb the other guard's accounted reservation into this one, then
    // zero the source so its destructor has nothing left to release.
    const auto taken_bytes = other._bytes;
    const auto taken_objects = other._objects;
    other._bytes = 0;
    other._objects = 0;
    _bytes += taken_bytes;
    _objects += taken_objects;
}

template<class Clock>
void basic_space_reservation_guard<Clock>::wrote_data(
uint64_t written_bytes, size_t written_objects) {
Expand Down
4 changes: 4 additions & 0 deletions src/v/cloud_io/basic_cache_service_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ class basic_space_reservation_guard {

~basic_space_reservation_guard();

/// Merges another guard's reservation into this one. The other guard is
/// left empty (zero bytes/objects).
void merge(basic_space_reservation_guard&& other) noexcept;

/// After completing the write operation that this space reservation
/// protected, indicate how many bytes were really written: this is used to
/// atomically update cache usage stats to free the reservation and update
Expand Down
186 changes: 159 additions & 27 deletions src/v/cloud_io/cache_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -269,14 +269,18 @@ std::optional<std::chrono::milliseconds> cache::get_trim_delay() const {

ss::future<> cache::trim_throttled_unlocked(
std::optional<uint64_t> size_limit_override,
std::optional<size_t> object_limit_override) {
std::optional<size_t> object_limit_override,
std::optional<ss::lowres_clock::time_point> deadline) {
// If we trimmed very recently then do not do it immediately:
// this reduces load and improves chance of currently promoted
// segments finishing their read work before we demote their
// data from cache.
auto trim_delay = get_trim_delay();

if (trim_delay.has_value()) {
if (deadline && ss::lowres_clock::now() + *trim_delay > *deadline) {
throw ss::timed_out_error();
}
vlog(
log.info,
"Cache trimming throttled, waiting {}ms",
Expand Down Expand Up @@ -828,9 +832,13 @@ cache::trim_exhaustive(uint64_t size_to_delete, size_t objects_to_delete) {
continue;
}

// Unlike the fast trim, we *do not* skip .tmp files. This is to handle
// the case where we have some abandoned tmp files, and have hit the
// exhaustive trim because they are occupying too much space.
if (
std::string_view(file_stat.path)
.ends_with(cache_tmp_file_extension)) {
result.trim_missed_tmp_files = true;
continue;
}

try {
co_await delete_file_and_empty_parents(file_stat.path);
_access_time_tracker.remove(file_stat.path);
Expand All @@ -851,23 +859,6 @@ cache::trim_exhaustive(uint64_t size_to_delete, size_t objects_to_delete) {
} catch (const ss::gate_closed_exception&) {
// We are shutting down, stop iterating and propagate
throw;
} catch (const std::filesystem::filesystem_error& e) {
if (likely(file_stat.path.ends_with(cache_tmp_file_extension))) {
// In exhaustive scan we might hit a .part file and get ENOENT,
// this is expected behavior occasionally.
result.trim_missed_tmp_files = true;
vlog(
log.info,
"trim: couldn't delete temp file {}: {}.",
file_stat.path,
e.what());
} else {
vlog(
log.error,
"trim: couldn't delete {}: {}.",
file_stat.path,
e.what());
}
} catch (const std::exception& e) {
vlog(
log.error,
Expand Down Expand Up @@ -1419,15 +1410,26 @@ ss::future<> cache::_invalidate(const std::filesystem::path& key) {

ss::future<space_reservation_guard>
cache::reserve_space(uint64_t bytes, size_t objects) {
    // Convenience overload: delegate to the deadline-aware variant with an
    // empty deadline, i.e. wait as long as needed for space.
    return reserve_space(
      bytes, objects, std::optional<ss::lowres_clock::time_point>{});
}

ss::future<space_reservation_guard> cache::reserve_space(
uint64_t bytes,
size_t objects,
std::optional<ss::lowres_clock::time_point> deadline) {
while (_block_puts) {
vlog(
log.warn,
"Blocking tiered storage cache write, disk space critically low.");
co_await _block_puts_cond.wait();
if (deadline) {
co_await _block_puts_cond.wait(*deadline);
} else {
co_await _block_puts_cond.wait();
}
}

co_await container().invoke_on(0, [bytes, objects](cache& c) {
return c.do_reserve_space(bytes, objects);
co_await container().invoke_on(0, [bytes, objects, deadline](cache& c) {
return c.do_reserve_space(bytes, objects, deadline);
});

vlog(
Expand Down Expand Up @@ -1684,9 +1686,18 @@ void cache::maybe_background_trim() {
}
}

ss::future<> cache::do_reserve_space(uint64_t bytes, size_t objects) {
ss::future<> cache::do_reserve_space(
uint64_t bytes,
size_t objects,
std::optional<ss::lowres_clock::time_point> deadline) {
vassert(ss::this_shard_id() == ss::shard_id{0}, "Only call on shard 0");

auto check_deadline = [&] {
if (deadline && ss::lowres_clock::now() >= *deadline) {
throw ss::timed_out_error();
}
};

maybe_background_trim();

if (may_reserve_space(bytes, objects)) {
Expand All @@ -1708,7 +1719,19 @@ ss::future<> cache::do_reserve_space(uint64_t bytes, size_t objects) {
_reservations_pending,
_reservations_pending_objects);

auto units = co_await ss::get_units(_cleanup_sm, 1);
check_deadline();

auto get_cleanup_units = [&]() {
if (deadline) {
auto now = ss::lowres_clock::now();
auto remaining = *deadline > now
? *deadline - now
: ss::lowres_clock::duration::zero();
return ss::get_units(_cleanup_sm, 1, remaining);
}
return ss::get_units(_cleanup_sm, 1);
};
auto units = co_await get_cleanup_units();

// Situation may change after a scheduling point. Another fiber could
// trigger carryover trim which released some resources. Exit early in this
Expand Down Expand Up @@ -1780,6 +1803,8 @@ ss::future<> cache::do_reserve_space(uint64_t bytes, size_t objects) {
_reservations_pending_objects += objects;

while (!may_reserve_space(bytes, objects)) {
check_deadline();

bool may_exceed = may_exceed_limits(bytes, objects)
&& _last_trim_failed;
bool may_trim_now = !get_trim_delay().has_value();
Expand All @@ -1799,7 +1824,8 @@ ss::future<> cache::do_reserve_space(uint64_t bytes, size_t objects) {
// After taking lock, there still isn't space: means someone
// else didn't take it and free space for us already, so we will
// do the trim.
co_await trim_throttled_unlocked();
co_await trim_throttled_unlocked(
std::nullopt, std::nullopt, deadline);
did_trim = true;
} else {
vlog(
Expand Down Expand Up @@ -2010,4 +2036,110 @@ cache::validate_cache_config(const config::configuration& conf) {
return std::nullopt;
}

/// Build a unique ".part" staging path for the given cache key, creating the
/// parent directory if needed.
///
/// The key is normalized and checked for containment so an absolute key or
/// one with ".." components cannot place staging files outside _cache_dir
/// (path traversal).
///
/// \throws std::invalid_argument if the key escapes the cache directory.
ss::future<std::filesystem::path>
cache::create_staging_path(std::filesystem::path key) {
    auto guard = _gate.hold();

    // Normalize the destination up front and verify it stays inside the
    // cache directory before deriving any filesystem paths from it.
    auto key_path = (_cache_dir / key).lexically_normal();
    auto rel = key_path.lexically_relative(_cache_dir.lexically_normal());
    if (rel.empty() || *rel.begin() == "..") {
        throw std::invalid_argument(ss::format(
          "cache key {} resolves outside the cache directory", key.native()));
    }

    auto filename = key_path.filename();
    auto dir_path = key_path;
    dir_path.remove_filename();

    // TODO: share code with put().
    // The staging name embeds the shard id and a per-shard counter so
    // concurrent writers never collide on the same temporary path.
    auto tmp_filename = std::filesystem::path(ss::format(
      "{}_{}_{}{}",
      filename.native(),
      ss::this_shard_id(),
      (++_cnt),
      cache_tmp_file_extension));
    auto tmp_filepath = dir_path / tmp_filename;

    // Create the parent directory lazily on first use of this prefix.
    if (!co_await ss::file_exists(dir_path.string())) {
        co_await ss::recursive_touch_directory(dir_path.string());
    }

    co_return tmp_filepath;
}

/// Create a staging_file handle for the given cache key.
///
/// Picks a unique ".part" path via create_staging_path(), opens it with
/// create|rw|exclusive (so a pre-existing file at that path fails the open),
/// and wraps the file in an output stream. If the open or stream creation
/// fails, the staging file is removed on a best-effort basis (errors from
/// the removal are ignored) before the original exception is rethrown.
ss::future<staging_file> cache::create_staging_file(
  std::filesystem::path key, staging_file_options opts) {
    // Held for the lifetime of the returned staging_file (moved into it
    // below), so the gate stays open while the handle is outstanding.
    auto gate_holder = _gate.hold();
    auto staging_path = co_await create_staging_path(key);
    // as_future() lets us inspect failure without a try/catch around the
    // coroutine suspension point.
    auto open_fut = co_await ss::coroutine::as_future(
      ss::open_file_dma(
        staging_path.native(),
        ss::open_flags::create | ss::open_flags::rw
          | ss::open_flags::exclusive));
    if (open_fut.failed()) {
        // Best-effort cleanup: the file may not exist if the open itself
        // failed, so removal errors are deliberately ignored.
        co_await ss::remove_file(staging_path.native())
          .then_wrapped([](ss::future<> f) { f.ignore_ready_future(); });
        std::rethrow_exception(open_fut.get_exception());
    }
    auto stream_fut = co_await ss::coroutine::as_future(
      ss::make_file_output_stream(std::move(open_fut.get())));
    if (stream_fut.failed()) {
        // The file was created above; remove it so we don't leak an
        // unowned .part file.
        co_await ss::remove_file(staging_path.native())
          .then_wrapped([](ss::future<> f) { f.ignore_ready_future(); });
        std::rethrow_exception(stream_fut.get_exception());
    }
    co_return staging_file{
      *this,
      std::move(gate_holder),
      std::move(key),
      std::move(staging_path),
      std::move(stream_fut.get()),
      opts};
}

/// Commit a staging file into the cache: atomically claim the destination
/// via link(), record the reservation usage, then remove the staging file.
///
/// Losing the link() race (EEXIST) is treated as success; the winner is
/// presumed to have done the reservation accounting. Staging-file cleanup is
/// best-effort in all paths so a failed unlink can never leave the committed
/// destination unaccounted for.
///
/// NOTE(review): key is used unvalidated here; an absolute or ".."-bearing
/// key would link the file outside the cache directory. Callers are expected
/// to pass keys that went through create_staging_file() — confirm.
ss::future<> cache::commit_staging_file(
  std::filesystem::path key,
  std::filesystem::path staging_path,
  space_reservation_guard reservation) {
    auto guard = _gate.hold();

    auto dest_path = _cache_dir / key;
    vlog(
      log.debug, "Committing staging file {} to {}", staging_path, dest_path);

    // We use link() rather than O_EXCL create + rename because a crash between
    // the O_EXCL create and the rename would leave an empty file at the
    // destination: on restart the cache would see a zero-byte entry for this
    // key. With link(), a crash between link and unlink leaves the real data
    // at the destination.
    auto link_fut = co_await ss::coroutine::as_future(
      ss::link_file(staging_path.native(), dest_path.native()));
    if (link_fut.failed()) {
        auto ex = link_fut.get_exception();
        try {
            std::rethrow_exception(ex);
        } catch (const std::system_error& e) {
            if (e.code() != std::errc::file_exists) {
                throw;
            }
        }
        // Someone else already wrote to the destination. Treat this as a
        // success, but presume that the one who won the race already did the
        // reservation accounting. Removal of our now-redundant staging file
        // is best-effort: the commit outcome must not depend on it.
        co_await ss::remove_file(staging_path.native())
          .then_wrapped([](ss::future<> f) { f.ignore_ready_future(); });
        co_return;
    }

    // The destination now owns the data: update the reservation accounting
    // BEFORE removing the staging file, so a failed unlink cannot leave the
    // cache usage undercounted while the key is present.
    auto file_size = co_await ss::file_size(dest_path.native());
    reservation.wrote_data(file_size, 1);

    // Best-effort staging cleanup; a leftover .part file is handled by
    // later trim/cleanup passes.
    co_await ss::remove_file(staging_path.native())
      .then_wrapped([](ss::future<> f) { f.ignore_ready_future(); });

    auto source = dest_path.native();
    if (ss::this_shard_id() == 0) {
        _access_time_tracker.add(
          source, std::chrono::system_clock::now(), file_size);
    } else {
        // The tracker lives on shard 0; record the access asynchronously
        // under the gate so shutdown still waits for it.
        ssx::spawn_with_gate(_gate, [this, source, file_size] {
            return container().invoke_on(0, [source, file_size](cache& c) {
                c._access_time_tracker.add(
                  source, std::chrono::system_clock::now(), file_size);
            });
        });
    }
}

} // namespace cloud_io
33 changes: 30 additions & 3 deletions src/v/cloud_io/cache_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
#include "cloud_io/basic_cache_service_api.h"
#include "cloud_io/cache_probe.h"
#include "cloud_io/recursive_directory_walker.h"
#include "cloud_io/staging_file.h"
#include "config/configuration.h"
#include "config/property.h"
#include "ssx/semaphore.h"
#include "storage/disk.h"

#include <seastar/core/abort_source.hh>
#include <seastar/core/condition-variable.hh>
#include <seastar/core/future.hh>
#include <seastar/core/gate.hh>
Expand Down Expand Up @@ -167,6 +169,24 @@ class cache
return _cache_dir / key;
}

/// Create a staging_file handle for the given cache key.
ss::future<staging_file>
create_staging_file(std::filesystem::path key, staging_file_options opts);

/// Commit a staging file into the cache. Atomically claims the
/// destination via link(), then removes the staging .part file and
/// updates the reservation to record actual usage.
ss::future<> commit_staging_file(
std::filesystem::path key,
std::filesystem::path staging_path,
space_reservation_guard reservation);

/// Reserve cache space with an optional deadline for blocking waits.
ss::future<space_reservation_guard> reserve_space(
uint64_t bytes,
size_t objects,
std::optional<ss::lowres_clock::time_point> deadline);

// Checks if a cluster configuration is valid for the properties
// `cloud_storage_cache_size` and `cloud_storage_cache_size_percent`.
// Two cases are invalid: 1. the case in which both are 0, 2. the case in
Expand All @@ -178,6 +198,9 @@ class cache
validate_cache_config(const config::configuration& conf);

private:
ss::future<std::filesystem::path>
create_staging_path(std::filesystem::path key);

/// Load access time tracker from file
ss::future<> load_access_time_tracker();

Expand Down Expand Up @@ -230,10 +253,13 @@ class cache
/// rate limit.
std::optional<std::chrono::milliseconds> get_trim_delay() const;

/// Invoke trim, waiting if not enough time passed since the last trim
/// Invoke trim, waiting if not enough time passed since the last trim.
/// If a deadline is provided and the throttle delay would exceed it,
/// throws ss::timed_out_error rather than skipping the rate limit.
ss::future<> trim_throttled_unlocked(
std::optional<uint64_t> size_limit_override = std::nullopt,
std::optional<size_t> object_limit_override = std::nullopt);
std::optional<size_t> object_limit_override = std::nullopt,
std::optional<ss::lowres_clock::time_point> deadline = std::nullopt);

// Take the cleanup semaphore before calling trim_throttled
ss::future<> trim_throttled(
Expand Down Expand Up @@ -261,7 +287,8 @@ class cache

/// Block until enough space is available to commit to a reservation
/// (only runs on shard 0)
ss::future<> do_reserve_space(uint64_t, size_t);
ss::future<> do_reserve_space(
uint64_t, size_t, std::optional<ss::lowres_clock::time_point> deadline);

/// Trim cache using results from the previous recursive directory walk
ss::future<trim_result>
Expand Down
Loading
Loading