From 3adbb45170ff85b0afe8bf6303cc2806625fd6e9 Mon Sep 17 00:00:00 2001
From: Joaquin Anton Guirao <janton@nvidia.com>
Date: Tue, 4 Feb 2025 09:49:00 +0100
Subject: [PATCH] Ensure allocated temp memory is usable by nvImageCodec
 streams, as we are skipping pre-sync due to unnecessary overhead in the
 general case

Signed-off-by: Joaquin Anton Guirao <janton@nvidia.com>
---
 dali/operators/imgcodec/image_decoder.h | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)
diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h
index 129b198e65..efeb80c787 100644
--- a/dali/operators/imgcodec/image_decoder.h
+++ b/dali/operators/imgcodec/image_decoder.h
@@ -552,15 +552,6 @@ class ImageDecoder : public StatelessOperator<Backend> {
            MAKE_SEMANTIC_VERSION(req_major, req_minor, req_patch);
   }
 
-  /**
-   * @brief nvImageCodec up to 0.2 doesn't synchronize with the user stream before decoding.
-   * Because of that, we need to host synchronize before passing the async allocated buffer
-   * to the decoding function
-   */
-  bool need_host_sync_alloc() {
-    return !version_at_least(0, 3, 0);
-  }
-
   void PrepareOutput(SampleState &st, void *out_ptr, const ROI &roi, const Workspace &ws) {
     // Make a copy of the parsed img info. We might modify it
     // (for example, request planar vs. interleaved, etc)
@@ -794,7 +785,9 @@ class ImageDecoder : public StatelessOperator<Backend> {
     size_t nsamples_decode = batch_images_.size();
     size_t nsamples_cache = nsamples - nsamples_decode;
 
-    if (ws.has_stream() && need_host_sync_alloc() && any_need_processing) {
+    // Ensure allocated memory is usable by the decoder's internal streams,
+    // as we are intentionally skipping pre-sync to avoid slowing down the general case.
+    if (ws.has_stream() && any_need_processing) {
       DomainTimeRange tr("alloc sync", DomainTimeRange::kOrange);
       CUDA_CALL(cudaStreamSynchronize(ws.stream()));
     }