From cc24faa9c4d03f9a704b050abab53c2fba10b283 Mon Sep 17 00:00:00 2001 From: Michael Tuttle Date: Mon, 11 Nov 2024 13:41:01 -0800 Subject: [PATCH] Fix PCQ race condition Signed-off-by: Michael Tuttle --- .../onnx/src/QuantizeDequantizeUtils.hpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp b/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp index 7b2f066222a..f4f24f878f8 100644 --- a/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp +++ b/TrainingExtensions/onnx/src/QuantizeDequantizeUtils.hpp @@ -126,27 +126,29 @@ void quantizeDequantizePerChannel( } } - T encVec[4][channels]; + std::vector<T> encVec(4 * channels); + for (int i = 0; i < channels; i++) { - encVec[0][i] = encodings[i]->min; - encVec[1][i] = encodings[i]->max; - encVec[2][i] = encodings[i]->delta; - encVec[3][i] = encodings[i]->offset; + encVec[i] = encodings[i]->min; + encVec[channels + i] = encodings[i]->max; + encVec[2 * channels + i] = encodings[i]->delta; + encVec[3 * channels + i] = encodings[i]->offset; } T* encodingVectorDevice; if (useCuda) { #ifdef ONNX_CUDA encodingVectorDevice = (T*) allocator->allocateRaw(4 * channels * sizeof(T)); - cudaMemcpy(encodingVectorDevice, encVec, 4 * channels * sizeof(T), cudaMemcpyHostToDevice); + cudaMemcpyAsync(encodingVectorDevice, encVec.data(), 4 * channels * sizeof(T), cudaMemcpyHostToDevice, + static_cast<cudaStream_t>(stream)); #else throw std::runtime_error("Not compiled for GPU mode."); #endif } else { - encodingVectorDevice = (T*) encVec; + encodingVectorDevice = (T*) encVec.data(); } T* encodingMin = encodingVectorDevice;