[DirectX] Implement the resource.store.rawbuffer intrinsic #121282

Open · wants to merge 1 commit into base: users/bogner/121012
114 changes: 99 additions & 15 deletions llvm/docs/DirectX/DXILResources.rst
@@ -491,26 +491,28 @@ Examples:
i32 %byte_offset,
i32 0)

Texture and Typed Buffer Stores
-------------------------------
Stores
------

*relevant types: Textures and TypedBuffer*
*relevant types: Textures and Buffer*

The `TextureStore`_ and `BufferStore`_ DXIL operations always write all four
32-bit components to a texture or a typed buffer. While both operations include
a mask parameter, it is specified that the mask must cover all components when
used with these types.
The `TextureStore`_, `BufferStore`_, and `RawBufferStore`_ DXIL operations
write four components to a texture or a buffer. Each takes a mask argument
that allows fewer than 4 components to be written, but notably the mask may
only take the contiguous values x, xy, xyz, and xyzw.

The store operations that we define as intrinsics behave similarly, and will
only accept writes to the whole of the contained type. This differs from the
loads above, but this makes sense to do from a semantics preserving point of
view. Thus, texture and buffer stores may only operate on 4-element vectors of
types that are 32-bits or fewer, such as ``<4 x i32>``, ``<4 x float>``, and
``<4 x half>``, and 2 element vectors of 64-bit types like ``<2 x double>`` and
``<2 x i64>``.
We define the LLVM store intrinsics to accept vectors when storing multiple
components rather than using ``undef`` and a mask, but otherwise match the
DXIL ops fairly closely.
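
For example, storing two components might be expressed as a vector at the
LLVM level and lowered roughly as follows (an illustrative sketch only; the
handle creation is elided and the value names are placeholders):

.. code-block:: llvm

   ; A <2 x float> store lowers to a rawBufferStore whose mask is 0x3 (x and
   ; y) and whose two unused value slots are undef.
   call void @dx.op.rawBufferStore.f32(
       i32 140, %dx.types.Handle %buffer, i32 %index, i32 %offset,
       float %x, float %y, float undef, float undef, i8 3, i32 4)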

.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
.. _TextureStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#texturestore
.. _BufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#bufferstore
.. _RawBufferStore: https://github.com/microsoft/DirectXShaderCompiler/blob/main/docs/DXIL.rst#rawbufferstore

For TypedBuffer, we only need one coordinate, and we must always write a
vector since partial writes aren't possible. As with the load operations
described above, we handle 64-bit types specially and accept 2-element
vectors rather than 4.

Examples:

@@ -548,3 +550,85 @@ Examples:
target("dx.TypedBuffer", f16, 1, 0) %buf, i32 %index, <4 x f16> %data)
call void @llvm.dx.resource.store.typedbuffer.tdx.Buffer_v2f64_1_0_0t(
target("dx.TypedBuffer", f64, 1, 0) %buf, i32 %index, <2 x f64> %data)

For RawBuffer, we need two indices and we accept scalars and vectors of 4 or
fewer elements. Note that, unlike the typed buffer case, we do allow 4-element
vectors of 64-bit types here.

Examples:

.. list-table:: ``@llvm.dx.resource.store.rawbuffer``
:header-rows: 1

* - Argument
-
- Type
- Description
* - Return value
-
- ``void``
-
* - ``%buffer``
- 0
- ``target(dx.RawBuffer, ...)``
- The buffer to store into
* - ``%index``
- 1
- ``i32``
- Index into the buffer
* - ``%offset``
- 2
- ``i32``
- Byte offset into structured buffer elements
* - ``%data``
- 3
- Scalar or vector
- The data to store

Examples:

.. code-block:: llvm

; float
call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_f32_1_0_0t.f32(
target("dx.RawBuffer", float, 1, 0, 0) %buffer,
i32 %index, i32 %offset, float %data)
call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.f32(
target("dx.RawBuffer", i8, 1, 0, 0) %buffer,
i32 %index, i32 0, float %data)

; float4
call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_v4f32_1_0_0t.v4f32(
target("dx.RawBuffer", <4 x float>, 1, 0, 0) %buffer,
i32 %index, i32 0, <4 x float> %data)
call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f32(
target("dx.RawBuffer", i8, 1, 0, 0) %buffer,
i32 %index, i32 0, <4 x float> %data)

; struct S0 { float4 f; int4 i; }
call void @llvm.dx.resource.store.rawbuffer.v4f32(
target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer,
i32 %index, i32 0, <4 x float> %data0)
call void @llvm.dx.resource.store.rawbuffer.v4i32(
target("dx.RawBuffer", { <4 x float>, <4 x i32> }, 1, 0, 0) %buffer,
i32 %index, i32 16, <4 x i32> %data1)

; struct Q { float4 f; half3 h; }
; struct R { int z; Q q; }
call void @llvm.dx.resource.store.rawbuffer.i32(
target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0)
%buffer,
i32 %index, i32 0, i32 %data0)
call void @llvm.dx.resource.store.rawbuffer.v4f32(
target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0)
%buffer,
i32 %index, i32 4, <4 x float> %data1)
call void @llvm.dx.resource.store.rawbuffer.v3f16(
target("dx.RawBuffer", {i32, {<4 x float>, <3 x half>}}, 1, 0, 0)
%buffer,
i32 %index, i32 20, <3 x half> %data2)

; byteaddressbuf.Store<double4>
call void @llvm.dx.resource.store.rawbuffer.tdx.RawBuffer_i8_1_0_0t.v4f64(
target("dx.RawBuffer", i8, 1, 0, 0) %buffer,
i32 %offset, i32 0, <4 x double> %data)

4 changes: 4 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsDirectX.td
@@ -40,6 +40,10 @@ def int_dx_resource_load_rawbuffer
: DefaultAttrsIntrinsic<[llvm_any_ty, llvm_i1_ty],
[llvm_any_ty, llvm_i32_ty, llvm_i32_ty],
[IntrReadMem]>;
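// Stores a scalar or vector to a raw (byte address or structured) buffer.
// Arguments: resource handle, index, byte offset, and the data to store;
// overloaded on both the handle type and the data type.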
def int_dx_resource_store_rawbuffer
: DefaultAttrsIntrinsic<
[], [llvm_any_ty, llvm_i32_ty, llvm_i32_ty, llvm_any_ty],
[IntrWriteMem]>;

def int_dx_resource_updatecounter
: DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_any_ty, llvm_i8_ty],
20 changes: 20 additions & 0 deletions llvm/lib/Target/DirectX/DXIL.td
@@ -909,6 +909,26 @@ def RawBufferLoad : DXILOp<139, rawBufferLoad> {
let stages = [Stages<DXIL1_2, [all_stages]>];
}

def RawBufferStore : DXILOp<140, rawBufferStore> {
let Doc = "writes to a RWByteAddressBuffer or RWStructuredBuffer";
// Handle, Coord0, Coord1, Val0, Val1, Val2, Val3, Mask, Alignment
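// Mask selects which of Val0-Val3 are written (low, contiguous components
// only); Alignment is the alignment of the stored values in bytes.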
let arguments = [
HandleTy, Int32Ty, Int32Ty, OverloadTy, OverloadTy, OverloadTy, OverloadTy,
Int8Ty, Int32Ty
];
let result = VoidTy;
let overloads = [
Overloads<DXIL1_2,
[ResRetHalfTy, ResRetFloatTy, ResRetInt16Ty, ResRetInt32Ty]>,
Overloads<DXIL1_3,
[
ResRetHalfTy, ResRetFloatTy, ResRetDoubleTy, ResRetInt16Ty,
ResRetInt32Ty, ResRetInt64Ty
]>
];
let stages = [Stages<DXIL1_2, [all_stages]>];
}

def Dot4AddI8Packed : DXILOp<163, dot4AddPacked> {
let Doc = "signed dot product of 4 x i8 vectors packed into i32, with "
"accumulate to i32";
82 changes: 56 additions & 26 deletions llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -616,7 +616,10 @@ class OpLowerer {
return false;
}

[[nodiscard]] bool lowerTypedBufferStore(Function &F) {
[[nodiscard]] bool lowerBufferStore(Function &F, bool IsRaw) {
Triple TT(Triple(M.getTargetTriple()));
VersionTuple DXILVersion = TT.getDXILVersion();
const DataLayout &DL = F.getDataLayout();
IRBuilder<> &IRB = OpBuilder.getIRB();
Type *Int8Ty = IRB.getInt8Ty();
Type *Int32Ty = IRB.getInt32Ty();
@@ -627,51 +630,75 @@
Value *Handle =
createTmpHandleCast(CI->getArgOperand(0), OpBuilder.getHandleType());
Value *Index0 = CI->getArgOperand(1);
Value *Index1 = UndefValue::get(Int32Ty);
// For typed stores, the mask must always cover all four elements.
Constant *Mask = ConstantInt::get(Int8Ty, 0xF);
Value *Index1 = IsRaw ? CI->getArgOperand(2) : UndefValue::get(Int32Ty);

Value *Data = CI->getArgOperand(IsRaw ? 3 : 2);
Type *DataTy = Data->getType();
Type *ScalarTy = DataTy->getScalarType();

Value *Data = CI->getArgOperand(2);
auto *DataTy = dyn_cast<FixedVectorType>(Data->getType());
if (!DataTy || DataTy->getNumElements() != 4)
uint64_t NumElements =
DL.getTypeSizeInBits(DataTy) / DL.getTypeSizeInBits(ScalarTy);
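// The mask covers the low NumElements components: a scalar store yields
// 0x1, a 2-element store 0x3, and a full 4-element store 0xF.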
Value *Mask = ConstantInt::get(Int8Ty, ~(~0U << NumElements));

// TODO: check that we only have vector or scalar...
if (!IsRaw && NumElements != 4)
return make_error<StringError>(
"typedBufferStore data must be a vector of 4 elements",
inconvertibleErrorCode());
else if (NumElements > 4)
return make_error<StringError>(
"rawBufferStore data must have at most 4 elements",
inconvertibleErrorCode());

// Since we're post-scalarizer, we likely have a vector that's constructed
// solely for the argument of the store. If so, just use the scalar values
// from before they're inserted into the temporary.
std::array<Value *, 4> DataElements{nullptr, nullptr, nullptr, nullptr};
auto *IEI = dyn_cast<InsertElementInst>(Data);
while (IEI) {
auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2));
if (!IndexOp)
break;
size_t IndexVal = IndexOp->getZExtValue();
assert(IndexVal < 4 && "Too many elements for buffer store");
DataElements[IndexVal] = IEI->getOperand(1);
IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
if (DataTy == ScalarTy)
DataElements[0] = Data;
else {
// Since we're post-scalarizer, if we see a vector here it's likely
// constructed solely for the argument of the store. Just use the scalar
// values from before they're inserted into the temporary.
auto *IEI = dyn_cast<InsertElementInst>(Data);
while (IEI) {
auto *IndexOp = dyn_cast<ConstantInt>(IEI->getOperand(2));
if (!IndexOp)
break;
size_t IndexVal = IndexOp->getZExtValue();
assert(IndexVal < 4 && "Too many elements for buffer store");
DataElements[IndexVal] = IEI->getOperand(1);
IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
}
}

// If for some reason we weren't able to forward the arguments from the
// scalarizer artifact, then we need to actually extract elements from the
// vector.
for (int I = 0, E = 4; I != E; ++I)
// scalarizer artifact, then we may need to actually extract elements from
// the vector.
for (int I = 0, E = NumElements; I < E; ++I)
if (DataElements[I] == nullptr)
DataElements[I] =
IRB.CreateExtractElement(Data, ConstantInt::get(Int32Ty, I));
// For any elements beyond the length of the vector, fill up with undef.
for (int I = NumElements, E = 4; I < E; ++I)
if (DataElements[I] == nullptr)
DataElements[I] = UndefValue::get(ScalarTy);

std::array<Value *, 8> Args{
dxil::OpCode Op = OpCode::BufferStore;
SmallVector<Value *, 9> Args{
Handle, Index0, Index1, DataElements[0],
DataElements[1], DataElements[2], DataElements[3], Mask};
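// rawBufferStore is only available in DXIL 1.2 and later; for older
// versions, raw stores fall back to the bufferStore op chosen above.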
if (IsRaw && DXILVersion >= VersionTuple(1, 2)) {
Op = OpCode::RawBufferStore;
// RawBufferStore requires the alignment
Args.push_back(
ConstantInt::get(Int32Ty, DL.getPrefTypeAlign(ScalarTy).value()));
}
Expected<CallInst *> OpCall =
OpBuilder.tryCreateOp(OpCode::BufferStore, Args, CI->getName());
OpBuilder.tryCreateOp(Op, Args, CI->getName());
if (Error E = OpCall.takeError())
return E;

CI->eraseFromParent();
// Clean up any leftover `insertelement`s
IEI = dyn_cast<InsertElementInst>(Data);
auto *IEI = dyn_cast<InsertElementInst>(Data);
while (IEI && IEI->use_empty()) {
InsertElementInst *Tmp = IEI;
IEI = dyn_cast<InsertElementInst>(IEI->getOperand(0));
@@ -776,11 +803,14 @@ class OpLowerer {
HasErrors |= lowerTypedBufferLoad(F, /*HasCheckBit=*/true);
break;
case Intrinsic::dx_resource_store_typedbuffer:
HasErrors |= lowerTypedBufferStore(F);
HasErrors |= lowerBufferStore(F, /*IsRaw=*/false);
break;
case Intrinsic::dx_resource_load_rawbuffer:
HasErrors |= lowerRawBufferLoad(F);
break;
case Intrinsic::dx_resource_store_rawbuffer:
HasErrors |= lowerBufferStore(F, /*IsRaw=*/true);
break;
case Intrinsic::dx_resource_updatecounter:
HasErrors |= lowerUpdateCounter(F);
break;