medmekk (HF Staff) committed
Commit 7587d0b · 1 Parent(s): 8f4b908

first commit

.gitignore ADDED
@@ -0,0 +1,4 @@
.venv/
__pycache__/
torch-ext/__pycache__/
torch-ext/residual_rms_rocm/__pycache__/
README.md ADDED
@@ -0,0 +1 @@
Fused residual-add + RMSNorm kernel for ROCm devices, from https://github.com/huggingface/hf-rocm-kernels
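A minimal usage sketch (assuming the built extension is importable as residual_rms_rocm and follows the Python wrapper in torch-ext/residual_rms_rocm/wrapped_rms.py; shapes and epsilon below are illustrative):

import torch
from residual_rms_rocm import residual_rms

rows, cols = 8, 4096
input = torch.randn(rows, cols, dtype=torch.float16, device="cuda")
residual = torch.randn(rows, cols, dtype=torch.float16, device="cuda")
weight = torch.randn(cols, dtype=torch.float16, device="cuda")
scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")

# fp8 path: residual is updated in place (input + residual), output is float8_e4m3fnuz
output, residual = residual_rms(input, residual, weight, epsilon=1e-6, scale_tensor=scale)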
build.toml ADDED
@@ -0,0 +1,23 @@
[general]
name = "residual_rms_rocm"
universal = false

[torch]
src = [
  "torch-ext/torch_binding.cpp",
  "torch-ext/torch_binding.h",
]

[kernel.residual_rms_rocm]
depends = ["torch"]
backend = "rocm"
rocm-archs = [
  "gfx90a",
]
src = [
  "residual_rms_rocm/residual_rms_dispatch.cu",
  "residual_rms_rocm/residual_rms_scalar.cu",
  "residual_rms_rocm/residual_rms_vectorized.cu",
  "residual_rms_rocm/utils.h",
]
include = ["."]
flake.lock ADDED
@@ -0,0 +1,168 @@
{
  "nodes": {
    "flake-compat": {
      "locked": {
        "lastModified": 1747046372,
        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
        "owner": "edolstra",
        "repo": "flake-compat",
        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
        "type": "github"
      },
      "original": {
        "owner": "edolstra",
        "repo": "flake-compat",
        "type": "github"
      }
    },
    "flake-compat_2": {
      "locked": {
        "lastModified": 1747046372,
        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
        "owner": "edolstra",
        "repo": "flake-compat",
        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
        "type": "github"
      },
      "original": {
        "owner": "edolstra",
        "repo": "flake-compat",
        "type": "github"
      }
    },
    "flake-utils": {
      "inputs": {
        "systems": "systems"
      },
      "locked": {
        "lastModified": 1731533236,
        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "flake-utils_2": {
      "inputs": {
        "systems": "systems_2"
      },
      "locked": {
        "lastModified": 1731533236,
        "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
        "owner": "numtide",
        "repo": "flake-utils",
        "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
        "type": "github"
      },
      "original": {
        "owner": "numtide",
        "repo": "flake-utils",
        "type": "github"
      }
    },
    "hf-nix": {
      "inputs": {
        "flake-compat": "flake-compat_2",
        "flake-utils": "flake-utils_2",
        "nixpkgs": "nixpkgs"
      },
      "locked": {
        "lastModified": 1757675377,
        "narHash": "sha256-JQKZOI1ZYO4faJnanuoTXziSmqzXe5rEFSGliWDWqWw=",
        "owner": "huggingface",
        "repo": "hf-nix",
        "rev": "faf3354403a7381958d08e826c15fe30f6986a4f",
        "type": "github"
      },
      "original": {
        "owner": "huggingface",
        "repo": "hf-nix",
        "type": "github"
      }
    },
    "kernel-builder": {
      "inputs": {
        "flake-compat": "flake-compat",
        "flake-utils": "flake-utils",
        "hf-nix": "hf-nix",
        "nixpkgs": [
          "kernel-builder",
          "hf-nix",
          "nixpkgs"
        ]
      },
      "locked": {
        "lastModified": 1759322505,
        "narHash": "sha256-RzjCEn0zDfdwQp4WAb0BBuLlHxypr+4+a4BMON23SNw=",
        "owner": "huggingface",
        "repo": "kernel-builder",
        "rev": "437d0f5c253a78d0be8b5998d9c1fcf32ac2360c",
        "type": "github"
      },
      "original": {
        "owner": "huggingface",
        "repo": "kernel-builder",
        "type": "github"
      }
    },
    "nixpkgs": {
      "locked": {
        "lastModified": 1755963616,
        "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
        "owner": "nixos",
        "repo": "nixpkgs",
        "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
        "type": "github"
      },
      "original": {
        "owner": "nixos",
        "ref": "nixos-unstable-small",
        "repo": "nixpkgs",
        "type": "github"
      }
    },
    "root": {
      "inputs": {
        "kernel-builder": "kernel-builder"
      }
    },
    "systems": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    },
    "systems_2": {
      "locked": {
        "lastModified": 1681028828,
        "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
        "owner": "nix-systems",
        "repo": "default",
        "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
        "type": "github"
      },
      "original": {
        "owner": "nix-systems",
        "repo": "default",
        "type": "github"
      }
    }
  },
  "root": "root",
  "version": 7
}
flake.nix ADDED
@@ -0,0 +1,13 @@
{
  description = "Flake for Torch kernel extension";

  inputs = {
    kernel-builder.url = "github:huggingface/kernel-builder";
  };

  outputs = { self, kernel-builder, }:
    kernel-builder.lib.genFlakeOutputs {
      path = ./.;
      rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
    };
}
residual_rms_rocm/residual_rms_dispatch.cu ADDED
@@ -0,0 +1,63 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <hip/hip_runtime.h>

#include "residual_rms_vectorized.cu"
#include "residual_rms_scalar.cu"

void residual_rms(torch::Tensor& input,        // Shape: [m, n] / Layout: row-major / Dtype: fp16
                  torch::Tensor& residual,     // Shape: [m, n] / Layout: row-major / Dtype: fp16
                  torch::Tensor& weight,       // Shape: [n, ]  / Layout: row-major / Dtype: fp16
                  torch::Tensor& scale_tensor, // Shape: [1, ]  / Layout: row-major / Dtype: fp32
                  double epsilon,
                  torch::Tensor& output,       // Shape: [m, n] / Layout: row-major / Dtype: fp8 or fp16
                  torch::Tensor& next_buffer,  // Shape: [m, o] / Layout: don't-care / Dtype: fp16
                  int64_t num_threads, bool force_scalar) {
  // Retrieve shapes
  const int rows = input.size(0);
  const int cols = input.size(1);
  // Activate device guard
  const at::cuda::OptionalCUDAGuard device_guard(device_of(input));

  // Prepare kernel launch arguments
  dim3 grid(rows);
  dim3 block(num_threads);
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  // Check tensor alignment
  bool vectorized_available = IS_16B_ALIGNED(input) && IS_16B_ALIGNED(residual) && IS_16B_ALIGNED(weight);
  vectorized_available = vectorized_available && (!force_scalar) && (cols <= 16384);

  // Case: output is fp16
  if (output.dtype() == torch::kFloat16) {
    vectorized_available = vectorized_available && IS_16B_ALIGNED(output);

    if (vectorized_available) {
      _residual_rms_vectorized<half2, false><<<grid, block, 0, stream>>>(
          (half*)input.data_ptr(), (half*)residual.data_ptr(), (half*)weight.data_ptr(), (float*)NULL,
          (half2*)output.data_ptr(), (half*)NULL, epsilon, cols, 0);
    } else {
      _residual_rms_scalar<half, false><<<grid, block, 0, stream>>>(
          (half*)input.data_ptr(), (half*)residual.data_ptr(), (half*)weight.data_ptr(), (float*)NULL,
          (half*)output.data_ptr(), (half*)NULL, epsilon, cols, 0);
    }
  }

  // Case: output is fp8 (e4m3fnuz)
  else {
    vectorized_available = vectorized_available && IS_8B_ALIGNED(output) && (next_buffer.size(1) % 8 == 0);

    // Launch kernel
    if (vectorized_available) {
      _residual_rms_vectorized<__hip_fp8x2_storage_t, true><<<grid, block, 0, stream>>>(
          (half*)input.data_ptr(), (half*)residual.data_ptr(), (half*)weight.data_ptr(),
          (float*)scale_tensor.data_ptr(), (__hip_fp8x2_storage_t*)output.data_ptr(),
          (half*)next_buffer.data_ptr(), epsilon, cols, next_buffer.size(1));
    } else {
      _residual_rms_scalar<__hip_fp8_storage_t, true><<<grid, block, 0, stream>>>(
          (half*)input.data_ptr(), (half*)residual.data_ptr(), (half*)weight.data_ptr(),
          (float*)scale_tensor.data_ptr(), (__hip_fp8_storage_t*)output.data_ptr(), (half*)next_buffer.data_ptr(),
          epsilon, cols, next_buffer.size(1));
    }
  }
}
residual_rms_rocm/residual_rms_scalar.cu ADDED
@@ -0,0 +1,74 @@
#include <torch/all.h>

#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
#include <hip/hip_fp8.h>

#include "utils.h"

template <typename T, bool clean_next_buffer>
__global__ void _residual_rms_scalar(const half* __restrict__ input, half* __restrict__ residual,
                                     const half* __restrict__ weight, const float* __restrict__ scale_tensor,
                                     T* __restrict__ output, half* __restrict__ next_buffer, const float epsilon,
                                     const int cols, const int buffer_cols) {
  // Advance pointers according to the position of the thread in the grid
  input += blockIdx.x * cols;
  residual += blockIdx.x * cols;
  output += blockIdx.x * cols;

  // Residual connection: in-place add of input to residual, accumulating the squared norm along the way
  float variance = 0.0f;

  for (int i = threadIdx.x; i < cols; i += blockDim.x) {
    half z = input[i];
    z += residual[i];
    float x = (float)z;
    variance += (x * x);
    residual[i] = z;
  }
  variance /= cols;

  // Block reduce to compute the total norm
  __shared__ float shared_normalizer;
  using BlockReduce = hipcub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;

  variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
  if (threadIdx.x == 0) {
    shared_normalizer = rsqrtf(variance + epsilon);
  }
  __syncthreads();

  // Get inverse scale (only for fp8)
  float inv_scale = 1.0f;
  if constexpr (std::is_same_v<T, __hip_fp8_storage_t>) {
    inv_scale = 1 / scale_tensor[0];
  }

  // Normalize and store
  for (int idx = threadIdx.x; idx < cols; idx += blockDim.x) {
    float x = (float)residual[idx];
    half y = (half)(x * shared_normalizer);
    y = (y * weight[idx]);

    if constexpr (std::is_same_v<T, __hip_fp8_storage_t>) {
      x = (float)y;
      x *= inv_scale;
      FP8_CLAMP(x, float);
      output[idx] = __hip_cvt_float_to_fp8(x, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
    }
    if constexpr (std::is_same_v<T, half>) {
      output[idx] = y;
    }
  }

  // Initialize next buffer
  if constexpr (clean_next_buffer) {
    next_buffer += blockIdx.x * buffer_cols;
    for (int i = threadIdx.x; i < buffer_cols; i += blockDim.x) {
      next_buffer[i] = 0;
    }
  }
}
residual_rms_rocm/residual_rms_vectorized.cu ADDED
@@ -0,0 +1,196 @@
#include <torch/all.h>

#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
#include <hip/hip_fp8.h>

#include "utils.h"

#define USE_SMEM true  // TODO: figure out if this is needed in practice

template <typename T, bool clean_next_buffer>
__global__ void _residual_rms_vectorized(const half* __restrict__ input, half* __restrict__ residual,
                                         const half* __restrict__ weight, const float* __restrict__ scale_tensor,
                                         T* __restrict__ output,  // half2 or __hip_fp8x2_storage_t
                                         half* __restrict__ next_buffer, const float epsilon, const int cols,
                                         const int buffer_cols) {
  static constexpr int elems_per_load = 8;
  static constexpr int smem_size = USE_SMEM ? 16384 : 0;
  __shared__ half _smem[smem_size];

  // Advance pointers according to the position of the thread in the grid
  input += blockIdx.x * cols + elems_per_load * threadIdx.x;
  residual += blockIdx.x * cols + elems_per_load * threadIdx.x;
  weight += elems_per_load * threadIdx.x;
  output += (blockIdx.x * cols + elems_per_load * threadIdx.x) / 2;

  half* residual_start = residual;
  half* residual_smem_buffer = &_smem[0] + elems_per_load * threadIdx.x;

  // Residual connection: in-place add of input to residual, accumulating the squared norm along the way
  float variance = 0.0f;
  float fp32_residual;
  half input_buffer[elems_per_load];
  half residual_buffer[elems_per_load];

  const int loop_stride = elems_per_load * blockDim.x;
  const int iterations = CDIV(cols - elems_per_load * threadIdx.x, loop_stride);
  for (int i = 0; i < iterations; i++) {
    // Load data using 128-bit loads
#pragma unroll
    for (int j = 0; j < elems_per_load; j++) {
      input_buffer[j] = input[j];
    }
#pragma unroll
    for (int j = 0; j < elems_per_load; j++) {
      residual_buffer[j] = residual[j];
    }

    // Add everything in the residual buffer and accumulate variance
#pragma unroll
    for (int j = 0; j < elems_per_load; j++) {
      residual_buffer[j] += input_buffer[j];
      float float_res = (float)residual_buffer[j];
      variance += float_res * float_res;
    }

    // 128-bit smem store
#pragma unroll
    for (int j = 0; j < elems_per_load; j++) {
      if constexpr (USE_SMEM) {
        residual_smem_buffer[j] = residual_buffer[j];
      } else {
        residual[j] = residual_buffer[j];
      }
    }

    // Advance pointers
    input += loop_stride;
    residual += loop_stride;
    residual_smem_buffer += loop_stride;
  }
  variance /= cols;

  // Block reduce to compute the total norm
  __shared__ float shared_normalizer;
  using BlockReduce = hipcub::BlockReduce<float, 1024>;
  __shared__ typename BlockReduce::TempStorage reduceStore;

  variance = BlockReduce(reduceStore).Reduce(variance, hipcub::Sum{}, blockDim.x);
  if (threadIdx.x == 0) {
    shared_normalizer = rsqrtf(variance + epsilon);
  }
  __syncthreads();

  // Normalize and convert
  __half2 weight_buffer[elems_per_load / 2];
  T output_buffer[elems_per_load / 2];

  // Apply inverse scale (only for fp8)
  if constexpr (std::is_same_v<T, __hip_fp8x2_storage_t>) {
    shared_normalizer = shared_normalizer / scale_tensor[0];
  }

  residual = residual_start;
  residual_smem_buffer = &_smem[0] + elems_per_load * threadIdx.x;

  for (int i = 0; i < iterations; i++) {
    // 128-bit loads
#pragma unroll
    for (int j = 0; j < elems_per_load; j++) {
      if constexpr (USE_SMEM) {
        residual_buffer[j] = residual_smem_buffer[j];
      } else {
        residual_buffer[j] = residual[j];
      }
    }
#pragma unroll
    for (int j = 0; j < elems_per_load / 2; j++) {
      weight_buffer[j] = reinterpret_cast<const __half2*>(weight)[j];
    }

    // 128-bit store
#pragma unroll
    for (int j = 0; j < elems_per_load; j++) {
      residual[j] = residual_buffer[j];
    }

    // Compute and fill buffer
#pragma unroll
    for (int j = 0; j < elems_per_load / 2; j++) {
      // Output is fp8
      if constexpr (std::is_same_v<T, __hip_fp8x2_storage_t>) {
        __half2 tmp_res = {residual_buffer[2 * j], residual_buffer[2 * j + 1]};
        // tmp_res = tmp_res * weight_buffer[j];
        float2 tmp_float2 = __half22float2(tmp_res);
        // INCREASES PRECISION | TODO: figure out a better test
        tmp_float2 = tmp_float2 * __half22float2(weight_buffer[j]);
        tmp_float2 *= shared_normalizer;

        // tmp_float2.x = __builtin_amdgcn_fmed3f(tmp_float2.x, 448.0, -448.0);  // TODO: are they needed?
        // tmp_float2.y = __builtin_amdgcn_fmed3f(tmp_float2.y, 448.0, -448.0);  // TODO: are they needed?
        output_buffer[j] = __hip_cvt_float2_to_fp8x2(tmp_float2, __HIP_SATFINITE, __HIP_E4M3_FNUZ);
      }

      // Output is fp16
      if constexpr (std::is_same_v<T, half2>) {
        float2 tmp_float2;
        tmp_float2.x = (float)residual_buffer[2 * j];
        tmp_float2.y = (float)residual_buffer[2 * j + 1];
        tmp_float2 *= shared_normalizer;
        half2 tmp = {(half)tmp_float2.x, (half)tmp_float2.y};
        tmp *= reinterpret_cast<const half2*>(weight_buffer)[j];
        output_buffer[j] = tmp;
      }
    }

    // 64-bit store
#pragma unroll
    for (int j = 0; j < elems_per_load / 2; j++) {
      output[j] = output_buffer[j];
    }

    // Advance pointers
    residual += loop_stride;
    residual_smem_buffer += loop_stride;
    weight += loop_stride;
    output += loop_stride / 2;
  }

  // Initialize next buffer  TODO: add this as a template (eventually w/ vector granularity)
  if constexpr (clean_next_buffer) {
    next_buffer += blockIdx.x * buffer_cols;
    for (int i = elems_per_load * threadIdx.x; i < buffer_cols; i += elems_per_load * blockDim.x) {
#pragma unroll
      for (int j = 0; j < elems_per_load; j++) {
        next_buffer[i + j] = 0;
      }
    }
  }
}

// Nb. rows    Ref (μs)    Pointwise (μs)    Vectorized (μs)
// ----------  ----------  ----------------  -----------------
//          1     40.6864           10.4857             4.8905
//          2     42.8676           10.5499             5.04421
//          4     43.7978           10.5729             5.05962
//          8     44.0237           10.6909             5.10061
//         16     47.1026           10.7823             5.19516
//         32     56.3393           11.0192             5.45101
//         64     74.0383           14.0895             5.86153
//        128     98.3725           15.2012             6.59527
//        256    119.426            27.7393            11.5191

// Nb. rows    Ref (μs)    Pointwise (μs)    Vectorized (μs)
// ----------  ----------  ----------------  -----------------
//          1     38.8908           10.5276             4.28524
//          2     42.8806           10.5337             4.30209
//          4     43.2694           10.6618             4.38874
//          8     43.4718           10.6979             4.41091
//         16     46.6662           10.8013             4.49634
//         32     55.7943           11.0203             4.78883
//         64     75.1326           14.1084             5.38017
//        128    100                15.129              6.31691
//        256    118.571            27.3881            11.1394
residual_rms_rocm/utils.h ADDED
@@ -0,0 +1,15 @@
#pragma once

#define WARPSIZE 64

#define FP8_CLAMP(x, type)                    \
  x = (x > (type)448.0) ? (type)448.0 : x;    \
  x = (x < (type)-448.0) ? (type)-448.0 : x;
// TODO: reformat clamping

#define IS_8B_ALIGNED(tensor) (reinterpret_cast<std::uintptr_t>(tensor.data_ptr()) % 4 == 0)
#define IS_16B_ALIGNED(tensor) (reinterpret_cast<std::uintptr_t>(tensor.data_ptr()) % 16 == 0)

#define CDIV(a, b) (((a) + (b) - 1) / (b))

#define FP8_MAX 224.0f  // TODO: check if this or 448.0f
torch-ext/residual_rms_rocm/__init__.py ADDED
@@ -0,0 +1,3 @@
from .wrapped_rms import residual_rms, reference_residual_rms

__all__ = ["residual_rms", "reference_residual_rms"]
torch-ext/residual_rms_rocm/wrapped_rms.py ADDED
@@ -0,0 +1,171 @@
from typing import Tuple, Optional
import torch
from torch import Tensor

from ._ops import ops


_HIGHEST_RESIDUAL_RMS_MODE = 3


def residual_rms_checks(
    input: Tensor,
    residual: Tensor,
    weight: Tensor,
    scale_tensor: Optional[Tensor],
    epsilon: float,
    next_buffer: Tensor,
) -> None:
    # Check shapes
    assert input.dim() == 2, f"Expected input to have 2 dimensions but got {input.dim() = } instead."
    assert residual.shape == input.shape, \
        f"Expected input and residual to have the same shape but got {input.shape = } and {residual.shape = }"
    assert weight.shape == (input.size(1), ), \
        f"Expected weight to have shape {(input.size(1), ) = } but got {weight.shape = }"
    # Check devices
    device = input.device
    assert device.type == "cuda", f"Expected input.device to be of type cuda, but got {device.type = } instead."
    assert residual.device == device, f"Expected {residual.device = } to be the same as {input.device = }"
    if scale_tensor is not None:
        assert scale_tensor.device == device, f"Expected {scale_tensor.device = } to be the same as {input.device = }"
    assert next_buffer.device == device, f"Expected {next_buffer.device = } to be the same as {input.device = }"
    # Check layouts
    assert input.is_contiguous(), f"Expected input to be contiguous but got {input.stride() = }"
    assert residual.is_contiguous(), f"Expected residual to be contiguous but got {residual.stride() = }"
    # Check scalars
    assert epsilon > 0, f"Expected RMS epsilon to be > 0 to avoid division by zero, but got {epsilon = }"


def residual_rms_choose_mode(
    input: Tensor,
    residual: Tensor,
    weight: Tensor,
    next_buffer: Tensor,
    mode: int,
) -> int:
    cols_is_multiple_of_8 = (input.size(1) % 8 == 0) and (next_buffer.size(1) % 8 == 0)
    tensors_are_16b_aligned = all([x.data_ptr() % 16 == 0 for x in [input, residual, weight]])
    if mode == -1:
        mode = _HIGHEST_RESIDUAL_RMS_MODE if (tensors_are_16b_aligned and cols_is_multiple_of_8) else 0
    elif mode > 0:
        assert tensors_are_16b_aligned, (
            f"Requested {mode = } > 0, which requires tensors to be 16-byte aligned, but got "
            f"{input.data_ptr() % 16 = }, {residual.data_ptr() % 16 = }, {weight.data_ptr() % 16 = }"
        )
        assert cols_is_multiple_of_8, f"Requested {mode = } requires {input.size(1) = } to be a multiple of 8."
    return mode


def infer_num_threads(rows: int, num_threads: int) -> int:
    # Error case
    if num_threads < 0 or num_threads > 1024:
        raise ValueError(f"{num_threads = } is not between 0 and 1024")
    # Case: num_threads was specified
    elif num_threads != 0:
        return num_threads
    # Otherwise, we branch upon the number of rows
    if rows <= 16:
        return 1024
    if rows <= 32:
        return 768
    if rows <= 64:
        return 1024
    if rows <= 256:
        return 960
    return 1024

## Main kernel
def residual_rms(
    input: Tensor,
    residual: Tensor,
    weight: Tensor,
    epsilon: float,
    scale_tensor: Optional[Tensor] = None,
    next_buffer: Optional[Tensor] = None,
    num_threads: int = 0,
    force_scalar: bool = False,
) -> Tuple[Tensor, Tensor]:
    """Kernel that fuses a residual connection, an RMS normalization and a conversion to fp8. The residual argument
    is modified in place (residual <- input + residual).
    Args:
    - input: a fp16 tensor of shape (rows, cols) in row-major format
    - residual: a fp16 tensor of shape (rows, cols) in row-major format
    - weight: a fp16 tensor of shape (cols, ) in row-major format which contains the weight of the RMS norm
    - epsilon: the small epsilon used inside the RMS norm to avoid division by zero
    - scale_tensor: a fp32 one-item tensor to divide the output of the RMS norm before its conversion to fp8. If
      set to None, then the output dtype is fp16
    - next_buffer: an optional tensor of shape (rows, .) to initialize to zero if the output dtype is fp8
    - num_threads: the number of threads per block in the kernel. The default value is 0, which lets the wrapper
      choose a block size based on the number of rows
    Outputs:
    - an fp8 tensor (or fp16 tensor, if scale_tensor is None) of shape (rows, cols) in row-major format
    - the residual modified in place
    """
    if next_buffer is None:
        next_buffer = torch.empty(size=(input.size(0), 0), device=input.device, dtype=torch.float16)

    residual_rms_checks(input, residual, weight, scale_tensor, epsilon, next_buffer)
    num_threads = infer_num_threads(input.size(0), num_threads)

    if scale_tensor is not None:
        output = torch.empty(size=input.shape, dtype=torch.float8_e4m3fnuz, device=input.device)
    else:
        # TODO: here, we could use input as the output tensor
        output = torch.empty(size=input.shape, dtype=torch.float16, device=input.device)
    ops.residual_rms(
        input=input,
        residual=residual,
        weight=weight,
        scale_tensor=scale_tensor,
        epsilon=epsilon,
        output=output,
        next_buffer=next_buffer,
        num_threads=num_threads,
        force_scalar=force_scalar,
    )
    return output, residual

## Reference implementation
def fp8_quantize(
    x_full_precision: Tensor,
    scale: Tensor,
) -> Tuple[Tensor, Tensor]:
    finfo = torch.finfo(torch.float8_e4m3fn)
    x_quantized = (x_full_precision * scale.reciprocal()).clamp(min=finfo.min, max=finfo.max)
    x_quantized = x_quantized.to(torch.float8_e4m3fn)
    weight_as_int8 = x_quantized.view(torch.int8)
    ROCM_FP8_NAN_AS_INT = -128
    mask = weight_as_int8 == ROCM_FP8_NAN_AS_INT
    weight_as_int8[mask] = 0
    x_quantized = weight_as_int8.view(torch.float8_e4m3fnuz)
    return x_quantized, scale * 2.0

def reference_residual_rms(
    input: Tensor,
    residual: Tensor,
    weight: Tensor,
    epsilon: float,
    scale_tensor: Optional[Tensor],
    next_buffer: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Optional[Tensor]]:
    """Reference for the residual_rms operation. Check its docstring for more details; the only difference here is
    that the scale needs to be passed as a tensor and not a float."""
    assert input.dtype == torch.float16, f"Expected torch.float16 but got {input.dtype = }"
    assert residual.dtype == torch.float16, f"Expected torch.float16 but got {residual.dtype = }"
    input += residual
    residual = input
    input = reference_rms(input, epsilon)
    if weight.dtype in [torch.float16, torch.bfloat16]:
        input = input.to(weight.dtype)
    input = weight * input
    if scale_tensor is not None:
        qinput, scale_tensor = fp8_quantize(input, scale_tensor)
        if next_buffer is not None:
            next_buffer.fill_(0)
    else:
        qinput = input
    return qinput, residual, scale_tensor

def reference_rms(x: Tensor, eps: float) -> Tensor:
    x = x.to(torch.float32)
    variance = x.pow(2).mean(-1, keepdim=True)
    return x * torch.rsqrt(variance + eps)
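A quick correctness check of the kernel against the pure-PyTorch reference above might look like the sketch below (assuming a ROCm device and the fp8 path; the tolerances and the dequantized comparison are illustrative, since the reference doubles the scale as part of its e4m3fn -> e4m3fnuz reinterpretation trick):

import torch
from residual_rms_rocm import residual_rms, reference_residual_rms

rows, cols = 4, 1024
x = torch.randn(rows, cols, dtype=torch.float16, device="cuda")
res = torch.randn(rows, cols, dtype=torch.float16, device="cuda")
w = torch.randn(cols, dtype=torch.float16, device="cuda")
scale = torch.tensor([1.0], dtype=torch.float32, device="cuda")

# Both paths modify input and residual in place, so pass clones to keep the comparison fair
out, new_res = residual_rms(x.clone(), res.clone(), w, 1e-6, scale_tensor=scale)
ref_out, ref_res, ref_scale = reference_residual_rms(x.clone(), res.clone(), w, 1e-6, scale_tensor=scale)

# The residual update (input + residual) should match
torch.testing.assert_close(new_res, ref_res)

# The fp8 outputs use different scale conventions, so compare dequantized values with a loose tolerance
torch.testing.assert_close(out.float() * scale, ref_out.float() * ref_scale, atol=0.2, rtol=0.2)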
torch-ext/torch_binding.cpp ADDED
@@ -0,0 +1,12 @@
#include <torch/library.h>

#include "registration.h"
#include "torch_binding.h"


TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.def("residual_rms(Tensor input, Tensor residual, Tensor weight, Tensor scale_tensor, double epsilon, Tensor! output, Tensor next_buffer, int64_t num_threads, bool force_scalar) -> ()");
  ops.impl("residual_rms", torch::kCUDA, &residual_rms);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
torch-ext/torch_binding.h ADDED
@@ -0,0 +1,5 @@
#pragma once

#include <torch/torch.h>

void residual_rms(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, torch::Tensor& scale_tensor, double epsilon, torch::Tensor& output, torch::Tensor& next_buffer, int64_t num_threads, bool force_scalar);