kernels-community/rotary
Commit 0a93654 · Parent: 8849043
danieldk committed: The kernel source is now on GitHub
README.md CHANGED
@@ -1,11 +1,14 @@
  ---
  license: bsd-3-clause
  tags:
  - kernel
  ---

  ![Status](https://hubwebhook.dholtz.com/shield?repo=kernels-community/rotary)

  ## rotary

  rotary embedding kernel from [Flash Attention](https://github.com/Dao-AILab/flash-attention/tree/main/csrc/rotary).
+
+ Kernel source: https://github.com/huggingface/kernels-community/tree/main/rotary
+
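For context, published kernels like this one are consumed through the Hugging Face `kernels` package. Below is a minimal usage sketch; the repo name `kernels-community/rotary` is taken from the README badge, the `apply_rotary` signature from `torch-ext/rotary/__init__.py` further down, and the shapes mirror the deleted tests (illustrative only, not part of this commit):

```python
import torch
from kernels import get_kernel

# Fetch the prebuilt kernel from the Hub (repo name from the README badge).
rotary = get_kernel("kernels-community/rotary")

batch, seqlen, nheads, headdim, rotary_dim = 2, 128, 8, 64, 32
q = torch.randn(batch, seqlen, nheads, headdim, device="cuda", dtype=torch.float16)
cos = torch.randn(seqlen, 1, rotary_dim, device="cuda", dtype=torch.float16)
sin = torch.randn(seqlen, 1, rotary_dim, device="cuda", dtype=torch.float16)

# apply_rotary(x1, x2, cos, sin, out1, out2, conj) writes into out1/out2;
# passing the input slices as outputs rotates q in place.
q1 = q[..., :rotary_dim]
q2 = q[..., rotary_dim : 2 * rotary_dim]
rotary.apply_rotary(q1, q2, cos, sin, q1, q2, False)
```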
build.toml DELETED
@@ -1,19 +0,0 @@
- [general]
- name = "rotary"
- universal = false
-
- [torch]
- src = ["torch-ext/torch_binding.cpp"]
-
- [kernel.activation]
- backend = "cuda"
- depends = ["torch"]
- src = ["rotary/rotary_cuda.cu"]
-
- [kernel.rotary_xpu]
- backend = "xpu"
- depends = ["torch"]
- src = [
-   "rotary-xpu/rotary_xpu.cpp",
-   "rotary-xpu/rotary_xpu.hpp",
- ]
flake.lock DELETED
@@ -1,168 +0,0 @@
- {
-   "nodes": {
-     "flake-compat": {
-       "locked": {
-         "lastModified": 1747046372,
-         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
-         "type": "github"
-       },
-       "original": {
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "type": "github"
-       }
-     },
-     "flake-compat_2": {
-       "locked": {
-         "lastModified": 1747046372,
-         "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
-         "type": "github"
-       },
-       "original": {
-         "owner": "edolstra",
-         "repo": "flake-compat",
-         "type": "github"
-       }
-     },
-     "flake-utils": {
-       "inputs": {
-         "systems": "systems"
-       },
-       "locked": {
-         "lastModified": 1731533236,
-         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "type": "github"
-       }
-     },
-     "flake-utils_2": {
-       "inputs": {
-         "systems": "systems_2"
-       },
-       "locked": {
-         "lastModified": 1731533236,
-         "narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
-         "type": "github"
-       },
-       "original": {
-         "owner": "numtide",
-         "repo": "flake-utils",
-         "type": "github"
-       }
-     },
-     "hf-nix": {
-       "inputs": {
-         "flake-compat": "flake-compat_2",
-         "flake-utils": "flake-utils_2",
-         "nixpkgs": "nixpkgs"
-       },
-       "locked": {
-         "lastModified": 1759493343,
-         "narHash": "sha256-8fhl0gwMAnOkQbogPIVq+Fha+Yeq52FaRXfwF+F9Q+k=",
-         "owner": "huggingface",
-         "repo": "hf-nix",
-         "rev": "b1fc3a18b52447a0f24bc6884418edc5e66082b9",
-         "type": "github"
-       },
-       "original": {
-         "owner": "huggingface",
-         "repo": "hf-nix",
-         "type": "github"
-       }
-     },
-     "kernel-builder": {
-       "inputs": {
-         "flake-compat": "flake-compat",
-         "flake-utils": "flake-utils",
-         "hf-nix": "hf-nix",
-         "nixpkgs": [
-           "kernel-builder",
-           "hf-nix",
-           "nixpkgs"
-         ]
-       },
-       "locked": {
-         "lastModified": 1759501552,
-         "narHash": "sha256-Wnrw3l22y9jdL4C9TGxznIB4qiQznWLtU9ykCbK49EE=",
-         "owner": "huggingface",
-         "repo": "kernel-builder",
-         "rev": "ed5722d95d9395fbc7d0239a97208f2b04147dfa",
-         "type": "github"
-       },
-       "original": {
-         "owner": "huggingface",
-         "repo": "kernel-builder",
-         "type": "github"
-       }
-     },
-     "nixpkgs": {
-       "locked": {
-         "lastModified": 1755963616,
-         "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
-         "owner": "nixos",
-         "repo": "nixpkgs",
-         "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nixos",
-         "ref": "nixos-unstable-small",
-         "repo": "nixpkgs",
-         "type": "github"
-       }
-     },
-     "root": {
-       "inputs": {
-         "kernel-builder": "kernel-builder"
-       }
-     },
-     "systems": {
-       "locked": {
-         "lastModified": 1681028828,
-         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-         "owner": "nix-systems",
-         "repo": "default",
-         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nix-systems",
-         "repo": "default",
-         "type": "github"
-       }
-     },
-     "systems_2": {
-       "locked": {
-         "lastModified": 1681028828,
-         "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
-         "owner": "nix-systems",
-         "repo": "default",
-         "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
-         "type": "github"
-       },
-       "original": {
-         "owner": "nix-systems",
-         "repo": "default",
-         "type": "github"
-       }
-     }
-   },
-   "root": "root",
-   "version": 7
- }
flake.nix DELETED
@@ -1,11 +0,0 @@
- {
-   description = "Flake for Torch kernel extension";
-   inputs = {
-     kernel-builder.url = "github:huggingface/kernel-builder";
-   };
-   outputs = { self, kernel-builder, }:
-     kernel-builder.lib.genFlakeOutputs {
-       inherit self;
-       path = ./.;
-     };
- }
rotary-xpu/rotary_xpu.cpp DELETED
@@ -1,40 +0,0 @@
- #include <torch/all.h>
- #include "rotary_xpu.hpp"
-
- void _apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
-                    torch::Tensor const &cos, torch::Tensor const &sin,
-                    torch::Tensor &out1, torch::Tensor &out2,
-                    bool const conj) {
-   auto iter = at::TensorIteratorConfig()
-                   .add_output(out1)
-                   .add_output(out2)
-                   .add_input(x1)
-                   .add_input(x2)
-                   .add_input(cos)
-                   .add_input(sin)
-                   .check_all_same_dtype(false)
-                   .promote_inputs_to_common_dtype(false)
-                   .build();
-
-   if (!conj) {
-     AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel_xpu", [&] {
-       gpu_kernel_multiple_outputs(
-           iter, [] (scalar_t x1, scalar_t x2, scalar_t cos,
-                     scalar_t sin) -> std::tuple<scalar_t, scalar_t> {
-             scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
-             scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
-             return {out1, out2};
-           });
-     });
-   } else {
-     AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel_xpu", [&] {
-       gpu_kernel_multiple_outputs(
-           iter, [] (scalar_t x1, scalar_t x2, scalar_t cos,
-                     scalar_t sin) -> std::tuple<scalar_t, scalar_t> {
-             scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
-             scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
-             return {out1, out2};
-           });
-     });
-   }
- }
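For reference, this XPU kernel and the CUDA kernel further down compute the same elementwise plane rotation over pairs of rotary features. With $c = \cos\theta$ and $s = \sin\theta$:

$$
\begin{pmatrix} \mathrm{out}_1 \\ \mathrm{out}_2 \end{pmatrix}
=
\begin{pmatrix} c & -s \\ s & c \end{pmatrix}
\begin{pmatrix} x_1 \\ x_2 \end{pmatrix},
\qquad
\text{conj:}\quad
\begin{pmatrix} \mathrm{out}_1 \\ \mathrm{out}_2 \end{pmatrix}
=
\begin{pmatrix} c & s \\ -s & c \end{pmatrix}
\begin{pmatrix} x_1 \\ x_2 \end{pmatrix}
$$

Since $c^2 + s^2 = 1$, the `conj` matrix is the transpose and therefore the inverse of the forward rotation, which is exactly what the sign flips in the `else` branch implement.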
rotary-xpu/rotary_xpu.hpp DELETED
@@ -1,375 +0,0 @@
- #include <ATen/core/TensorBody.h>
- #include <ATen/detail/FunctionTraits.h>
- #include <ATen/native/TensorIterator.h>
- #include <sycl/sycl.hpp>
- #include <ATen/core/Array.h>
- #include <c10/macros/Macros.h>
- #include <c10/util/Exception.h>
- #include <c10/util/TypeCast.h>
- #include <cstdint>
- #include <type_traits>
- #include <array>
- #include <c10/core/ScalarType.h>
- #include <c10/xpu/XPUStream.h>
- #include <ATen/xpu/XPUContext.h>
-
- constexpr int MAX_DIMS = 12;
-
- struct LoadWithoutCast {
-   template <typename scalar_t>
-   C10_DEVICE scalar_t load(char* base_ptr, uint32_t offset, int arg) {
-     return c10::load(reinterpret_cast<scalar_t*>(base_ptr) + offset);
-   }
- };
-
- struct StoreWithoutCast {
-   template <typename scalar_t>
-   C10_DEVICE void store(scalar_t value, char* base_ptr, uint32_t offset, int arg = 0) {
-     *(reinterpret_cast<scalar_t*>(base_ptr) + offset) = value;
-   }
- };
-
- template <template <int i> typename func, int end, int current = 0>
- struct static_unroll {
-   template <typename... Args>
-   static inline C10_HOST_DEVICE void with_args(Args&&... args) {
-     func<current>::apply(std::forward<Args>(args)...);
-     static_unroll<func, end, current + 1>::with_args(args...);
-   }
- };
-
- template <template <int i> typename func, int end>
- struct static_unroll<func, end, end> {
-   template <typename... Args>
-   static inline C10_HOST_DEVICE void with_args(Args... args) {}
- };
-
- template <int current>
- struct multi_outputs_store_helper {
-   template <int ntensors, int num_outputs, typename... Args>
-   static C10_HOST_DEVICE void apply(
-       at::detail::Array<char*, ntensors> data,
-       at::detail::Array<uint32_t, num_outputs> offsets,
-       std::tuple<Args...> ret) {
-     using T = typename std::tuple_element<current, std::tuple<Args...>>::type;
-     T* to = reinterpret_cast<T*>(data[current]) + offsets[current];
-     *to = std::get<current>(ret);
-   }
- };
-
- template <int arg_index>
- struct unroll_load_helper {
-   template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
-   static C10_DEVICE void apply(
-       policy_t& self,
-       args_t* args,
-       offset_t offset,
-       loader_t loader,
-       int j,
-       int num_outputs) {
-     using arg_t = std::tuple_element_t<arg_index, args_t>;
-     std::get<arg_index>(args[j]) = loader.template load<arg_t>(
-         self.data[arg_index + num_outputs], offset[arg_index], arg_index);
-   }
- };
-
- template <int item_work_size, typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
- struct multi_outputs_unroll {
-   data_t data;
-   int remaining;
-   inp_calc_t input_offset_calculator;
-   out_calc_t output_offset_calculator;
-   LoadWithoutCast loader;
-   StoreWithoutCast storer;
-   int item_idx;
-   int group_idx;
-   int num_items_per_group;
-   int group_work_size;
-
-   multi_outputs_unroll(
-       data_t data,
-       int remaining,
-       inp_calc_t ic,
-       out_calc_t oc,
-       int item_idx,
-       int group_idx,
-       int num_items_per_group)
-       : data(data),
-         remaining(remaining),
-         input_offset_calculator(ic),
-         output_offset_calculator(oc),
-         item_idx(item_idx),
-         group_idx(group_idx),
-         num_items_per_group(num_items_per_group),
-         group_work_size(item_work_size * num_items_per_group) {}
-
-   inline bool check_inbounds(int item_work_elem) const {
-     return (item_idx + item_work_elem * num_items_per_group < remaining);
-   }
-
-   template <typename args_t>
-   inline void load(args_t* args) {
-     constexpr int arity = std::tuple_size<args_t>::value;
-     int item_idx_ = item_idx;
- #pragma unroll
-     for (int i = 0; i < item_work_size; i++) {
-       if (item_idx_ >= remaining) {
-         return;
-       }
-       int linear_idx = item_idx_ + group_work_size * group_idx;
-       auto offset = input_offset_calculator.get(linear_idx);
-       static_unroll<unroll_load_helper, arity>::with_args(
-           *this, args, offset, loader, i, num_outputs);
-       item_idx_ += num_items_per_group;
-     }
-   }
-
-   template <typename return_t>
-   inline void store(return_t* from) {
-     int item_idx_ = item_idx;
- #pragma unroll
-     for (int i = 0; i < item_work_size; i++) {
-       if (item_idx_ >= this->remaining) {
-         return;
-       }
-       int linear_idx = item_idx_ + group_work_size * group_idx;
-       auto offsets = this->output_offset_calculator.get(linear_idx);
-       static_unroll<multi_outputs_store_helper, num_outputs>::with_args(this->data, offsets, from[i]);
-       item_idx_ += num_items_per_group;
-     }
-   }
- };
-
- template <int item_work_size, typename func_t, typename policy_t>
- inline void elementwise_kernel_helper(func_t f, policy_t policy) {
-   using traits = function_traits<func_t>;
-   using return_t = typename traits::result_type;
-   using args_t = typename traits::ArgsTuple;
-
-   return_t results[item_work_size];
-   args_t args[item_work_size];
-
-   policy.load(args);
-
- #pragma unroll
-   for (int i = 0; i < item_work_size; i++) {
-     if (policy.check_inbounds(i)) {
-       results[i] = std::apply(f, args[i]);
-     }
-   }
-
-   policy.store(results);
- }
-
- template <int num_outputs, typename func_t, typename array_t, typename in_calc_t, typename out_calc_t>
- struct UnrolledElementwiseForMultiOutputsKernel {
-   static constexpr int item_work_size = 4;
-
-   void operator()(sycl::nd_item<1> item_id) const {
-     int grpsz = item_id.get_local_range(0);
-     int grpid = item_id.get_group(0);
-     int lid = item_id.get_local_id(0);
-     int remaining = numel_ - item_work_size * grpsz * grpid;
-     auto policy = multi_outputs_unroll<item_work_size, array_t, in_calc_t, out_calc_t, num_outputs>(
-         data_, remaining, ic_, oc_, lid, grpid, grpsz);
-     elementwise_kernel_helper<item_work_size>(f_, policy);
-   };
-
-   UnrolledElementwiseForMultiOutputsKernel(int numel, func_t f, array_t data, in_calc_t ic, out_calc_t oc)
-       : numel_(numel), f_(f), data_(data), ic_(ic), oc_(oc) {}
-
-  private:
-   int numel_;
-   func_t f_;
-   array_t data_;
-   in_calc_t ic_;
-   out_calc_t oc_;
- };
-
- template <typename Value>
- struct IntDivider {
-   IntDivider() = default;
-   IntDivider(Value d) : divisor(d) {}
-
-   C10_HOST_DEVICE inline Value div(Value n) const {
-     return n / divisor;
-   }
-   C10_HOST_DEVICE inline Value mod(Value n) const {
-     return n % divisor;
-   }
-   C10_HOST_DEVICE inline auto divmod(Value n) const {
-     return std::make_pair(n / divisor, n % divisor);
-   }
-
-   Value divisor;
- };
-
- template <int NARGS, typename index_t = uint32_t, bool signed_strides = false>
- struct OffsetCalculator {
-   using stride_t = std::conditional_t<signed_strides, std::make_signed_t<index_t>, index_t>;
-   using offset_type = at::detail::Array<stride_t, std::max<int>(NARGS, 1)>;
-
-   OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes = nullptr)
-       : dims(dims) {
-     TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
-     for (int i = 0; i < dims; i++) {
-       sizes_[i] = IntDivider<index_t>(sizes[i]);
-       for (int arg = 0; arg < NARGS; arg++) {
-         int64_t element_size = (element_sizes == nullptr ? 1LL : element_sizes[arg]);
-         strides_[i][arg] = strides[arg][i] / element_size;
-       }
-     }
-   }
-
-   C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
-     offset_type offsets;
- #pragma unroll
-     for (int arg = 0; arg < NARGS; arg++) {
-       offsets[arg] = 0;
-     }
-
- #pragma unroll
-     for (int dim = 0; dim < MAX_DIMS; ++dim) {
-       if (dim == dims) {
-         break;
-       }
-       auto divmod = sizes_[dim].divmod(linear_idx);
-       linear_idx = divmod.first;
-
- #pragma unroll
-       for (int arg = 0; arg < NARGS; arg++) {
-         offsets[arg] += divmod.second * strides_[dim][arg];
-       }
-     }
-     return offsets;
-   }
-
-   int dims;
-   IntDivider<index_t> sizes_[MAX_DIMS];
-   stride_t strides_[MAX_DIMS][std::max<int>(NARGS, 1)];
- };
-
- template <int N>
- static OffsetCalculator<N> make_input_offset_calculator(const at::TensorIteratorBase& iter) {
-   constexpr int array_size = std::max<int>(N, 1);
-   TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs());
-   std::array<const int64_t*, array_size> strides;
-   int64_t element_sizes[array_size];
-   for (int i = 0; i < N; i++) {
-     strides[i] = iter.strides(i + iter.noutputs()).data();
-     element_sizes[i] = iter.element_size(i + iter.noutputs());
-   }
-   return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
- }
-
- template <int num_outputs = 1>
- static OffsetCalculator<num_outputs> make_output_offset_calculator(const at::TensorIteratorBase& iter) {
-   TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs());
-   std::array<const int64_t*, num_outputs> strides;
-   int64_t element_sizes[num_outputs];
-   for (int i = 0; i < num_outputs; i++) {
-     strides[i] = iter.strides(i).data();
-     element_sizes[i] = iter.element_size(i);
-   }
-   return OffsetCalculator<num_outputs>(iter.ndim(), iter.shape().data(), strides.data(), element_sizes);
- }
-
- static inline int64_t syclMaxWorkItemsPerSubSlice(at::DeviceIndex dev_id = c10::xpu::getCurrentXPUStream().device_index()) {
-   auto* dev_prop = at::xpu::getDeviceProperties(dev_id);
-   int64_t simd_width = dev_prop->sub_group_sizes[0];
-   int64_t eu_count = dev_prop->gpu_eu_count_per_subslice;
-   return simd_width * eu_count;
- }
-
- template<class T>
- T ceil_div(T dividend, T divisor) {
-   return (dividend + divisor - 1) / divisor;
- }
-
- template <typename ker_t>
- static inline void sycl_kernel_submit(int64_t global_range, int64_t local_range, ::sycl::queue q, ker_t ker) {
-   q.parallel_for(
-       sycl::nd_range<1>(sycl::range<1>(global_range), sycl::range<1>(local_range)),
-       ker
-   );
- }
-
- template <int num_outputs, typename func_t, typename array_t, typename in_calc_t, typename out_calc_t>
- static inline void launch_unrolled_kernel_for_multi_outputs(
-     int64_t N,
-     const func_t& f,
-     array_t data,
-     in_calc_t ic,
-     out_calc_t oc) {
-   TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
-
-   auto ker = UnrolledElementwiseForMultiOutputsKernel<num_outputs, func_t, array_t, in_calc_t, out_calc_t>(N, f, data, ic, oc);
-   using ker_t = decltype(ker);
-
-   int wg_sz = syclMaxWorkItemsPerSubSlice();
-   int num_wg = ceil_div<int>(N, ker_t::item_work_size * wg_sz);
-   sycl_kernel_submit(wg_sz * num_wg, wg_sz, c10::xpu::getCurrentXPUStream().queue(), ker);
- }
-
- template <int N>
- struct TrivialOffsetCalculator {
-   using offset_type = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
-
-   C10_HOST_DEVICE offset_type get(uint32_t linear_idx) const {
-     offset_type offsets;
- #pragma unroll
-     for (int arg = 0; arg < N; arg++) {
-       offsets[arg] = linear_idx;
-     }
-     return offsets;
-   }
- };
-
- template <typename func_t>
- void gpu_kernel_multiple_outputs_impl(at::TensorIteratorBase& iter, const func_t& f) {
-   using traits = function_traits<func_t>;
-   using output_t = typename traits::result_type;
-   constexpr int num_outputs = std::tuple_size<output_t>::value;
-   constexpr int num_inputs = traits::arity;
-   constexpr int ntensors = num_outputs + num_inputs;
-
-   TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
-   TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors);
-
-   at::detail::Array<char*, ntensors> data;
-   for (int i = 0; i < ntensors; i++) {
-     data[i] = (char*)iter.data_ptr(i);
-   }
-
-   int64_t numel = iter.numel();
-
-   if (iter.is_contiguous()) {
-     auto input_calc = TrivialOffsetCalculator<num_inputs>();
-     auto output_calc = TrivialOffsetCalculator<num_outputs>();
-     launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
-   } else {
-     auto input_calc = make_input_offset_calculator<num_inputs>(iter);
-     auto output_calc = make_output_offset_calculator<num_outputs>(iter);
-     launch_unrolled_kernel_for_multi_outputs<num_outputs>(numel, f, data, input_calc, output_calc);
-   }
- }
-
- template <typename func_t>
- void gpu_kernel_multiple_outputs(at::TensorIteratorBase& iter, const func_t& f) {
-   for (int arg = 0; arg < iter.ntensors(); arg++) {
-     TORCH_INTERNAL_ASSERT(iter.device(arg).is_xpu());
-   }
-
-   if (iter.numel() == 0) {
-     return;
-   }
-
-   if (!iter.can_use_32bit_indexing()) {
-     for (auto& sub_iter : iter.with_32bit_indexing()) {
-       gpu_kernel_multiple_outputs(sub_iter, f);
-     }
-     return;
-   }
-
-   gpu_kernel_multiple_outputs_impl(iter, f);
- }
rotary/rotary_cuda.cu DELETED
@@ -1,45 +0,0 @@
- /******************************************************************************
-  * Copyright (c) 2023, Tri Dao.
-  ******************************************************************************/
-
- #include <torch/all.h>
- #include <ATen/native/TensorIterator.h>
- #include <ATen/native/cuda/Loops.cuh>
-
- void _apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
-                    torch::Tensor const &cos, torch::Tensor const &sin,
-                    torch::Tensor &out1, torch::Tensor &out2,
-                    bool const conj) {
-   auto iter = at::TensorIteratorConfig()
-                   .add_output(out1)
-                   .add_output(out2)
-                   .add_input(x1)
-                   .add_input(x2)
-                   .add_input(cos)
-                   .add_input(sin)
-                   .check_all_same_dtype(false)
-                   .promote_inputs_to_common_dtype(false)
-                   .build();
-
-   if (!conj) {
-     AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
-       at::native::gpu_kernel_multiple_outputs(
-           iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
-                                scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
-             scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
-             scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
-             return {out1, out2};
-           });
-     });
-   } else {
-     AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, x1.scalar_type(), "rotary_kernel", [&] {
-       at::native::gpu_kernel_multiple_outputs(
-           iter, [] GPU_LAMBDA (scalar_t x1, scalar_t x2, scalar_t cos,
-                                scalar_t sin) -> thrust::tuple<scalar_t, scalar_t> {
-             scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
-             scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
-             return {out1, out2};
-           });
-     });
-   }
- }
tests/__init__.py DELETED
File without changes
tests/test_rotary.py DELETED
@@ -1,130 +0,0 @@
- import pytest
- import torch
-
- from tests.utils import infer_device, supports_bfloat16
- from pathlib import Path
-
- # import rotary
- # from transformers.trainer_utils import set_seed
- # set_seed(42)
-
- # Set the local repo path, relative path
- try:
-     import rotary
- except ImportError:
-     from kernels import get_local_kernel
-     repo_path = Path(__file__).parent.parent
-     rotary = get_local_kernel(repo_path=repo_path, package_name="rotary")
-
- def apply_rotary_torch(x1: torch.Tensor, x2: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, conj: bool = False):
-     assert x1.shape == x2.shape, "x1 and x2 must have the same shape"
-
-     if not conj:
-         out1 = x1 * cos - x2 * sin
-         out2 = x1 * sin + x2 * cos
-     else:
-         out1 = x1 * cos + x2 * sin
-         out2 = -x1 * sin + x2 * cos
-     return out1, out2
-
-
- def apply_rotary_torch_wrapper(q, k, cos, sin, conj: bool = False):
-     """the wrapper for apply_rotary_torch"""
-     rotary_dim = cos.shape[-1]
-
-     # apply rotation encoding to Q
-     q1 = q[..., :rotary_dim]
-     q2 = q[..., rotary_dim : 2 * rotary_dim]
-     q_out_1, q_out_2 = apply_rotary_torch(q1, q2, cos, sin, conj)
-     q_out = torch.cat([q_out_1, q_out_2, q[..., 2 * rotary_dim:]], dim=-1)
-
-     # apply rotation encoding to K
-     k1 = k[..., :rotary_dim]
-     k2 = k[..., rotary_dim : 2 * rotary_dim]
-     k_out_1, k_out_2 = apply_rotary_torch(k1, k2, cos, sin, conj)
-     k_out = torch.cat([k_out_1, k_out_2, k[..., 2 * rotary_dim:]], dim=-1)
-
-     return q_out, k_out
-
-
- def apply_rotary_kernel_wrapper(q, k, cos, sin, conj: bool = False):
-     """the wrapper for apply_rotary_kernel"""
-     rotary_dim = cos.shape[-1]
-
-     # apply rotation encoding to Q
-     q1 = q[..., :rotary_dim]
-     q2 = q[..., rotary_dim : 2 * rotary_dim]
-     rotary.apply_rotary(q1, q2, cos, sin, q1, q2, conj)
-
-     # apply rotation encoding to K
-     k1 = k[..., :rotary_dim]
-     k2 = k[..., rotary_dim : 2 * rotary_dim]
-     rotary.apply_rotary(k1, k2, cos, sin, k1, k2, conj)
-
-
- @pytest.mark.parametrize("batch_size", [1, 2])
- @pytest.mark.parametrize("nheads", [8, 16])
- @pytest.mark.parametrize("seqlen", [128, 256])
- @pytest.mark.parametrize("headdim, rotary_dim", [(64, 32), (128, 64), (64, 30)])
- @pytest.mark.parametrize("qk_dim", [3, 4])
- @pytest.mark.parametrize(
-     "dtype, atol, rtol",
-     [
-         (torch.float32, 1e-5, 1e-5),
-         pytest.param(
-             torch.bfloat16,
-             1e-1,
-             1e-5,
-             marks=pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
-         ),
-     ],
- )
- @pytest.mark.parametrize("conj", [False, True])
- @pytest.mark.flaky(max_runs=2, min_passes=1)
- def test_rotary_equivalence(batch_size, nheads, seqlen, headdim, rotary_dim, qk_dim, dtype, atol, rtol, conj):
-     device = infer_device()
-     if device is None:
-         pytest.skip("No suitable device found for testing")
-
-     if qk_dim == 4:
-         q_shape = (batch_size, seqlen, nheads, headdim)
-         cos_sin_shape = (seqlen, 1, rotary_dim)
-     elif qk_dim == 3:
-         q_shape = (batch_size * seqlen, nheads, headdim)
-         cos_sin_shape = (batch_size * seqlen, 1, rotary_dim)
-
-     q_orig = torch.randn(q_shape, device=device, dtype=dtype)
-     k_orig = torch.randn(q_shape, device=device, dtype=dtype)
-     cos = torch.randn(cos_sin_shape, device=device, dtype=dtype)
-     sin = torch.randn(cos_sin_shape, device=device, dtype=dtype)
-
-     q_kernel, k_kernel = q_orig.clone(), k_orig.clone()
-     q_torch, k_torch = q_orig.clone(), k_orig.clone()
-
-     q_torch_out, k_torch_out = apply_rotary_torch_wrapper(q_torch, k_torch, cos, sin, conj)
-     apply_rotary_kernel_wrapper(q_kernel, k_kernel, cos, sin, conj)
-
-     # verify the rotation results of Q and K are consistent
-     try:
-         assert torch.allclose(q_torch_out, q_kernel, atol=atol, rtol=rtol), "Rotary transformation results for Q do not match"
-     except AssertionError:
-         diff_q = torch.abs(q_torch_out - q_kernel)
-         max_diff_q = torch.max(diff_q)
-         print(f"Max difference for Q: {max_diff_q}")
-         raise
-     try:
-         assert torch.allclose(k_torch_out, k_kernel, atol=atol, rtol=rtol), "Rotary transformation results for K do not match"
-     except AssertionError:
-         diff_k = torch.abs(k_torch_out - k_kernel)
-         max_diff_k = torch.max(diff_k)
-         print(f"Max difference for K: {max_diff_k}")
-         raise
-
-     # verify the non-rotated part of Q and K remains unchanged
-     if (2 * rotary_dim) < headdim:
-         assert torch.equal(
-             q_kernel[..., 2 * rotary_dim:], q_orig[..., 2 * rotary_dim:]
-         ), "Non-rotated part of Q should be unchanged"
-         assert torch.equal(
-             k_kernel[..., 2 * rotary_dim:], k_orig[..., 2 * rotary_dim:]
-         ), "Non-rotated part of K should be unchanged"
tests/utils.py DELETED
@@ -1,23 +0,0 @@
- import torch
-
-
- def infer_device():
-     """
-     Get current device name based on available devices
-     """
-     if torch.cuda.is_available():  # Works for both Nvidia and AMD
-         return "cuda"
-     elif torch.xpu.is_available():
-         return "xpu"
-     else:
-         return None
-
-
- def supports_bfloat16():
-     device = infer_device()
-     if device == "cuda":
-         return torch.cuda.get_device_capability() >= (8, 0)  # Ampere and newer
-     elif device == "xpu":
-         return True
-     else:
-         return False
torch-ext/rotary/__init__.py DELETED
@@ -1,19 +0,0 @@
- from typing import Tuple
- import torch
-
- from ._ops import ops
-
-
- def apply_rotary(
-     x1: torch.Tensor,
-     x2: torch.Tensor,
-     cos: torch.Tensor,
-     sin: torch.Tensor,
-     out1: torch.Tensor,
-     out2: torch.Tensor,
-     conj: bool,
- ):
-     ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
-
-
- __all__ = ["apply_rotary"]
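The wrapper returns nothing and mutates `out1`/`out2`; because the outputs may alias the inputs, callers typically rotate in place, as the deleted tests do. A small sanity sketch of the in-place and `conj` semantics (repo name assumed as in the README note above; shapes illustrative): the forward rotation followed by the conjugate one recovers the input, since the two matrices are mutual inverses.

```python
import torch
from kernels import get_kernel

rotary = get_kernel("kernels-community/rotary")  # repo name assumed

# Use real angles so that cos^2 + sin^2 = 1 and conj is an exact inverse.
theta = torch.rand(128, 1, 32, device="cuda")
cos, sin = theta.cos(), theta.sin()

x1 = torch.randn(4, 128, 8, 32, device="cuda")
x2 = torch.randn(4, 128, 8, 32, device="cuda")
orig1, orig2 = x1.clone(), x2.clone()

rotary.apply_rotary(x1, x2, cos, sin, x1, x2, False)  # forward rotation, in place
rotary.apply_rotary(x1, x2, cos, sin, x1, x2, True)   # conjugate rotation undoes it
torch.testing.assert_close(x1, orig1)
torch.testing.assert_close(x2, orig2)
```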
torch-ext/torch_binding.cpp DELETED
@@ -1,54 +0,0 @@
- #include <torch/all.h>
-
- #if defined(CUDA_KERNEL)
- #include <c10/cuda/CUDAGuard.h>
- #elif defined(XPU_KERNEL)
- #include <c10/core/DeviceGuard.h>
- #endif
-
- #include "registration.h"
-
- #define CHECK_DEVICE(x) TORCH_CHECK(x.device().type() == torch::kCUDA || x.device().type() == torch::kXPU, #x " must be on CUDA or XPU")
- #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
-
- void _apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
-                    torch::Tensor const &cos, torch::Tensor const &sin,
-                    torch::Tensor &out1, torch::Tensor &out2,
-                    bool const conj);
-
- void apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
-                   torch::Tensor const &cos, torch::Tensor const &sin,
-                   torch::Tensor &out1, torch::Tensor &out2,
-                   bool const conj) {
-   CHECK_DEVICE(x1); CHECK_DEVICE(x2);
-   CHECK_DEVICE(cos); CHECK_DEVICE(sin);
-   CHECK_DEVICE(out1); CHECK_DEVICE(out2);
-   TORCH_CHECK(x1.dtype() == x2.dtype());
-   TORCH_CHECK(cos.dtype() == sin.dtype());
-   TORCH_CHECK(out1.dtype() == out2.dtype());
-   TORCH_CHECK(x1.dtype() == cos.dtype());
-   TORCH_CHECK(x1.dtype() == out1.dtype());
-   TORCH_CHECK(x1.sizes() == x2.sizes());
-   TORCH_CHECK(cos.sizes() == sin.sizes());
-   TORCH_CHECK(out1.sizes() == out2.sizes());
-
-   #if defined(CUDA_KERNEL)
-   // Otherwise the kernel will be launched from cuda:0 device
-   at::cuda::CUDAGuard device_guard{x1.device()};
-   #elif defined(XPU_KERNEL)
-   c10::DeviceGuard device_guard{x1.device()};
-   #endif
-   _apply_rotary(x1, x2, cos, sin, out1, out2, conj);
- }
-
- TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
-   ops.def("apply_rotary(Tensor x1, Tensor x2, Tensor cos, Tensor sin,"
-           "Tensor! out1, Tensor! out2, bool conj) -> ()");
-   #if defined(CUDA_KERNEL)
-   ops.impl("apply_rotary", torch::kCUDA, &apply_rotary);
-   #elif defined(XPU_KERNEL)
-   ops.impl("apply_rotary", torch::kXPU, &apply_rotary);
-   #endif
- }
-
- REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
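In the schema, `Tensor!` marks `out1`/`out2` as mutated in place, and the library is registered under the build-time `TORCH_EXTENSION_NAME` namespace. A hypothetical sketch of calling the raw registered op, assuming that name expanded to `rotary` and the shared library is already loaded (in a packaged build the generated `_ops` module resolves the real namespace, and the Python wrapper above is the supported entry point):

```python
import torch

# Hypothetical namespace: assumes TORCH_EXTENSION_NAME expanded to `rotary`
# and the extension's shared library has been loaded into this process.
x1 = torch.randn(128, 8, 32, device="cuda")
x2 = torch.randn(128, 8, 32, device="cuda")
cos = torch.randn(128, 1, 32, device="cuda")
sin = torch.randn(128, 1, 32, device="cuda")
out1, out2 = torch.empty_like(x1), torch.empty_like(x2)

torch.ops.rotary.apply_rotary(x1, x2, cos, sin, out1, out2, False)
```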