YangKai0616 committed on Sep 12

Commit

b87db80

1 Parent(s): 74b8263

Add support for XPU(sycl)

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

build.toml +8 -0
build/torch27-cxx11-cu118-x86_64-linux/rotary/__init__.py +0 -0
build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-cu118-x86_64-linux/rotary/_ops.py +3 -3
build/torch27-cxx11-cu118-x86_64-linux/rotary/{_rotary_6b8e81d.abi3.so → _rotary_cd1413b_dirty.abi3.so} +2 -2
build/torch27-cxx11-cu126-x86_64-linux/rotary/__init__.py +0 -0
build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-cu126-x86_64-linux/rotary/_ops.py +3 -3
build/torch27-cxx11-cu126-x86_64-linux/rotary/{_rotary_6b8e81d.abi3.so → _rotary_cd1413b_dirty.abi3.so} +2 -2
build/torch27-cxx11-cu128-x86_64-linux/rotary/__init__.py +0 -0
build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-cu128-x86_64-linux/rotary/_ops.py +3 -3
build/torch27-cxx11-cu128-x86_64-linux/rotary/{_rotary_6b8e81d.abi3.so → _rotary_cd1413b_dirty.abi3.so} +2 -2
build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__init__.py +19 -0
build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_ops.py +9 -0
build/{torch28-cxx11-cu126-x86_64-linux/rotary/_rotary_d5e8892.abi3.so → torch27-cxx11-xpu20250-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so} +2 -2
build/torch28-cxx11-cu126-x86_64-linux/rotary/__init__.py +0 -0
build/torch28-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-cu126-x86_64-linux/rotary/_ops.py +3 -3
build/torch28-cxx11-cu126-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so +3 -0
build/torch28-cxx11-cu128-x86_64-linux/rotary/__init__.py +0 -0
build/torch28-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-cu128-x86_64-linux/rotary/_ops.py +3 -3
build/torch28-cxx11-cu128-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so +3 -0
build/torch28-cxx11-cu128-x86_64-linux/rotary/_rotary_d5e8892.abi3.so +0 -3
build/torch28-cxx11-cu129-x86_64-linux/rotary/__init__.py +0 -0
build/torch28-cxx11-cu129-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-cu129-x86_64-linux/rotary/_ops.py +3 -3
build/torch28-cxx11-cu129-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so +3 -0
build/torch28-cxx11-cu129-x86_64-linux/rotary/_rotary_d5e8892.abi3.so +0 -3
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__init__.py +19 -0
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/__init__.cpython-311.pyc +0 -0
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc +0 -0
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/_ops.cpython-311.pyc +0 -0
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc +0 -0
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/_ops.py +9 -0
build/torch28-cxx11-xpu20251-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so +3 -0
flake.lock +13 -14
flake.nix +3 -9
rotary-xpu/rotary_xpu.cpp +40 -0
rotary-xpu/rotary_xpu.hpp +375 -0
tests/__init__.py +0 -0

build.toml CHANGED Viewed

@@ -9,3 +9,11 @@ src = ["torch-ext/torch_binding.cpp"]
 backend = "cuda"
 depends = ["torch"]
 src = ["rotary/rotary_cuda.cu"]

 backend = "cuda"
 depends = ["torch"]
 src = ["rotary/rotary_cuda.cu"]
+[kernel.rotary_xpu]
+backend = "xpu"
+depends = ["torch"]
+src = [
+    "rotary-xpu/rotary_xpu.cpp",
+    "rotary-xpu/rotary_xpu.hpp",
+    ]

build/torch27-cxx11-cu118-x86_64-linux/rotary/__init__.py CHANGED Viewed

File without changes

build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (843 Bytes). View file

build/torch27-cxx11-cu118-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (557 Bytes). View file

build/torch27-cxx11-cu118-x86_64-linux/rotary/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _rotary_6b8e81d
-ops = torch.ops._rotary_6b8e81d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_rotary_6b8e81d::{op_name}"

 import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch27-cxx11-cu118-x86_64-linux/rotary/{_rotary_6b8e81d.abi3.so → _rotary_cd1413b_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e67a5779587b10616a0e19961ae50495c367f53da7c4a40a1c9b1f557537441d
-size 6287712

 version https://git-lfs.github.com/spec/v1
+oid sha256:51c8d8635b97b599a33ba169458b47b9276f673c678c413107a5cab5a835f90e
+size 6807672

build/torch27-cxx11-cu126-x86_64-linux/rotary/__init__.py CHANGED Viewed

File without changes

build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (843 Bytes). View file

build/torch27-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (557 Bytes). View file

build/torch27-cxx11-cu126-x86_64-linux/rotary/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _rotary_6b8e81d
-ops = torch.ops._rotary_6b8e81d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_rotary_6b8e81d::{op_name}"

 import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch27-cxx11-cu126-x86_64-linux/rotary/{_rotary_6b8e81d.abi3.so → _rotary_cd1413b_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ced538b9a02abba45b7769dd09902cbc1816091ec50da2b65c0549f85974ea4c
-size 6296464

 version https://git-lfs.github.com/spec/v1
+oid sha256:6b49a2fb4c22c6cda6d4d28d1f5eb3ad84801174c7790628519f0c7529a57773
+size 6820520

build/torch27-cxx11-cu128-x86_64-linux/rotary/__init__.py CHANGED Viewed

File without changes

build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (843 Bytes). View file

build/torch27-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (557 Bytes). View file

build/torch27-cxx11-cu128-x86_64-linux/rotary/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _rotary_6b8e81d
-ops = torch.ops._rotary_6b8e81d
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_rotary_6b8e81d::{op_name}"

 import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch27-cxx11-cu128-x86_64-linux/rotary/{_rotary_6b8e81d.abi3.so → _rotary_cd1413b_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e4a64022928f62fe0b476c54f5a426b856296fc2bba796c7e9fe6406c6d65485
-size 10157336

 version https://git-lfs.github.com/spec/v1
+oid sha256:638bbc069d927f9e37f1e720e73ee4af097ec16fc882f7abcc04dae2045b80a1
+size 10529832

build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from typing import Tuple
+import torch
+from ._ops import ops
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    The op's result is discarded, so this wrapper returns ``None``.
+    NOTE(review): ``out1`` and ``out2`` are presumably filled in place by the
+    kernel -- confirm against the op registration.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+__all__ = ["apply_rotary"]

build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (846 Bytes). View file

build/torch27-cxx11-xpu20250-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (560 Bytes). View file

build/torch27-cxx11-xpu20250-x86_64-linux/rotary/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``"_rotary_cd1413b_dirty::<op_name>"``,
+    i.e. with the same namespace name that ``ops`` is looked up under.
+    """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/{torch28-cxx11-cu126-x86_64-linux/rotary/_rotary_d5e8892.abi3.so → torch27-cxx11-xpu20250-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so} RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:663ac605052413c175dcff4dec8545aa37ad67e8629002621741b89b182d98cd
-size 6293168

 version https://git-lfs.github.com/spec/v1
+oid sha256:f7f449e098ab5bbe9ca35e2a904132ed4e378e54d579aefe95e4e83e07a73bfe
+size 2248696

build/torch28-cxx11-cu126-x86_64-linux/rotary/__init__.py CHANGED Viewed

File without changes

build/torch28-cxx11-cu126-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (843 Bytes). View file

build/torch28-cxx11-cu126-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (557 Bytes). View file

build/torch28-cxx11-cu126-x86_64-linux/rotary/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _rotary_d5e8892
-ops = torch.ops._rotary_d5e8892
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_rotary_d5e8892::{op_name}"

 import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch28-cxx11-cu126-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ea610dc89fe7d0037e55ff171158e099f79a08fcd14a8aff117c7b090d79a6e2
+size 6817216

build/torch28-cxx11-cu128-x86_64-linux/rotary/__init__.py CHANGED Viewed

File without changes

build/torch28-cxx11-cu128-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (843 Bytes). View file

build/torch28-cxx11-cu128-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (557 Bytes). View file

build/torch28-cxx11-cu128-x86_64-linux/rotary/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _rotary_d5e8892
-ops = torch.ops._rotary_d5e8892
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_rotary_d5e8892::{op_name}"

 import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch28-cxx11-cu128-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f8af2b94a8121a7a8b6ac446cb6eb117d49cb4ea8842d7024bb1b9b26fb97db
+size 10526424

build/torch28-cxx11-cu128-x86_64-linux/rotary/_rotary_d5e8892.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:264410fa13bca33e706d1c3eb12a2d966e8fa07e2b786cbd8332d462f4883d1a
-size 10149824

build/torch28-cxx11-cu129-x86_64-linux/rotary/__init__.py CHANGED Viewed

File without changes

build/torch28-cxx11-cu129-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (843 Bytes). View file

build/torch28-cxx11-cu129-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (557 Bytes). View file

build/torch28-cxx11-cu129-x86_64-linux/rotary/_ops.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
-from . import _rotary_d5e8892
-ops = torch.ops._rotary_d5e8892
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
-    return f"_rotary_d5e8892::{op_name}"

 import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
 def add_op_namespace_prefix(op_name: str):
     """
     Prefix op by namespace.
     """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch28-cxx11-cu129-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88098a942da850ef34bc5d4b2f810d9c3092718c134fba911161a04eba73c559
+size 10586840

build/torch28-cxx11-cu129-x86_64-linux/rotary/_rotary_d5e8892.abi3.so DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dd62535c2713d05e74f4b53c84564caeeba51aaf06f5fe59a3182b04a5ae3c5a
-size 10169280

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__init__.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from typing import Tuple
+import torch
+from ._ops import ops
+def apply_rotary(
+    x1: torch.Tensor,
+    x2: torch.Tensor,
+    cos: torch.Tensor,
+    sin: torch.Tensor,
+    out1: torch.Tensor,
+    out2: torch.Tensor,
+    conj: bool,
+) -> None:
+    """Forward all arguments, unchanged and in order, to the compiled ``apply_rotary`` op.
+
+    The op's result is discarded, so this wrapper returns ``None``.
+    NOTE(review): ``out1`` and ``out2`` are presumably filled in place by the
+    kernel -- confirm against the op registration.
+    """
+    ops.apply_rotary(x1, x2, cos, sin, out1, out2, conj)
+__all__ = ["apply_rotary"]

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (816 Bytes). View file

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/__init__.cpython-313.pyc ADDED Viewed

Binary file (846 Bytes). View file

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/_ops.cpython-311.pyc ADDED Viewed

Binary file (558 Bytes). View file

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/__pycache__/_ops.cpython-313.pyc ADDED Viewed

Binary file (560 Bytes). View file

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/_ops.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import torch
+from . import _rotary_cd1413b_dirty
+ops = torch.ops._rotary_cd1413b_dirty
+def add_op_namespace_prefix(op_name: str) -> str:
+    """
+    Prefix op by namespace.
+
+    Returns ``op_name`` qualified as ``"_rotary_cd1413b_dirty::<op_name>"``,
+    i.e. with the same namespace name that ``ops`` is looked up under.
+    """
+    return f"_rotary_cd1413b_dirty::{op_name}"

build/torch28-cxx11-xpu20251-x86_64-linux/rotary/_rotary_cd1413b_dirty.abi3.so ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:12dba600201d5c2bd5cd123afc3b65f835f3698a89b75e61c85ee3f359f2e901
+size 2239816

flake.lock CHANGED Viewed

@@ -17,11 +17,11 @@
     },
     "flake-compat_2": {
       "locked": {
-        "lastModified": 1733328505,
-        "narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
         "owner": "edolstra",
         "repo": "flake-compat",
-        "rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
         "type": "github"
       },
       "original": {
@@ -73,11 +73,11 @@
         "nixpkgs": "nixpkgs"
       },
       "locked": {
-        "lastModified": 1753354560,
-        "narHash": "sha256-vmOfRmr0Qm/IbZTWB2sBn+UFrABSTTA/cTg+m27Yt/E=",
         "owner": "huggingface",
         "repo": "hf-nix",
-        "rev": "7f2aceda2a2e72cd573bdb25e5c0667fd75f89d3",
         "type": "github"
       },
       "original": {
@@ -98,33 +98,32 @@
         ]
       },
       "locked": {
-        "lastModified": 1753354632,
-        "narHash": "sha256-31SX3Raiyx0qCuY9JSlx9ZZgxljeUxvW+JdujjxbofQ=",
         "owner": "huggingface",
         "repo": "kernel-builder",
-        "rev": "524b628fd8e58525dbd28455bffb0628092c5265",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
-        "ref": "torch-2.8",
         "repo": "kernel-builder",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
-        "lastModified": 1752785354,
-        "narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       },
       "original": {
         "owner": "nixos",
         "repo": "nixpkgs",
-        "rev": "d38025438a6ee456758dc03188ca6873a415463b",
         "type": "github"
       }
     },

     },
     "flake-compat_2": {
       "locked": {
+        "lastModified": 1747046372,
+        "narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
         "owner": "edolstra",
         "repo": "flake-compat",
+        "rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
         "type": "github"
       },
       "original": {
         "nixpkgs": "nixpkgs"
       },
       "locked": {
+        "lastModified": 1757493151,
+        "narHash": "sha256-eirWlcvs2rjZmU8JcF4CKN1IEnNfpQnGuf2qbK3IQh8=",
         "owner": "huggingface",
         "repo": "hf-nix",
+        "rev": "503cd4eb9866103c983dbef93d9ad5db4fb6b415",
         "type": "github"
       },
       "original": {
         ]
       },
       "locked": {
+        "lastModified": 1757570810,
+        "narHash": "sha256-YFWQwy2LKbhjdLW8wkyNkE/+Vbdn6qlJif2CKvBT9Qo=",
         "owner": "huggingface",
         "repo": "kernel-builder",
+        "rev": "1201847af3ff757b65015c6e06b5bd75896d2d4b",
         "type": "github"
       },
       "original": {
         "owner": "huggingface",
         "repo": "kernel-builder",
         "type": "github"
       }
     },
     "nixpkgs": {
       "locked": {
+        "lastModified": 1755963616,
+        "narHash": "sha256-6yD0ww/S8n+U2uPYcJZ3DRURP8Kx036GRpR2uPNZroE=",
         "owner": "nixos",
         "repo": "nixpkgs",
+        "rev": "73e96df7cff5783f45e21342a75a1540c4eddce4",
         "type": "github"
       },
       "original": {
         "owner": "nixos",
+        "ref": "nixos-unstable-small",
         "repo": "nixpkgs",
         "type": "github"
       }
     },

flake.nix CHANGED Viewed

@@ -1,15 +1,9 @@
 {
-  description = "Flake for rotary kernel";
   inputs = {
-    kernel-builder.url = "github:huggingface/kernel-builder/torch-2.8";
   };
-  outputs =
-    {
-      self,
-      kernel-builder,
-    }:
     kernel-builder.lib.genFlakeOutputs {
       path = ./.;
       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;

 {
+  description = "Flake for Torch kernel extension";
   inputs = {
+    kernel-builder.url = "github:huggingface/kernel-builder";
   };
+  outputs = { self, kernel-builder, }:
     kernel-builder.lib.genFlakeOutputs {
       path = ./.;
       rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;

rotary-xpu/rotary_xpu.cpp ADDED Viewed

	@@ -0,0 +1,40 @@

+#include <torch/all.h>
+#include "rotary_xpu.hpp"
+// Element-wise rotary rotation on XPU, written into out1 / out2.
+//
+//   conj == false:  out1 = x1*cos - x2*sin,   out2 =  x1*sin + x2*cos
+//   conj == true :  out1 = x1*cos + x2*sin,   out2 = -x1*sin + x2*cos
+//
+// Each product is evaluated after converting the operands to float, and the
+// result is converted back to the dispatched scalar type (x1's dtype).
+//
+// The iterator is configured with dtype checking and promotion disabled, and
+// the kernel in rotary_xpu.hpp loads/stores every operand as x1's scalar type
+// without casting. A tensor of any other dtype would therefore be silently
+// reinterpreted, so all six tensors are required to share x1's dtype.
+void _apply_rotary(torch::Tensor const &x1, torch::Tensor const &x2,
+                       torch::Tensor const &cos, torch::Tensor const &sin,
+                       torch::Tensor &out1, torch::Tensor &out2,
+                       bool const conj) {
+    const auto dtype = x1.scalar_type();
+    TORCH_CHECK(
+        x2.scalar_type() == dtype && cos.scalar_type() == dtype &&
+            sin.scalar_type() == dtype && out1.scalar_type() == dtype &&
+            out2.scalar_type() == dtype,
+        "apply_rotary: x1, x2, cos, sin, out1 and out2 must all have the same dtype");
+    // Operand order matters: outputs first, then inputs in the order of the
+    // kernel lambda's parameters.
+    auto iter = at::TensorIteratorConfig()
+        .add_output(out1)
+        .add_output(out2)
+        .add_input(x1)
+        .add_input(x2)
+        .add_input(cos)
+        .add_input(sin)
+        .check_all_same_dtype(false)
+        .promote_inputs_to_common_dtype(false)
+        .build();
+    if (!conj) {
+        AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, dtype, "rotary_kernel_xpu", [&] {
+            gpu_kernel_multiple_outputs(
+                iter, [] (scalar_t x1, scalar_t x2, scalar_t cos,
+                                    scalar_t sin) -> std::tuple<scalar_t, scalar_t> {
+                scalar_t out1 = float(x1) * float(cos) - float(x2) * float(sin);
+                scalar_t out2 = float(x1) * float(sin) + float(x2) * float(cos);
+                return {out1, out2};
+            });
+        });
+    } else {
+        // Conjugate variant: the sign of every sin term is flipped.
+        AT_DISPATCH_FLOATING_TYPES_AND2(at::kBFloat16, at::kHalf, dtype, "rotary_kernel_xpu", [&] {
+            gpu_kernel_multiple_outputs(
+                iter, [] (scalar_t x1, scalar_t x2, scalar_t cos,
+                                    scalar_t sin) -> std::tuple<scalar_t, scalar_t> {
+                scalar_t out1 = float(x1) * float(cos) + float(x2) * float(sin);
+                scalar_t out2 = -float(x1) * float(sin) + float(x2) * float(cos);
+                return {out1, out2};
+            });
+        });
+    }
+}

rotary-xpu/rotary_xpu.hpp ADDED Viewed

	@@ -0,0 +1,375 @@

+#include <ATen/core/TensorBody.h>
+#include <ATen/detail/FunctionTraits.h>
+#include <ATen/native/TensorIterator.h>
+#include <sycl/sycl.hpp>
+#include <ATen/core/Array.h>
+#include <c10/macros/Macros.h>
+#include <c10/util/Exception.h>
+#include <c10/util/TypeCast.h>
+#include <cstdint>
+#include <type_traits>
+#include <array>
+#include <c10/core/ScalarType.h>
+#include <c10/xpu/XPUStream.h>
+#include <ATen/xpu/XPUContext.h>
+constexpr int MAX_DIMS = 12;  // Largest dimension count OffsetCalculator accepts; also the extent of its per-dimension size/stride tables.
+// Loader policy: views the buffer as scalar_t and reads the element at
+// `offset` through c10::load, with no dtype conversion. `arg` is unused.
+struct LoadWithoutCast {
+  template <typename scalar_t>
+  C10_DEVICE scalar_t load(char* base_ptr, uint32_t offset, int arg) {
+    auto* typed_base = reinterpret_cast<scalar_t*>(base_ptr);
+    return c10::load(typed_base + offset);
+  }
+};
+// Store policy: views the buffer as scalar_t and writes `value` at element
+// `offset`, with no dtype conversion. `arg` is unused.
+struct StoreWithoutCast {
+  template <typename scalar_t>
+  C10_DEVICE void store(scalar_t value, char* base_ptr, uint32_t offset, int arg = 0) {
+    auto* typed_base = reinterpret_cast<scalar_t*>(base_ptr);
+    typed_base[offset] = value;
+  }
+};
+// Compile-time loop: calls func<k>::apply(...) for k = current, current + 1,
+// ..., end - 1, in that order. The first call receives the arguments
+// forwarded; the recursion then passes them on as lvalues.
+template <template <int i> typename func, int end, int current = 0>
+struct static_unroll {
+  template <typename... Ts>
+  static inline C10_HOST_DEVICE void with_args(Ts&&... params) {
+    func<current>::apply(std::forward<Ts>(params)...);
+    using next_step = static_unroll<func, end, current + 1>;
+    next_step::with_args(params...);
+  }
+};
+// Terminating case (current == end): nothing left to invoke.
+template <template <int i> typename func, int end>
+struct static_unroll<func, end, end> {
+  template <typename... Ts>
+  static inline C10_HOST_DEVICE void with_args(Ts... params) {}
+};
+// Writes element `current` of the result tuple to output tensor `current`:
+// data[current] is viewed as that element's type and indexed by
+// offsets[current].
+template <int current>
+struct multi_outputs_store_helper {
+  template <int ntensors, int num_outputs, typename... Args>
+  static C10_HOST_DEVICE void apply(
+      at::detail::Array<char*, ntensors> data,
+      at::detail::Array<uint32_t, num_outputs> offsets,
+      std::tuple<Args...> ret) {
+    using result_tuple = std::tuple<Args...>;
+    using elem_t = std::tuple_element_t<current, result_tuple>;
+    elem_t* base = reinterpret_cast<elem_t*>(data[current]);
+    base[offsets[current]] = std::get<current>(ret);
+  }
+};
+// Loads input number `arg_index` for work element `j` into args[j]. Inputs
+// sit after the outputs in self.data, hence the `+ num_outputs` shift.
+template <int arg_index>
+struct unroll_load_helper {
+  template <typename args_t, typename policy_t, typename offset_t, typename loader_t>
+  static C10_DEVICE void apply(
+      policy_t& self,
+      args_t* args,
+      offset_t offset,
+      loader_t loader,
+      int j,
+      int num_outputs) {
+    using arg_t = std::tuple_element_t<arg_index, args_t>;
+    char* src = self.data[arg_index + num_outputs];
+    auto& slot = std::get<arg_index>(args[j]);
+    slot = loader.template load<arg_t>(src, offset[arg_index], arg_index);
+  }
+};
+// Load/store policy for one work-item of a multi-output element-wise kernel.
+// A work-item handles up to item_work_size elements of its group's chunk,
+// spaced num_items_per_group apart; `remaining` is the number of elements
+// still left from the start of that chunk.
+template <int item_work_size, typename data_t, typename inp_calc_t, typename out_calc_t, int num_outputs>
+struct multi_outputs_unroll {
+  data_t data;
+  int remaining;
+  inp_calc_t input_offset_calculator;
+  out_calc_t output_offset_calculator;
+  LoadWithoutCast loader;
+  StoreWithoutCast storer;
+  int item_idx;
+  int group_idx;
+  int num_items_per_group;
+  int group_work_size;
+  multi_outputs_unroll(
+      data_t data,
+      int remaining,
+      inp_calc_t ic,
+      out_calc_t oc,
+      int item_idx,
+      int group_idx,
+      int num_items_per_group)
+      : data(data),
+        remaining(remaining),
+        input_offset_calculator(ic),
+        output_offset_calculator(oc),
+        item_idx(item_idx),
+        group_idx(group_idx),
+        num_items_per_group(num_items_per_group),
+        group_work_size(item_work_size * num_items_per_group) {}
+  // True when this work-item's item_work_elem-th element lies inside the
+  // remaining range.
+  inline bool check_inbounds(int item_work_elem) const {
+    const int elem_idx = item_idx + item_work_elem * num_items_per_group;
+    return elem_idx < remaining;
+  }
+  // Fills args[0..] with the inputs of each in-range element and stops at
+  // the first element that falls outside the remaining range.
+  template <typename args_t>
+  inline void load(args_t* args) {
+    constexpr int arity = std::tuple_size<args_t>::value;
+    const int group_base = group_work_size * group_idx;
+    int cursor = item_idx;
+#pragma unroll
+    for (int slot = 0; slot < item_work_size; slot++) {
+      if (cursor >= remaining) {
+        return;
+      }
+      const int linear_idx = group_base + cursor;
+      auto offset = input_offset_calculator.get(linear_idx);
+      static_unroll<unroll_load_helper, arity>::with_args(
+          *this, args, offset, loader, slot, num_outputs);
+      cursor += num_items_per_group;
+    }
+  }
+  // Writes from[0..] to the outputs for each in-range element, with the same
+  // traversal and early exit as load().
+  template <typename return_t>
+  inline void store(return_t* from) {
+    const int group_base = group_work_size * group_idx;
+    int cursor = item_idx;
+#pragma unroll
+    for (int slot = 0; slot < item_work_size; slot++) {
+      if (cursor >= remaining) {
+        return;
+      }
+      const int linear_idx = group_base + cursor;
+      auto offsets = output_offset_calculator.get(linear_idx);
+      static_unroll<multi_outputs_store_helper, num_outputs>::with_args(data, offsets, from[slot]);
+      cursor += num_items_per_group;
+    }
+  }
+};
+// Per-work-item body: load up to item_work_size argument tuples through the
+// policy, apply `f` to each in-bounds one, then store the results through
+// the policy.
+template <int item_work_size, typename func_t, typename policy_t>
+inline void elementwise_kernel_helper(func_t f, policy_t policy) {
+  using traits = function_traits<func_t>;
+  using return_t = typename traits::result_type;
+  using args_t = typename traits::ArgsTuple;
+  args_t inputs[item_work_size];
+  return_t outputs[item_work_size];
+  policy.load(inputs);
+#pragma unroll
+  for (int k = 0; k < item_work_size; ++k) {
+    if (!policy.check_inbounds(k)) {
+      continue;
+    }
+    outputs[k] = std::apply(f, inputs[k]);
+  }
+  policy.store(outputs);
+}
+// SYCL kernel functor for element-wise ops with several outputs. Each
+// work-group covers item_work_size * local_range consecutive elements.
+template <int num_outputs, typename func_t, typename array_t, typename in_calc_t, typename out_calc_t>
+struct UnrolledElementwiseForMultiOutputsKernel {
+  static constexpr int item_work_size = 4;
+  void operator()(sycl::nd_item<1> item_id) const {
+    const int group_size = item_id.get_local_range(0);
+    const int group_id = item_id.get_group(0);
+    const int local_id = item_id.get_local_id(0);
+    // Elements left from the start of this group's chunk to the end.
+    const int left = numel_ - item_work_size * group_size * group_id;
+    using policy_t = multi_outputs_unroll<item_work_size, array_t, in_calc_t, out_calc_t, num_outputs>;
+    policy_t policy(data_, left, ic_, oc_, local_id, group_id, group_size);
+    elementwise_kernel_helper<item_work_size>(f_, policy);
+  };
+  UnrolledElementwiseForMultiOutputsKernel(int numel, func_t f, array_t data, in_calc_t ic, out_calc_t oc)
+      : numel_(numel), f_(f), data_(data), ic_(ic), oc_(oc) {}
+ private:
+  int numel_;
+  func_t f_;
+  array_t data_;
+  in_calc_t ic_;
+  out_calc_t oc_;
+};
+// Thin wrapper around a fixed divisor, offering quotient, remainder, or both
+// at once. Uses the built-in / and % operators.
+template <typename Value>
+struct IntDivider {
+  IntDivider() = default;
+  IntDivider(Value d) : divisor(d) {}
+  C10_HOST_DEVICE inline Value div(Value n) const {
+    return n / divisor;
+  }
+  C10_HOST_DEVICE inline Value mod(Value n) const {
+    return n % divisor;
+  }
+  // Returns {quotient, remainder} as a std::pair.
+  C10_HOST_DEVICE inline auto divmod(Value n) const {
+    const auto quotient = n / divisor;
+    const auto remainder = n % divisor;
+    return std::make_pair(quotient, remainder);
+  }
+  Value divisor;
+};
+// Converts a linear element index into one offset per operand, from the
+// sizes and strides captured at construction. Handles at most MAX_DIMS
+// dimensions.
+template <int NARGS, typename index_t = uint32_t, bool signed_strides = false>
+struct OffsetCalculator {
+  using stride_t = std::conditional_t<signed_strides, std::make_signed_t<index_t>, index_t>;
+  using offset_type = at::detail::Array<stride_t, std::max<int>(NARGS, 1)>;
+  // strides[arg][dim] is divided by element_sizes[arg] when element_sizes is
+  // supplied; otherwise it is stored unchanged (divided by 1).
+  OffsetCalculator(int dims, const int64_t* sizes, const int64_t* const* strides, const int64_t* element_sizes = nullptr)
+      : dims(dims) {
+    TORCH_CHECK(dims <= MAX_DIMS, "tensor has too many (>", MAX_DIMS, ") dims");
+    const bool has_elem_sizes = (element_sizes != nullptr);
+    for (int d = 0; d < dims; ++d) {
+      sizes_[d] = IntDivider<index_t>(sizes[d]);
+      for (int a = 0; a < NARGS; ++a) {
+        const int64_t elem_size = has_elem_sizes ? element_sizes[a] : 1LL;
+        strides_[d][a] = strides[a][d] / elem_size;
+      }
+    }
+  }
+  // Peels one dimension at a time starting from dimension 0: the remainder
+  // is the coordinate in that dimension, the quotient carries on to the next.
+  C10_HOST_DEVICE offset_type get(index_t linear_idx) const {
+    offset_type result;
+#pragma unroll
+    for (int a = 0; a < NARGS; ++a) {
+      result[a] = 0;
+    }
+#pragma unroll
+    for (int d = 0; d < MAX_DIMS; ++d) {
+      if (d == dims) {
+        break;
+      }
+      const auto quot_rem = sizes_[d].divmod(linear_idx);
+      linear_idx = quot_rem.first;
+#pragma unroll
+      for (int a = 0; a < NARGS; ++a) {
+        result[a] += quot_rem.second * strides_[d][a];
+      }
+    }
+    return result;
+  }
+  int dims;
+  IntDivider<index_t> sizes_[MAX_DIMS];
+  stride_t strides_[MAX_DIMS][std::max<int>(NARGS, 1)];
+};
+// Builds an OffsetCalculator over the iterator's N input operands, i.e. the
+// operands that follow the outputs.
+template <int N>
+static OffsetCalculator<N> make_input_offset_calculator(const at::TensorIteratorBase& iter) {
+  constexpr int array_size = std::max<int>(N, 1);
+  TORCH_INTERNAL_ASSERT(N == iter.ntensors() - iter.noutputs());
+  std::array<const int64_t*, array_size> stride_ptrs;
+  int64_t elem_sizes[array_size];
+  for (int k = 0; k < N; ++k) {
+    const auto operand = k + iter.noutputs();
+    stride_ptrs[k] = iter.strides(operand).data();
+    elem_sizes[k] = iter.element_size(operand);
+  }
+  return OffsetCalculator<N>(iter.ndim(), iter.shape().data(), stride_ptrs.data(), elem_sizes);
+}
+// Builds an OffsetCalculator over the iterator's output operands, which
+// occupy operand indices [0, num_outputs).
+template <int num_outputs = 1>
+static OffsetCalculator<num_outputs> make_output_offset_calculator(const at::TensorIteratorBase& iter) {
+  TORCH_INTERNAL_ASSERT(num_outputs == iter.noutputs());
+  std::array<const int64_t*, num_outputs> stride_ptrs;
+  int64_t elem_sizes[num_outputs];
+  for (int k = 0; k < num_outputs; ++k) {
+    stride_ptrs[k] = iter.strides(k).data();
+    elem_sizes[k] = iter.element_size(k);
+  }
+  return OffsetCalculator<num_outputs>(iter.ndim(), iter.shape().data(), stride_ptrs.data(), elem_sizes);
+}
+// Returns (first reported sub-group size) * (EU count per subslice) for the
+// given device; defaults to the device of the current XPU stream.
+static inline int64_t syclMaxWorkItemsPerSubSlice(at::DeviceIndex dev_id = c10::xpu::getCurrentXPUStream().device_index()) {
+  auto* props = at::xpu::getDeviceProperties(dev_id);
+  const int64_t simd_width = props->sub_group_sizes[0];
+  const int64_t eu_per_subslice = props->gpu_eu_count_per_subslice;
+  return simd_width * eu_per_subslice;
+}
+// Division rounded up, computed as (dividend + divisor - 1) / divisor.
+template<class T>
+T ceil_div(T dividend, T divisor) {
+    // `auto` keeps the promoted type of the sum, so the division is carried
+    // out in that type before the result is converted to T.
+    const auto biased = dividend + divisor - 1;
+    return biased / divisor;
+}
+// Enqueues `ker` on `q` over a 1-D nd_range with the given global and local
+// sizes.
+template <typename ker_t>
+static inline void sycl_kernel_submit(int64_t global_range, int64_t local_range, ::sycl::queue q, ker_t ker) {
+  const sycl::range<1> global_size(global_range);
+  const sycl::range<1> local_size(local_range);
+  q.parallel_for(sycl::nd_range<1>(global_size, local_size), ker);
+}
+// Launches the unrolled multi-output kernel over N elements on the current
+// XPU stream's queue. N must be positive and fit in int32.
+template <int num_outputs, typename func_t, typename array_t, typename in_calc_t, typename out_calc_t>
+static inline void launch_unrolled_kernel_for_multi_outputs(
+    int64_t N,
+    const func_t& f,
+    array_t data,
+    in_calc_t ic,
+    out_calc_t oc) {
+  TORCH_INTERNAL_ASSERT(N > 0 && N <= std::numeric_limits<int32_t>::max());
+  using ker_t = UnrolledElementwiseForMultiOutputsKernel<num_outputs, func_t, array_t, in_calc_t, out_calc_t>;
+  ker_t kernel(N, f, data, ic, oc);
+  const int wg_size = syclMaxWorkItemsPerSubSlice();
+  // Enough work-groups so that item_work_size elements per work-item cover N.
+  const int elems_per_group = ker_t::item_work_size * wg_size;
+  const int num_groups = ceil_div<int>(N, elems_per_group);
+  sycl_kernel_submit(wg_size * num_groups, wg_size, c10::xpu::getCurrentXPUStream().queue(), kernel);
+}
+// Offset calculator that gives every one of the N operands the linear index
+// itself as its offset.
+template <int N>
+struct TrivialOffsetCalculator {
+  using offset_type = at::detail::Array<uint32_t, std::max<int>(N, 1)>;
+  C10_HOST_DEVICE offset_type get(uint32_t linear_idx) const {
+    offset_type result;
+#pragma unroll
+    for (int k = 0; k < N; ++k) {
+      result[k] = linear_idx;
+    }
+    return result;
+  }
+};
+// Gathers the iterator's data pointers and launches the multi-output kernel,
+// choosing trivial offsets when the iterator is contiguous and stride-based
+// offsets otherwise. Requires an iterator that can use 32-bit indexing.
+template <typename func_t>
+void gpu_kernel_multiple_outputs_impl(at::TensorIteratorBase& iter, const func_t& f) {
+  using traits = function_traits<func_t>;
+  using output_t = typename traits::result_type;
+  constexpr int num_outputs = std::tuple_size<output_t>::value;
+  constexpr int num_inputs = traits::arity;
+  constexpr int ntensors = num_outputs + num_inputs;
+  TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing());
+  TORCH_INTERNAL_ASSERT(iter.ntensors() == ntensors);
+  at::detail::Array<char*, ntensors> ptrs;
+  for (int t = 0; t < ntensors; ++t) {
+    ptrs[t] = static_cast<char*>(iter.data_ptr(t));
+  }
+  const int64_t total = iter.numel();
+  if (!iter.is_contiguous()) {
+    auto in_offsets = make_input_offset_calculator<num_inputs>(iter);
+    auto out_offsets = make_output_offset_calculator<num_outputs>(iter);
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(total, f, ptrs, in_offsets, out_offsets);
+  } else {
+    auto in_offsets = TrivialOffsetCalculator<num_inputs>();
+    auto out_offsets = TrivialOffsetCalculator<num_outputs>();
+    launch_unrolled_kernel_for_multi_outputs<num_outputs>(total, f, ptrs, in_offsets, out_offsets);
+  }
+}
+// Entry point for element-wise kernels whose functor returns a tuple of
+// outputs. Asserts that every operand is on XPU, does nothing for an empty
+// iterator, and recurses over 32-bit-indexable sub-iterators when the
+// iterator itself cannot use 32-bit indexing.
+template <typename func_t>
+void gpu_kernel_multiple_outputs(at::TensorIteratorBase& iter, const func_t& f) {
+  for (int arg = 0; arg < iter.ntensors(); arg++) {
+    TORCH_INTERNAL_ASSERT(iter.device(arg).is_xpu());
+  }
+  if (iter.numel() == 0) {
+    return;
+  }
+  if (iter.can_use_32bit_indexing()) {
+    gpu_kernel_multiple_outputs_impl(iter, f);
+    return;
+  }
+  for (auto& piece : iter.with_32bit_indexing()) {
+    gpu_kernel_multiple_outputs(piece, f);
+  }
+}

tests/__init__.py ADDED Viewed

File without changes