#include #include #include #include #include #include #include #include #include #include #include #include #include #include constexpr int MAX_DIMS = 12; struct LoadWithoutCast { template C10_DEVICE scalar_t load(char* base_ptr, uint32_t offset, int arg) { return c10::load(reinterpret_cast(base_ptr) + offset); } }; struct StoreWithoutCast { template C10_DEVICE void store(scalar_t value, char* base_ptr, uint32_t offset, int arg = 0) { *(reinterpret_cast(base_ptr) + offset) = value; } }; template