Kernels
Eric Buehler committed
Commit e7707ac
1 Parent(s): bc6a74d

Better testing

Files changed (1)
  1. tests/test_flash_attention.py +135 -798
tests/test_flash_attention.py CHANGED
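For orientation (an illustrative sketch, not part of the diff): the tests below pack variable-length sequences into one tensor and mark their boundaries with cumulative sequence lengths. The sketch assumes create_cu_seqlens builds a running prefix sum over the per-sequence lengths; only its return line appears in the diff, and an MPS device is required, as in the tests themselves.

# Illustrative sketch, not part of the commit. Assumes create_cu_seqlens builds a
# running prefix sum over the per-sequence lengths (only its return line is shown
# in the diff). Requires an MPS device, as do the tests themselves.
import torch

def create_cu_seqlens(seq_lengths):
    cu_seqlens = [0]
    for length in seq_lengths:
        cu_seqlens.append(cu_seqlens[-1] + length)
    return torch.tensor(cu_seqlens, dtype=torch.int32, device="mps")

seq_lengths = [8, 16]                 # two sequences packed into one batch
num_heads, head_dim = 4, 64
total_tokens = sum(seq_lengths)       # 24 rows in the packed tensors
query = torch.randn(total_tokens, num_heads, head_dim, device="mps")
cu_seqlens = create_cu_seqlens(seq_lengths)   # tensor([0, 8, 24], dtype=torch.int32)
# Sequence i occupies rows cu_seqlens[i]:cu_seqlens[i+1] of query/key/value/out.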
@@ -11,76 +11,73 @@ def create_cu_seqlens(seq_lengths):
11
  return torch.tensor(cu_seqlens, dtype=torch.int32, device="mps")
12
 
13
 
14
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
15
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
16
- def test_flash_attention_single_sequence(dtype, head_dim):
17
- """Test Flash Attention with a single sequence."""
18
- torch.manual_seed(42)
19
-
20
- # Single sequence
21
- seq_len = 32
22
- num_heads = 4
23
-
24
- # Create cumulative sequence lengths
25
- cu_seqlens = create_cu_seqlens([seq_len])
26
-
27
- # Create input tensors in Flash Attention format
28
- query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
29
- key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
30
- value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
31
-
32
- # Scale factor
33
- scale = 1.0 / (head_dim ** 0.5)
34
-
35
- # Call Flash Attention
36
- out = torch.empty_like(query)
37
- metal_flash_sdpa.flash_attention_varlen(
38
- out=out,
39
- query=query,
40
- key=key,
41
- value=value,
42
- cu_seqlens_q=cu_seqlens,
43
- cu_seqlens_k=cu_seqlens,
44
- max_seqlen_q=seq_len,
45
- max_seqlen_k=seq_len,
46
- do_causal=False,
47
- scale=scale,
48
- softcapping=1.0,
49
- )
50
 
51
- # Compute ground truth
52
- # Flash Attention computes attention separately for each head
53
- expected = torch.zeros_like(out)
54
  for h in range(num_heads):
55
- q_h = query[:, h, :] # [seq_len, head_dim]
56
- k_h = key[:, h, :]
57
- v_h = value[:, h, :]
 
58
 
59
  scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
60
  attn_weights = torch.softmax(scores, dim=-1)
61
  expected[:, h, :] = torch.matmul(attn_weights, v_h)
62
 
63
- # Check results (higher tolerance for bfloat16 and float16)
64
  if dtype == torch.bfloat16:
65
- # Higher tolerance for head_dim=128 with bfloat16
66
- rtol, atol = (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
67
  elif dtype == torch.float16:
68
- rtol, atol = 2e-3, 2e-3
69
  else:
70
- rtol, atol = 1e-3, 1e-3
71
- torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
72
 
73
 
74
  @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
75
  @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
76
- def test_flash_attention_variable_lengths(dtype, head_dim):
77
- """Test Flash Attention with variable-length sequences."""
78
  torch.manual_seed(42)
79
 
80
- # Variable sequence lengths
81
- seq_lengths_q = [8, 16, 12]
82
- seq_lengths_k = [10, 20, 15]
83
- batch_size = len(seq_lengths_q)
84
  num_heads = 4
85
 
86
  # Create cumulative sequence lengths
@@ -111,128 +108,48 @@ def test_flash_attention_variable_lengths(dtype, head_dim):
111
  cu_seqlens_k=cu_seqlens_k,
112
  max_seqlen_q=max_seqlen_q,
113
  max_seqlen_k=max_seqlen_k,
114
- do_causal=False,
115
  scale=scale,
116
  softcapping=1.0,
117
  )
118
 
119
  # Compute ground truth for each sequence
120
  expected = torch.zeros_like(out)
121
  for i in range(batch_size):
122
  q_start, q_end = cu_seqlens_q[i].item(), cu_seqlens_q[i+1].item()
123
  k_start, k_end = cu_seqlens_k[i].item(), cu_seqlens_k[i+1].item()
124
 
125
- q_i = query[q_start:q_end]
126
- k_i = key[k_start:k_end]
127
- v_i = value[k_start:k_end]
128
-
129
- # Compute attention for each head separately
130
- for h in range(num_heads):
131
- q_h = q_i[:, h, :] # [seq_len, head_dim]
132
- k_h = k_i[:, h, :]
133
- v_h = v_i[:, h, :]
134
-
135
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
136
- attn_weights = torch.softmax(scores, dim=-1)
137
- expected[q_start:q_end, h, :] = torch.matmul(attn_weights, v_h)
138
-
139
- # Check results (higher tolerance for bfloat16 and float16)
140
- if dtype == torch.bfloat16:
141
- # Higher tolerance for bfloat16 with variable length sequences
142
- rtol, atol = 2e-2, 2e-2
143
- elif dtype == torch.float16:
144
- rtol, atol = 2e-3, 2e-3
145
- else:
146
- rtol, atol = 1e-3, 1e-3
147
- torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
148
-
149
-
150
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
151
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
152
- def test_flash_attention_causal(dtype, head_dim):
153
- """Test Flash Attention with causal masking."""
154
- torch.manual_seed(42)
155
-
156
- # Test dimensions
157
- seq_lengths = [16, 24]
158
- batch_size = len(seq_lengths)
159
- num_heads = 4
160
-
161
- # Create cumulative sequence lengths
162
- cu_seqlens = create_cu_seqlens(seq_lengths)
163
- total_tokens = sum(seq_lengths)
164
- max_seqlen = max(seq_lengths)
165
-
166
- # Create input tensors
167
- query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
168
- key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
169
- value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
170
-
171
- # Scale factor
172
- scale = 1.0 / (head_dim ** 0.5)
173
-
174
- # Call Flash Attention with causal mask
175
- out = torch.empty_like(query)
176
- metal_flash_sdpa.flash_attention_varlen(
177
- out=out,
178
- query=query,
179
- key=key,
180
- value=value,
181
- cu_seqlens_q=cu_seqlens,
182
- cu_seqlens_k=cu_seqlens,
183
- max_seqlen_q=max_seqlen,
184
- max_seqlen_k=max_seqlen,
185
- do_causal=True,
186
- scale=scale,
187
- softcapping=1.0,
188
- )
189
-
190
- # Compute ground truth with causal mask
191
- expected = torch.zeros_like(out)
192
- for i in range(batch_size):
193
- start, end = cu_seqlens[i].item(), cu_seqlens[i+1].item()
194
- seq_len = end - start
195
-
196
- q_i = query[start:end]
197
- k_i = key[start:end]
198
- v_i = value[start:end]
199
-
200
- # Compute attention for each head separately
201
- for h in range(num_heads):
202
- q_h = q_i[:, h, :] # [seq_len, head_dim]
203
- k_h = k_i[:, h, :]
204
- v_h = v_i[:, h, :]
205
-
206
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
207
-
208
- # Apply causal mask
209
- causal_mask = torch.triu(torch.ones(seq_len, seq_len, device="mps"), diagonal=1).bool()
210
- scores.masked_fill_(causal_mask, float("-inf"))
211
 
212
- attn_weights = torch.softmax(scores, dim=-1)
213
- expected[start:end, h, :] = torch.matmul(attn_weights, v_h)
214
 
215
- # Check results (higher tolerance for bfloat16 and float16)
216
- if dtype == torch.bfloat16:
217
- # Higher tolerance for head_dim=128 with bfloat16
218
- rtol, atol = (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
219
- elif dtype == torch.float16:
220
- rtol, atol = 2e-3, 2e-3
221
- else:
222
- rtol, atol = 1e-3, 1e-3
223
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
224
 
225
 
226
  @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
227
  @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
228
- def test_flash_attention_gqa(dtype, head_dim):
229
- """Test Flash Attention with Grouped Query Attention."""
230
  torch.manual_seed(42)
231
 
232
- # Test dimensions
233
- seq_len = 32
234
- num_heads = 8
235
- num_kv_heads = 2 # GQA with 4:1 ratio
236
 
237
  # Create cumulative sequence lengths
238
  cu_seqlens = create_cu_seqlens([seq_len])
@@ -262,81 +179,28 @@ def test_flash_attention_gqa(dtype, head_dim):
262
  )
263
 
264
  # Compute ground truth with GQA
265
- # Each query head attends to its corresponding kv head (with repetition)
266
- expected = torch.zeros_like(query)
267
- gqa_factor = num_heads // num_kv_heads
268
-
269
- for h in range(num_heads):
270
- kv_h = h // gqa_factor
271
- q_h = query[:, h, :] # [seq_len, head_dim]
272
- k_h = key[:, kv_h, :]
273
- v_h = value[:, kv_h, :]
274
-
275
- scores = torch.matmul(q_h, k_h.transpose(-2, -1)) * scale
276
- attn_weights = torch.softmax(scores, dim=-1)
277
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
278
 
279
- # Check results (higher tolerance for bfloat16 and float16)
280
- if dtype == torch.bfloat16:
281
- # Higher tolerance for head_dim=128 with bfloat16
282
- rtol, atol = (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
283
- elif dtype == torch.float16:
284
- rtol, atol = 2e-3, 2e-3
285
- else:
286
- rtol, atol = 1e-3, 1e-3
287
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
288
 
289
 
290
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
291
- def test_flash_attention_head_dimensions(head_dim):
292
- """Test Flash Attention with different supported head dimensions."""
293
- torch.manual_seed(42)
294
-
295
- # Test dimensions
296
- seq_len = 16
297
- num_heads = 4
298
-
299
- # Create cumulative sequence lengths
300
- cu_seqlens = create_cu_seqlens([seq_len])
301
-
302
- # Create input tensors
303
- query = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
304
- key = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
305
- value = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
306
-
307
- # Scale factor
308
- scale = 1.0 / (head_dim ** 0.5)
309
-
310
- # Call Flash Attention
311
- out = torch.empty_like(query)
312
- metal_flash_sdpa.flash_attention_varlen(
313
- out=out,
314
- query=query,
315
- key=key,
316
- value=value,
317
- cu_seqlens_q=cu_seqlens,
318
- cu_seqlens_k=cu_seqlens,
319
- max_seqlen_q=seq_len,
320
- max_seqlen_k=seq_len,
321
- do_causal=False,
322
- scale=scale,
323
- softcapping=1.0,
324
- )
325
-
326
- # Basic check that output is not zeros
327
- assert out.abs().max().item() > 0
328
-
329
-
330
  @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
331
- def test_flash_attention_large_head_dim(dtype):
332
- """Test Flash Attention with head_dim=128 specifically."""
333
  torch.manual_seed(42)
334
 
335
- # Test dimensions with head_dim=128
336
- seq_lengths = [32, 64]
337
- batch_size = len(seq_lengths)
338
- num_heads = 8
339
- head_dim = 128
340
 
341
  # Create cumulative sequence lengths
342
  cu_seqlens = create_cu_seqlens(seq_lengths)
@@ -351,7 +215,7 @@ def test_flash_attention_large_head_dim(dtype):
351
  # Scale factor
352
  scale = 1.0 / (head_dim ** 0.5)
353
 
354
- # Call Flash Attention
355
  out = torch.empty_like(query)
356
  metal_flash_sdpa.flash_attention_varlen(
357
  out=out,
@@ -364,159 +228,87 @@ def test_flash_attention_large_head_dim(dtype):
364
  max_seqlen_k=max_seqlen,
365
  do_causal=False,
366
  scale=scale,
367
- softcapping=1.0,
368
  )
369
 
370
- # Compute ground truth
371
- expected = torch.zeros_like(out)
372
- for i in range(batch_size):
373
- start, end = cu_seqlens[i].item(), cu_seqlens[i+1].item()
374
-
375
- q_i = query[start:end]
376
- k_i = key[start:end]
377
- v_i = value[start:end]
378
-
379
- # Compute attention for each head separately
380
- for h in range(num_heads):
381
- q_h = q_i[:, h, :] # [seq_len, head_dim]
382
- k_h = k_i[:, h, :]
383
- v_h = v_i[:, h, :]
384
 
385
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
386
- attn_weights = torch.softmax(scores, dim=-1)
387
- expected[start:end, h, :] = torch.matmul(attn_weights, v_h)
 
388
 
389
- # Check results (higher tolerance for bfloat16 with head_dim=128)
390
  if dtype == torch.bfloat16:
391
- # bfloat16 with head_dim=128 has known precision issues
392
- rtol, atol = 2e-2, 2e-2
393
  elif dtype == torch.float16:
394
- rtol, atol = 2e-3, 2e-3
395
  else:
396
- rtol, atol = 1e-3, 1e-3
397
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
398
 
399
 
400
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
401
- def test_flash_attention_large_head_dim_causal(dtype):
402
- """Test Flash Attention with head_dim=128 and causal masking."""
403
  torch.manual_seed(42)
404
 
405
- # Test dimensions
406
- seq_len = 48
407
  num_heads = 4
408
- head_dim = 128
409
 
410
  # Create cumulative sequence lengths
411
- cu_seqlens = create_cu_seqlens([seq_len])
 
412
 
413
  # Create input tensors
414
- query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
415
- key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
416
- value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
417
 
418
  # Scale factor
419
  scale = 1.0 / (head_dim ** 0.5)
420
 
421
- # Call Flash Attention with causal mask
422
  out = torch.empty_like(query)
423
  metal_flash_sdpa.flash_attention_varlen(
424
  out=out,
425
  query=query,
426
  key=key,
427
  value=value,
428
- cu_seqlens_q=cu_seqlens,
429
- cu_seqlens_k=cu_seqlens,
430
- max_seqlen_q=seq_len,
431
- max_seqlen_k=seq_len,
432
- do_causal=True,
433
  scale=scale,
434
  softcapping=1.0,
435
  )
436
 
437
- # Compute ground truth with causal mask
438
- expected = torch.zeros_like(out)
439
-
440
- for h in range(num_heads):
441
- q_h = query[:, h, :] # [seq_len, head_dim]
442
- k_h = key[:, h, :]
443
- v_h = value[:, h, :]
444
-
445
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
446
-
447
- # Apply causal mask
448
- causal_mask = torch.triu(torch.ones(seq_len, seq_len, device="mps"), diagonal=1).bool()
449
- scores.masked_fill_(causal_mask, float("-inf"))
450
-
451
- attn_weights = torch.softmax(scores, dim=-1)
452
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
453
 
454
- # Check results (higher tolerance for bfloat16 with head_dim=128)
455
  if dtype == torch.bfloat16:
456
- # bfloat16 with head_dim=128 has known precision issues
457
- rtol, atol = 2e-2, 2e-2
458
  elif dtype == torch.float16:
459
- rtol, atol = 2e-3, 2e-3
460
  else:
461
- rtol, atol = 1e-3, 1e-3
462
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
463
 
464
 
465
- def test_flash_attention_large_head_dim_gqa():
466
- """Test Flash Attention with head_dim=128 and GQA."""
467
- torch.manual_seed(42)
468
-
469
- # Test dimensions
470
- seq_len = 32
471
- num_heads = 16
472
- num_kv_heads = 4 # GQA with 4:1 ratio
473
- head_dim = 128
474
-
475
- # Create cumulative sequence lengths
476
- cu_seqlens = create_cu_seqlens([seq_len])
477
-
478
- # Create input tensors
479
- query = torch.randn(seq_len, num_heads, head_dim, dtype=torch.float32, device="mps")
480
- key = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.float32, device="mps")
481
- value = torch.randn(seq_len, num_kv_heads, head_dim, dtype=torch.float32, device="mps")
482
-
483
- # Scale factor
484
- scale = 1.0 / (head_dim ** 0.5)
485
-
486
- # Call Flash Attention
487
- out = torch.empty_like(query)
488
- metal_flash_sdpa.flash_attention_varlen(
489
- out=out,
490
- query=query,
491
- key=key,
492
- value=value,
493
- cu_seqlens_q=cu_seqlens,
494
- cu_seqlens_k=cu_seqlens,
495
- max_seqlen_q=seq_len,
496
- max_seqlen_k=seq_len,
497
- do_causal=False,
498
- scale=scale,
499
- softcapping=1.0,
500
- )
501
-
502
- # Compute ground truth with GQA
503
- expected = torch.zeros_like(query)
504
- gqa_factor = num_heads // num_kv_heads
505
-
506
- for h in range(num_heads):
507
- kv_h = h // gqa_factor
508
- q_h = query[:, h, :] # [seq_len, head_dim]
509
- k_h = key[:, kv_h, :]
510
- v_h = value[:, kv_h, :]
511
-
512
- scores = torch.matmul(q_h, k_h.transpose(-2, -1)) * scale
513
- attn_weights = torch.softmax(scores, dim=-1)
514
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
515
-
516
- # Check results
517
- torch.testing.assert_close(out, expected, rtol=1e-3, atol=1e-3)
518
-
519
-
520
  def test_flash_attention_edge_cases():
521
  """Test Flash Attention edge cases."""
522
  torch.manual_seed(42)
@@ -596,36 +388,13 @@ def test_flash_attention_unsupported_cases():
596
  softcapping=1.0,
597
  )
598
 
599
- # Test 2: Calling function with wrong number of arguments
 
600
  query = torch.randn(16, 4, 64, device="mps")
601
  key = torch.randn(16, 4, 64, device="mps")
602
  value = torch.randn(16, 4, 64, device="mps")
603
- mask = torch.randn(1, 1, 16, 16, device="mps")
604
- cu_seqlens = create_cu_seqlens([16])
605
- out = torch.empty_like(query)
606
-
607
- # The function signature no longer accepts mask parameter
608
- with pytest.raises(TypeError):
609
- metal_flash_sdpa.flash_attention_varlen(
610
- out=out,
611
- query=query,
612
- key=key,
613
- value=value,
614
- cu_seqlens_q=cu_seqlens,
615
- cu_seqlens_k=cu_seqlens,
616
- max_seqlen_q=16,
617
- max_seqlen_k=16,
618
- mask=mask, # This parameter doesn't exist anymore
619
- do_causal=False,
620
- scale=0.125,
621
- softcapping=1.0,
622
- )
623
-
624
- # Test 3: Wrong dtype for cu_seqlens (should be int32)
625
- cu_seqlens_wrong = torch.tensor([0, 16], dtype=torch.int64, device="mps")
626
 
627
  # This will silently fail (output will be unchanged)
628
- # We can detect this by initializing output to a known value
629
  out = torch.full_like(query, -999.0)
630
  metal_flash_sdpa.flash_attention_varlen(
631
  out=out,
@@ -645,300 +414,6 @@ def test_flash_attention_unsupported_cases():
645
  assert (out == -999.0).all(), "cu_seqlens with wrong dtype should cause kernel to not run"
646
 
647
 
648
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
649
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
650
- def test_flash_attention_small_sequences(dtype, head_dim):
651
- """Test Flash Attention with small sequence lengths (2-8)."""
652
- torch.manual_seed(42)
653
-
654
- # Test different small sequence lengths
655
- for seq_len in [2, 4, 6, 8]:
656
- num_heads = 4
657
-
658
- # Create cumulative sequence lengths
659
- cu_seqlens = create_cu_seqlens([seq_len])
660
-
661
- # Create input tensors
662
- query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
663
- key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
664
- value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
665
-
666
- # Scale factor
667
- scale = 1.0 / (head_dim ** 0.5)
668
-
669
- # Call Flash Attention
670
- out = torch.empty_like(query)
671
- metal_flash_sdpa.flash_attention_varlen(
672
- out=out,
673
- query=query,
674
- key=key,
675
- value=value,
676
- cu_seqlens_q=cu_seqlens,
677
- cu_seqlens_k=cu_seqlens,
678
- max_seqlen_q=seq_len,
679
- max_seqlen_k=seq_len,
680
- do_causal=False,
681
- scale=scale,
682
- softcapping=1.0,
683
- )
684
-
685
- # Compute ground truth
686
- expected = torch.zeros_like(out)
687
- for h in range(num_heads):
688
- q_h = query[:, h, :] # [seq_len, head_dim]
689
- k_h = key[:, h, :]
690
- v_h = value[:, h, :]
691
-
692
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
693
- attn_weights = torch.softmax(scores, dim=-1)
694
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
695
-
696
- # Check results (higher tolerance for bfloat16)
697
- if dtype == torch.bfloat16:
698
- rtol, atol = 2e-2, 2e-2
699
- elif dtype == torch.float16:
700
- rtol, atol = 2e-3, 2e-3
701
- else:
702
- rtol, atol = 1e-3, 1e-3
703
- torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
704
-
705
-
706
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
707
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
708
- def test_flash_attention_cross_attention(dtype, head_dim):
709
- """Test Flash Attention with different q_seq and k_seq (cross-attention)."""
710
- torch.manual_seed(42)
711
-
712
- # Test various q_seq, k_seq combinations
713
- test_cases = [
714
- (16, 32), # q_seq < k_seq
715
- (32, 16), # q_seq > k_seq
716
- (8, 128), # large difference
717
- (1, 64), # single query token
718
- ]
719
-
720
- for q_seq, k_seq in test_cases:
721
- num_heads = 4
722
-
723
- # Create cumulative sequence lengths
724
- cu_seqlens_q = create_cu_seqlens([q_seq])
725
- cu_seqlens_k = create_cu_seqlens([k_seq])
726
-
727
- # Create input tensors
728
- query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
729
- key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
730
- value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
731
-
732
- # Scale factor
733
- scale = 1.0 / (head_dim ** 0.5)
734
-
735
- # Call Flash Attention
736
- out = torch.empty_like(query)
737
- metal_flash_sdpa.flash_attention_varlen(
738
- out=out,
739
- query=query,
740
- key=key,
741
- value=value,
742
- cu_seqlens_q=cu_seqlens_q,
743
- cu_seqlens_k=cu_seqlens_k,
744
- max_seqlen_q=q_seq,
745
- max_seqlen_k=k_seq,
746
- do_causal=False,
747
- scale=scale,
748
- softcapping=1.0,
749
- )
750
-
751
- # Compute ground truth
752
- expected = torch.zeros_like(out)
753
- for h in range(num_heads):
754
- q_h = query[:, h, :] # [q_seq, head_dim]
755
- k_h = key[:, h, :] # [k_seq, head_dim]
756
- v_h = value[:, h, :] # [k_seq, head_dim]
757
-
758
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
759
- attn_weights = torch.softmax(scores, dim=-1)
760
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
761
-
762
- # Check results (higher tolerance for bfloat16)
763
- if dtype == torch.bfloat16:
764
- rtol, atol = 2e-2, 2e-2
765
- elif dtype == torch.float16:
766
- rtol, atol = 2e-3, 2e-3
767
- else:
768
- rtol, atol = 1e-3, 1e-3
769
- torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
770
-
771
-
772
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
773
- def test_flash_attention_large_sequences(dtype):
774
- """Test Flash Attention with large k_seq (>= 1024)."""
775
- torch.manual_seed(42)
776
-
777
- # Test dimensions - large k_seq to test 2-pass algorithms
778
- q_seq = 32
779
- k_seq = 2048 # Large k_seq
780
- num_heads = 4
781
- head_dim = 64 # Use smaller head_dim to avoid memory issues
782
-
783
- # Create cumulative sequence lengths
784
- cu_seqlens_q = create_cu_seqlens([q_seq])
785
- cu_seqlens_k = create_cu_seqlens([k_seq])
786
-
787
- # Create input tensors
788
- query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
789
- key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
790
- value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
791
-
792
- # Scale factor
793
- scale = 1.0 / (head_dim ** 0.5)
794
-
795
- # Call Flash Attention
796
- out = torch.empty_like(query)
797
- metal_flash_sdpa.flash_attention_varlen(
798
- out=out,
799
- query=query,
800
- key=key,
801
- value=value,
802
- cu_seqlens_q=cu_seqlens_q,
803
- cu_seqlens_k=cu_seqlens_k,
804
- max_seqlen_q=q_seq,
805
- max_seqlen_k=k_seq,
806
- do_causal=False,
807
- scale=scale,
808
- softcapping=1.0,
809
- )
810
-
811
- # Compute ground truth
812
- expected = torch.zeros_like(out)
813
- for h in range(num_heads):
814
- q_h = query[:, h, :] # [q_seq, head_dim]
815
- k_h = key[:, h, :] # [k_seq, head_dim]
816
- v_h = value[:, h, :] # [k_seq, head_dim]
817
-
818
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
819
- attn_weights = torch.softmax(scores, dim=-1)
820
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
821
-
822
- # Check results (higher tolerance for large sequences)
823
- if dtype == torch.bfloat16:
824
- rtol, atol = 3e-2, 3e-2
825
- elif dtype == torch.float16:
826
- rtol, atol = 5e-3, 5e-3
827
- else:
828
- rtol, atol = 2e-3, 2e-3
829
- torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
830
-
831
-
832
- @pytest.mark.parametrize("gqa_ratio", [2, 4, 8])
833
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128])
834
- def test_flash_attention_gqa_ratios(gqa_ratio, head_dim):
835
- """Test Flash Attention with different GQA ratios."""
836
- torch.manual_seed(42)
837
-
838
- # Test dimensions
839
- seq_len = 32
840
- num_heads = 16
841
- num_kv_heads = num_heads // gqa_ratio
842
- dtype = torch.float32
843
-
844
- # Create cumulative sequence lengths
845
- cu_seqlens = create_cu_seqlens([seq_len])
846
-
847
- # Create input tensors
848
- query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
849
- key = torch.randn(seq_len, num_kv_heads, head_dim, dtype=dtype, device="mps")
850
- value = torch.randn(seq_len, num_kv_heads, head_dim, dtype=dtype, device="mps")
851
-
852
- # Scale factor
853
- scale = 1.0 / (head_dim ** 0.5)
854
-
855
- # Call Flash Attention
856
- out = torch.empty_like(query)
857
- metal_flash_sdpa.flash_attention_varlen(
858
- out=out,
859
- query=query,
860
- key=key,
861
- value=value,
862
- cu_seqlens_q=cu_seqlens,
863
- cu_seqlens_k=cu_seqlens,
864
- max_seqlen_q=seq_len,
865
- max_seqlen_k=seq_len,
866
- do_causal=False,
867
- scale=scale,
868
- softcapping=1.0,
869
- )
870
-
871
- # Compute ground truth with GQA
872
- expected = torch.zeros_like(query)
873
- gqa_factor = num_heads // num_kv_heads
874
-
875
- for h in range(num_heads):
876
- kv_h = h // gqa_factor
877
- q_h = query[:, h, :] # [seq_len, head_dim]
878
- k_h = key[:, kv_h, :]
879
- v_h = value[:, kv_h, :]
880
-
881
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
882
- attn_weights = torch.softmax(scores, dim=-1)
883
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
884
-
885
- # Check results
886
- torch.testing.assert_close(out, expected, rtol=1e-3, atol=1e-3)
887
-
888
-
889
- def test_flash_attention_single_query_token():
890
- """Test Flash Attention with single query token (q_seq = 1)."""
891
- torch.manual_seed(42)
892
-
893
- # Test dimensions - single query token
894
- q_seq = 1
895
- k_seq = 64
896
- num_heads = 8
897
- head_dim = 64
898
- dtype = torch.float32
899
-
900
- # Create cumulative sequence lengths
901
- cu_seqlens_q = create_cu_seqlens([q_seq])
902
- cu_seqlens_k = create_cu_seqlens([k_seq])
903
-
904
- # Create input tensors
905
- query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
906
- key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
907
- value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
908
-
909
- # Scale factor
910
- scale = 1.0 / (head_dim ** 0.5)
911
-
912
- # Call Flash Attention
913
- out = torch.empty_like(query)
914
- metal_flash_sdpa.flash_attention_varlen(
915
- out=out,
916
- query=query,
917
- key=key,
918
- value=value,
919
- cu_seqlens_q=cu_seqlens_q,
920
- cu_seqlens_k=cu_seqlens_k,
921
- max_seqlen_q=q_seq,
922
- max_seqlen_k=k_seq,
923
- do_causal=False,
924
- scale=scale,
925
- softcapping=1.0,
926
- )
927
-
928
- # With single token, output should be weighted average of values
929
- expected = torch.zeros_like(out)
930
- for h in range(num_heads):
931
- q_h = query[:, h, :] # [1, head_dim]
932
- k_h = key[:, h, :] # [k_seq, head_dim]
933
- v_h = value[:, h, :] # [k_seq, head_dim]
934
-
935
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
936
- attn_weights = torch.softmax(scores, dim=-1)
937
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
938
-
939
- torch.testing.assert_close(out, expected, rtol=1e-3, atol=1e-3)
940
-
941
-
942
  def test_flash_attn_varlen_func():
943
  """Test the flash_attn_varlen_func compatibility function."""
944
  torch.manual_seed(42)
@@ -992,141 +467,3 @@ def test_flash_attn_varlen_func():
992
 
993
  assert out_causal.shape == q.shape
994
  assert out_causal.abs().max().item() > 0
995
-
996
-
997
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
998
- @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
999
- def test_flash_attention_softcapping(dtype, head_dim):
1000
- """Test Flash Attention with softcapping."""
1001
- torch.manual_seed(42)
1002
-
1003
- # Test dimensions
1004
- seq_lengths = [32, 24]
1005
- num_heads = 4
1006
- softcapping = 50.0
1007
-
1008
- # Create cumulative sequence lengths
1009
- cu_seqlens = create_cu_seqlens(seq_lengths)
1010
- total_tokens = sum(seq_lengths)
1011
- max_seqlen = max(seq_lengths)
1012
-
1013
- # Create input tensors
1014
- query = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
1015
- key = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
1016
- value = torch.randn(total_tokens, num_heads, head_dim, dtype=dtype, device="mps")
1017
-
1018
- # Scale factor
1019
- scale = 1.0 / (head_dim ** 0.5)
1020
-
1021
- # Call Flash Attention with softcapping
1022
- out = torch.empty_like(query)
1023
- metal_flash_sdpa.flash_attention_varlen(
1024
- out=out,
1025
- query=query,
1026
- key=key,
1027
- value=value,
1028
- cu_seqlens_q=cu_seqlens,
1029
- cu_seqlens_k=cu_seqlens,
1030
- max_seqlen_q=max_seqlen,
1031
- max_seqlen_k=max_seqlen,
1032
- do_causal=False,
1033
- scale=scale,
1034
- softcapping=softcapping,
1035
- )
1036
-
1037
- # Compute ground truth with softcapping
1038
- # The kernel applies: softmax(tanh(qk^T*scale/cap)*cap)v
1039
- expected = torch.zeros_like(query)
1040
-
1041
- for i, (start, end) in enumerate(zip(cu_seqlens[:-1], cu_seqlens[1:])):
1042
- q_seq = query[start:end]
1043
- k_seq = key[start:end]
1044
- v_seq = value[start:end]
1045
-
1046
- for h in range(num_heads):
1047
- q_h = q_seq[:, h, :]
1048
- k_h = k_seq[:, h, :]
1049
- v_h = v_seq[:, h, :]
1050
-
1051
- # Apply softcapping formula
1052
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * (scale / softcapping)
1053
- scores = torch.tanh(scores) * softcapping
1054
- attn_weights = torch.softmax(scores, dim=-1)
1055
- expected[start:end, h, :] = torch.matmul(attn_weights, v_h)
1056
-
1057
- # Check results (higher tolerance for bfloat16 and softcapping)
1058
- if dtype == torch.bfloat16:
1059
- rtol, atol = 3e-2, 3e-2
1060
- elif dtype == torch.float16:
1061
- rtol, atol = 2e-2, 2e-2
1062
- else:
1063
- rtol, atol = 1e-2, 1e-2
1064
- torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
1065
-
1066
-
1067
- @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
1068
- def test_flash_attention_softcapping_edge_cases(dtype):
1069
- """Test Flash Attention softcapping with edge cases."""
1070
- torch.manual_seed(42)
1071
-
1072
- # Test with softcapping = 1.0 (no softcapping)
1073
- seq_len = 16
1074
- num_heads = 2
1075
- head_dim = 64
1076
-
1077
- cu_seqlens = create_cu_seqlens([seq_len])
1078
- query = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
1079
- key = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
1080
- value = torch.randn(seq_len, num_heads, head_dim, dtype=dtype, device="mps")
1081
-
1082
- scale = 1.0 / (head_dim ** 0.5)
1083
-
1084
- # With softcapping = 1.0 (no effect)
1085
- out_no_cap = torch.empty_like(query)
1086
- metal_flash_sdpa.flash_attention_varlen(
1087
- out=out_no_cap,
1088
- query=query,
1089
- key=key,
1090
- value=value,
1091
- cu_seqlens_q=cu_seqlens,
1092
- cu_seqlens_k=cu_seqlens,
1093
- max_seqlen_q=seq_len,
1094
- max_seqlen_k=seq_len,
1095
- do_causal=False,
1096
- scale=scale,
1097
- softcapping=1.0,
1098
- )
1099
-
1100
- # Regular computation without softcapping
1101
- expected = torch.zeros_like(query)
1102
- for h in range(num_heads):
1103
- q_h = query[:, h, :]
1104
- k_h = key[:, h, :]
1105
- v_h = value[:, h, :]
1106
-
1107
- scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
1108
- attn_weights = torch.softmax(scores, dim=-1)
1109
- expected[:, h, :] = torch.matmul(attn_weights, v_h)
1110
-
1111
- # Should be identical when softcapping = 1.0
1112
- rtol, atol = (2e-2, 2e-2) if dtype != torch.float32 else (1e-3, 1e-3)
1113
- torch.testing.assert_close(out_no_cap, expected, rtol=rtol, atol=atol)
1114
-
1115
- # Test with very large softcapping value
1116
- out_large_cap = torch.empty_like(query)
1117
- metal_flash_sdpa.flash_attention_varlen(
1118
- out=out_large_cap,
1119
- query=query,
1120
- key=key,
1121
- value=value,
1122
- cu_seqlens_q=cu_seqlens,
1123
- cu_seqlens_k=cu_seqlens,
1124
- max_seqlen_q=seq_len,
1125
- max_seqlen_k=seq_len,
1126
- do_causal=False,
1127
- scale=scale,
1128
- softcapping=1000.0,
1129
- )
1130
-
1131
- # With very large softcapping, should be close to no softcapping
1132
- torch.testing.assert_close(out_large_cap, expected, rtol=rtol, atol=atol)
14
+ def compute_attention_reference(query, key, value, scale, causal=False, softcapping=1.0, gqa_ratio=1):
15
+ """Compute reference attention output for validation."""
16
+ num_heads = query.shape[1]
17
+ expected = torch.zeros_like(query)
18
 
19
  for h in range(num_heads):
20
+ kv_h = h // gqa_ratio if gqa_ratio > 1 else h
21
+ q_h = query[:, h, :]
22
+ k_h = key[:, kv_h, :]
23
+ v_h = value[:, kv_h, :]
24
 
25
  scores = torch.matmul(q_h, k_h.transpose(-1, -2)) * scale
26
+
27
+ # Apply softcapping if not 1.0
28
+ if softcapping != 1.0:
29
+ scores = scores / softcapping
30
+ scores = torch.tanh(scores) * softcapping
31
+
32
+ # Apply causal mask if needed
33
+ if causal:
34
+ seq_len = query.shape[0]
35
+ causal_mask = torch.triu(torch.ones(seq_len, seq_len, device="mps"), diagonal=1).bool()
36
+ scores.masked_fill_(causal_mask, float("-inf"))
37
+
38
  attn_weights = torch.softmax(scores, dim=-1)
39
  expected[:, h, :] = torch.matmul(attn_weights, v_h)
40
 
41
+ return expected
42
+
43
+
44
+ def get_tolerance(dtype, head_dim):
45
+ """Get appropriate tolerance based on dtype and head dimension."""
46
  if dtype == torch.bfloat16:
47
+ return (2e-2, 2e-2) if head_dim >= 96 else (1e-2, 1e-2)
 
48
  elif dtype == torch.float16:
49
+ return (2e-3, 2e-3)
50
  else:
51
+ return (1e-3, 1e-3)
 
52
 
53
 
54
  @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
55
  @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
56
+ @pytest.mark.parametrize("seq_config", [
57
+ # (seq_lengths_q, seq_lengths_k, description)
58
+ ([32], [32], "single_sequence"),
59
+ ([8, 16, 12], [10, 20, 15], "variable_lengths"),
60
+ ([16, 24], [16, 24], "multiple_sequences"),
61
+ ([2], [2], "small_sequence_2"),
62
+ ([4], [4], "small_sequence_4"),
63
+ ([8], [8], "small_sequence_8"),
64
+ ([16], [32], "cross_attention_q_lt_k"),
65
+ ([32], [16], "cross_attention_q_gt_k"),
66
+ ([8], [128], "cross_attention_large_diff"),
67
+ ([1], [64], "single_query_token"),
68
+ ])
69
+ @pytest.mark.parametrize("causal", [False, True])
70
+ def test_flash_attention_comprehensive(dtype, head_dim, seq_config, causal):
71
+ """Comprehensive test for Flash Attention with various configurations."""
72
  torch.manual_seed(42)
73
 
74
+ seq_lengths_q, seq_lengths_k, _ = seq_config
75
+
76
+ # Skip causal tests for cross-attention cases
77
+ if causal and seq_lengths_q != seq_lengths_k:
78
+ pytest.skip("Causal attention only valid when q_seq == k_seq")
79
+
80
+ # Test parameters
81
  num_heads = 4
82
 
83
  # Create cumulative sequence lengths
 
108
  cu_seqlens_k=cu_seqlens_k,
109
  max_seqlen_q=max_seqlen_q,
110
  max_seqlen_k=max_seqlen_k,
111
+ do_causal=causal,
112
  scale=scale,
113
  softcapping=1.0,
114
  )
115
 
116
  # Compute ground truth for each sequence
117
  expected = torch.zeros_like(out)
118
+ batch_size = len(seq_lengths_q)
119
+
120
  for i in range(batch_size):
121
  q_start, q_end = cu_seqlens_q[i].item(), cu_seqlens_q[i+1].item()
122
  k_start, k_end = cu_seqlens_k[i].item(), cu_seqlens_k[i+1].item()
123
 
124
+ if q_end > q_start and k_end > k_start: # Skip empty sequences
125
+ q_i = query[q_start:q_end]
126
+ k_i = key[k_start:k_end]
127
+ v_i = value[k_start:k_end]
128
 
129
+ expected_i = compute_attention_reference(q_i, k_i, v_i, scale, causal=causal)
130
+ expected[q_start:q_end] = expected_i
131
 
132
+ # Check results
133
+ rtol, atol = get_tolerance(dtype, head_dim)
134
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
135
 
136
 
137
  @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
138
  @pytest.mark.parametrize("head_dim", [32, 64, 72, 80, 96, 128, 256])
139
+ @pytest.mark.parametrize("gqa_config", [
140
+ # (num_heads, num_kv_heads, seq_len)
141
+ (8, 2, 32), # 4:1 ratio
142
+ (16, 4, 32), # 4:1 ratio
143
+ (16, 8, 32), # 2:1 ratio
144
+ (16, 2, 32), # 8:1 ratio
145
+ (16, 4, 128), # 4:1 ratio with larger sequence
146
+ ])
147
+ def test_flash_attention_gqa(dtype, head_dim, gqa_config):
148
+ """Test Flash Attention with Grouped Query Attention configurations."""
149
  torch.manual_seed(42)
150
 
151
+ num_heads, num_kv_heads, seq_len = gqa_config
152
+ gqa_ratio = num_heads // num_kv_heads
153
 
154
  # Create cumulative sequence lengths
155
  cu_seqlens = create_cu_seqlens([seq_len])
 
179
  )
180
 
181
  # Compute ground truth with GQA
182
+ expected = compute_attention_reference(query, key, value, scale, gqa_ratio=gqa_ratio)
183
 
184
+ # Check results
185
+ rtol, atol = get_tolerance(dtype, head_dim)
186
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
187
 
188
 
189
  @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
190
+ @pytest.mark.parametrize("softcapping_config", [
191
+ # (softcapping_value, seq_lengths, head_dim)
192
+ (1.0, [32], 64), # No softcapping
193
+ (50.0, [32, 24], 64), # Regular softcapping
194
+ (10.0, [16], 128), # Strong softcapping
195
+ (1000.0, [16], 64), # Very weak softcapping
196
+ (30.0, [48], 96), # Medium softcapping
197
+ ])
198
+ def test_flash_attention_softcapping(dtype, softcapping_config):
199
+ """Test Flash Attention with various softcapping values."""
200
  torch.manual_seed(42)
201
 
202
+ softcapping, seq_lengths, head_dim = softcapping_config
203
+ num_heads = 4
204
 
205
  # Create cumulative sequence lengths
206
  cu_seqlens = create_cu_seqlens(seq_lengths)
 
215
  # Scale factor
216
  scale = 1.0 / (head_dim ** 0.5)
217
 
218
+ # Call Flash Attention with softcapping
219
  out = torch.empty_like(query)
220
  metal_flash_sdpa.flash_attention_varlen(
221
  out=out,
 
228
  max_seqlen_k=max_seqlen,
229
  do_causal=False,
230
  scale=scale,
231
+ softcapping=softcapping,
232
  )
233
 
234
+ # Compute ground truth with softcapping
235
+ expected = torch.zeros_like(query)
236
+
237
+ for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:]):
238
+ if end > start:
239
+ q_seq = query[start:end]
240
+ k_seq = key[start:end]
241
+ v_seq = value[start:end]
242
 
243
+ expected_seq = compute_attention_reference(
244
+ q_seq, k_seq, v_seq, scale, softcapping=softcapping
245
+ )
246
+ expected[start:end] = expected_seq
247
 
248
+ # Check results (higher tolerance for softcapping)
249
  if dtype == torch.bfloat16:
250
+ rtol, atol = 3e-2, 3e-2
 
251
  elif dtype == torch.float16:
252
+ rtol, atol = 2e-2, 2e-2
253
  else:
254
+ rtol, atol = 1e-2, 1e-2
255
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
256
 
257
 
258
+ @pytest.mark.parametrize("large_seq_config", [
259
+ # (q_seq, k_seq, head_dim, dtype)
260
+ (32, 2048, 64, torch.float32),
261
+ (16, 1024, 96, torch.float16),
262
+ (64, 1536, 64, torch.bfloat16),
263
+ ])
264
+ def test_flash_attention_large_sequences(large_seq_config):
265
+ """Test Flash Attention with large k sequences (>= 1024)."""
266
  torch.manual_seed(42)
267
 
268
+ q_seq, k_seq, head_dim, dtype = large_seq_config
 
269
  num_heads = 4
 
270
 
271
  # Create cumulative sequence lengths
272
+ cu_seqlens_q = create_cu_seqlens([q_seq])
273
+ cu_seqlens_k = create_cu_seqlens([k_seq])
274
 
275
  # Create input tensors
276
+ query = torch.randn(q_seq, num_heads, head_dim, dtype=dtype, device="mps")
277
+ key = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
278
+ value = torch.randn(k_seq, num_heads, head_dim, dtype=dtype, device="mps")
279
 
280
  # Scale factor
281
  scale = 1.0 / (head_dim ** 0.5)
282
 
283
+ # Call Flash Attention
284
  out = torch.empty_like(query)
285
  metal_flash_sdpa.flash_attention_varlen(
286
  out=out,
287
  query=query,
288
  key=key,
289
  value=value,
290
+ cu_seqlens_q=cu_seqlens_q,
291
+ cu_seqlens_k=cu_seqlens_k,
292
+ max_seqlen_q=q_seq,
293
+ max_seqlen_k=k_seq,
294
+ do_causal=False,
295
  scale=scale,
296
  softcapping=1.0,
297
  )
298
 
299
+ # Compute ground truth
300
+ expected = compute_attention_reference(query, key, value, scale)
301
 
302
+ # Check results (higher tolerance for large sequences)
303
  if dtype == torch.bfloat16:
304
+ rtol, atol = 3e-2, 3e-2
 
305
  elif dtype == torch.float16:
306
+ rtol, atol = 5e-3, 5e-3
307
  else:
308
+ rtol, atol = 2e-3, 2e-3
309
  torch.testing.assert_close(out, expected, rtol=rtol, atol=atol)
310
 
311
 
312
  def test_flash_attention_edge_cases():
313
  """Test Flash Attention edge cases."""
314
  torch.manual_seed(42)
 
388
  softcapping=1.0,
389
  )
390
 
391
+ # Test 2: Wrong dtype for cu_seqlens (should be int32)
392
+ cu_seqlens_wrong = torch.tensor([0, 16], dtype=torch.int64, device="mps")
393
  query = torch.randn(16, 4, 64, device="mps")
394
  key = torch.randn(16, 4, 64, device="mps")
395
  value = torch.randn(16, 4, 64, device="mps")
396
 
397
  # This will silently fail (output will be unchanged)
 
398
  out = torch.full_like(query, -999.0)
399
  metal_flash_sdpa.flash_attention_varlen(
400
  out=out,
 
414
  assert (out == -999.0).all(), "cu_seqlens with wrong dtype should cause kernel to not run"
415
 
416
 
417
  def test_flash_attn_varlen_func():
418
  """Test the flash_attn_varlen_func compatibility function."""
419
  torch.manual_seed(42)
 
467
 
468
  assert out_causal.shape == q.shape
469
  assert out_causal.abs().max().item() > 0
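As an aside (a hypothetical cross-check, not part of this commit): with softcapping=1.0, no causal mask, and no GQA, the per-head reference computation used throughout these tests should agree with PyTorch's built-in scaled_dot_product_attention. A minimal CPU sketch, assuming PyTorch 2.1+ for the explicit scale keyword:

# Hypothetical cross-check, not part of this commit. With softcapping=1.0, no
# causal mask, and no GQA, the per-head reference used in these tests should
# match torch.nn.functional.scaled_dot_product_attention. Runs on CPU for
# portability; assumes PyTorch 2.1+ for the explicit scale keyword.
import torch
import torch.nn.functional as F

torch.manual_seed(0)
seq_len, num_heads, head_dim = 16, 4, 64
scale = 1.0 / (head_dim ** 0.5)
q = torch.randn(seq_len, num_heads, head_dim)
k = torch.randn(seq_len, num_heads, head_dim)
v = torch.randn(seq_len, num_heads, head_dim)

# Per-head loop, mirroring compute_attention_reference with default arguments.
expected = torch.zeros_like(q)
for h in range(num_heads):
    scores = q[:, h, :] @ k[:, h, :].transpose(-1, -2) * scale
    expected[:, h, :] = torch.softmax(scores, dim=-1) @ v[:, h, :]

# Same computation through SDPA, using a [num_heads, seq_len, head_dim] layout.
sdpa_out = F.scaled_dot_product_attention(
    q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1), scale=scale
).transpose(0, 1)
torch.testing.assert_close(expected, sdpa_out, rtol=1e-4, atol=1e-4)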