dacorvo HF Staff committed on
Commit
25e9ebd
·
1 Parent(s): cdc796b

add trn2 cached configs subdirectory

Browse files
inference-cache-config/trn2/llama3.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "meta-llama/Llama-3.1-8B": [
3
+ {
4
+ "batch_size": 1,
5
+ "sequence_length": 4096,
6
+ "tensor_parallel_size": 4,
7
+ "instance_type" : "trn2"
8
+ },
9
+ {
10
+ "batch_size": 8,
11
+ "sequence_length": 4096,
12
+ "tensor_parallel_size" : 4,
13
+ "instance_type" : "trn2"
14
+ },
15
+ {
16
+ "batch_size": 32,
17
+ "sequence_length": 4096,
18
+ "tensor_parallel_size": 4,
19
+ "instance_type" : "trn2"
20
+ },
21
+ {
22
+ "batch_size": 64,
23
+ "sequence_length": 4096,
24
+ "tensor_parallel_size": 4,
25
+ "instance_type" : "trn2"
26
+ }
27
+ ],
28
+ "meta-llama/Llama-3.2-1B": [
29
+ {
30
+ "batch_size": 1,
31
+ "sequence_length": 4096,
32
+ "tensor_parallel_size" : 4,
33
+ "instance_type" : "trn2"
34
+ },
35
+ {
36
+ "batch_size": 4,
37
+ "sequence_length": 4096,
38
+ "tensor_parallel_size" : 4,
39
+ "instance_type" : "trn2"
40
+ },
41
+ {
42
+ "batch_size": 64,
43
+ "sequence_length": 4096,
44
+ "tensor_parallel_size" : 4,
45
+ "instance_type" : "trn2"
46
+ }
47
+ ],
48
+ "meta-llama/Llama-3.2-3B": [
49
+ {
50
+ "batch_size": 1,
51
+ "sequence_length": 4096,
52
+ "tensor_parallel_size" : 4,
53
+ "instance_type" : "trn2"
54
+ },
55
+ {
56
+ "batch_size": 64,
57
+ "sequence_length": 4096,
58
+ "tensor_parallel_size" : 4,
59
+ "instance_type" : "trn2"
60
+ }
61
+ ],
62
+ "TinyLlama/TinyLlama-1.1B-Chat-v1.0": [
63
+ {
64
+ "batch_size": 1,
65
+ "sequence_length": 2048,
66
+ "tensor_parallel_size" : 4,
67
+ "instance_type" : "trn2"
68
+ }
69
+ ],
70
+ "meta-llama/Llama-3.3-70B-Instruct": [
71
+ {
72
+ "batch_size": 32,
73
+ "sequence_length": 4096,
74
+ "tensor_parallel_size": 32,
75
+ "instance_type" : "trn2"
76
+ },
77
+ {
78
+ "batch_size": 1,
79
+ "sequence_length": 4096,
80
+ "tensor_parallel_size": 64,
81
+ "instance_type" : "trn2"
82
+ },
83
+ {
84
+ "batch_size": 32,
85
+ "sequence_length": 4096,
86
+ "tensor_parallel_size": 64,
87
+ "instance_type" : "trn2"
88
+ },
89
+ {
90
+ "batch_size": 64,
91
+ "sequence_length": 4096,
92
+ "tensor_parallel_size": 64,
93
+ "instance_type" : "trn2"
94
+ }
95
+ ]
96
+ }
inference-cache-config/trn2/llama4.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "meta-llama/Llama-4-Scout-17B-16E-Instruct": [
3
+ {
4
+ "task": "text-generation",
5
+ "instance_type": "trn2",
6
+ "batch_size": 1,
7
+ "sequence_length": 4096,
8
+ "tensor_parallel_size": 32
9
+ },
10
+ {
11
+ "task": "text-generation",
12
+ "instance_type": "trn2",
13
+ "batch_size": 8,
14
+ "sequence_length": 4096,
15
+ "tensor_parallel_size": 32
16
+ },
17
+ {
18
+ "task": "text-generation",
19
+ "instance_type": "trn2",
20
+ "batch_size": 32,
21
+ "sequence_length": 4096,
22
+ "tensor_parallel_size": 32
23
+ }
24
+ ],
25
+ "meta-llama/Llama-4-Maverick-17B-128E-Instruct": [
26
+ {
27
+ "task": "text-generation",
28
+ "instance_type": "trn2",
29
+ "batch_size": 1,
30
+ "sequence_length": 4096,
31
+ "tensor_parallel_size": 64
32
+ },
33
+ {
34
+ "task": "text-generation",
35
+ "instance_type": "trn2",
36
+ "batch_size": 8,
37
+ "sequence_length": 4096,
38
+ "tensor_parallel_size": 64
39
+ },
40
+ {
41
+ "task": "text-generation",
42
+ "instance_type": "trn2",
43
+ "batch_size": 32,
44
+ "sequence_length": 4096,
45
+ "tensor_parallel_size": 64
46
+ }
47
+ ]
48
+ }
inference-cache-config/trn2/qwen3-moe.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Qwen/Qwen3-30B-A3B-Instruct-2507": [
3
+ {
4
+ "batch_size": 1,
5
+ "sequence_length": 4096,
6
+ "tensor_parallel_size": 4,
7
+ "instance_type" : "trn2"
8
+ },
9
+ {
10
+ "batch_size": 8,
11
+ "sequence_length": 4096,
12
+ "tensor_parallel_size": 4,
13
+ "instance_type" : "trn2"
14
+ },
15
+ {
16
+ "batch_size": 16,
17
+ "sequence_length": 4096,
18
+ "tensor_parallel_size": 4,
19
+ "instance_type" : "trn2"
20
+ }
21
+ ],
22
+ "Qwen/Qwen3-235B-A22B-Instruct-2507": [
23
+ {
24
+ "batch_size": 1,
25
+ "sequence_length": 4096,
26
+ "tensor_parallel_size": 64,
27
+ "instance_type" : "trn2"
28
+ },
29
+ {
30
+ "batch_size": 4,
31
+ "sequence_length": 4096,
32
+ "tensor_parallel_size": 64,
33
+ "instance_type" : "trn2"
34
+ }
35
+ ]
36
+ }