magiccodingman committed on
Commit f5a619b · verified · 1 Parent(s): 3444fde

initial upload

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50). The per-tensor-group naming convention used by these directories is sketched after the list.
  1. .gitattributes +6 -0
  2. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json +44 -0
  3. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md +11 -0
  4. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log +177 -0
  5. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log +177 -0
  6. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log +177 -0
  7. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/bench_metrics.json +44 -0
  8. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md +11 -0
  9. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log +177 -0
  10. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log +177 -0
  11. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log +177 -0
  12. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/bench_metrics.json +44 -0
  13. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md +11 -0
  14. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log +177 -0
  15. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log +177 -0
  16. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log +177 -0
  17. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  18. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  19. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +177 -0
  20. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +177 -0
  21. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +177 -0
  22. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  23. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  24. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +177 -0
  25. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +177 -0
  26. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +177 -0
  27. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  28. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  29. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +177 -0
  30. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +177 -0
  31. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +177 -0
  32. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/bench_metrics.json +44 -0
  33. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/llamabench.md +11 -0
  34. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_code.log +176 -0
  35. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_general.log +176 -0
  36. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_math.log +176 -0
  37. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json +44 -0
  38. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md +11 -0
  39. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log +176 -0
  40. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log +176 -0
  41. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log +176 -0
  42. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json +44 -0
  43. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md +11 -0
  44. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log +177 -0
  45. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log +177 -0
  46. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log +177 -0
  47. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/bench_metrics.json +44 -0
  48. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md +11 -0
  49. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log +177 -0
  50. Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log +177 -0
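
Each benchmark directory name appears to encode one quantization recipe: the base GGUF type followed by per-tensor-group overrides (attn_kv, attn_output, attn_q, embeddings, ffn_down, ffn_up_gate, lm_head). A minimal sketch of splitting such a name back into its overrides; the helper and regex are illustrative only and are not part of this upload:

```python
# Hypothetical helper (not shipped in this repo): parse the per-tensor-group
# quantization overrides out of a Benchmarks/DataCollection directory name.
import re

GROUPS = ["attn_kv", "attn_output", "attn_q", "embeddings",
          "ffn_down", "ffn_up_gate", "lm_head"]

def parse_quant_dir(name: str) -> dict:
    """Return {tensor_group: quant_type} found in the directory name."""
    overrides = {}
    for group in GROUPS:
        m = re.search(rf"{group}_([A-Za-z0-9_]+?)(?:-|$)", name)
        if m:
            overrides[group] = m.group(1)
    return overrides

example = ("Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-"
           "attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-"
           "ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K")
print(parse_quant_dir(example))
# {'attn_kv': 'IQ4_NL', ..., 'lm_head': 'Q5_K'}
```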
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Instruct-2507-IQ4_NL.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Instruct-2507-Q5_K.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Instruct-2507-iq4_nl-EHQKOUD-IQ4NL.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Instruct-2507-iq4_nl-EHQKOUD-Q8_0.gguf filter=lfs diff=lfs merge=lfs -text
+ Qwen3-30B-A3B-Instruct-2507-mxfp4_moe-H-B16-EUR-IQ4NL-KO-Q5K-QD-Q6K.gguf filter=lfs diff=lfs merge=lfs -text
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "raw_metrics": {
+ "llamabench": {
+ "backend": "CUDA",
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md",
+ "ngl": "35",
+ "raw_row": {
+ "backend": "CUDA",
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
+ "ngl": "35",
+ "params": "30.53 B",
+ "size": "16.07 GiB",
+ "t/s": "140.87 \u00b1 7.21",
+ "test": "pp8",
+ "tps_value": 140.87
+ },
+ "test": "pp8",
+ "tps": 140.87
+ },
+ "perplexity": {
+ "code": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log",
+ "ppl": 1.3147,
+ "ppl_error": 0.00744
+ },
+ "general": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log",
+ "ppl": 6.3651,
+ "ppl_error": 0.1303
+ },
+ "math": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log",
+ "ppl": 5.743,
+ "ppl_error": 0.10634
+ }
+ }
+ },
+ "summary": {
+ "avg_prec_loss_pct": 1.1935,
+ "bench_tps": 140.87,
+ "file_size_bytes": 17263163392,
+ "file_size_gb": 16.08
+ }
+ }
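
Each bench_metrics.json in this upload follows the same shape: raw_metrics.llamabench holds the pp8 row from the llama-bench table, raw_metrics.perplexity holds one entry per corpus (code, general, math), and summary carries the headline numbers. A short sketch of reading one of these files; the script is illustrative and not something shipped in this commit:

```python
# Illustrative reader for a single bench_metrics.json from this repository.
import json
from pathlib import Path

path = Path("Benchmarks/DataCollection/"
            "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-"
            "attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/"
            "bench_metrics.json")
metrics = json.loads(path.read_text())

summary = metrics["summary"]
ppl = {corpus: entry["ppl"] for corpus, entry in metrics["raw_metrics"]["perplexity"].items()}
print(f"{summary['file_size_gb']} GB, {summary['bench_tps']} t/s (pp8), "
      f"avg precision loss {summary['avg_prec_loss_pct']} %, PPL by corpus: {ppl}")
```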
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ | model | size | params | backend | ngl | test | t/s |
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | pp8 | 140.87 ± 7.21 |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | tg128 | 47.97 ± 0.28 |
+
+ build: 92bb442ad (7040)
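
The t/s column of this table is what bench_metrics.json records as raw_row / tps. A minimal sketch of extracting those values from the markdown; the parser is illustrative only, not part of this upload:

```python
# Illustrative only: turn the llamabench.md table into {test: (t/s mean, error)}.
def parse_llamabench_md(text: str) -> dict:
    results = {}
    for line in text.splitlines():
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        # Data rows have 7 cells; skip the header and the "---" separator row.
        if len(cells) == 7 and cells[0] not in ("model", "") and not cells[0].startswith("-"):
            test, tps = cells[5], cells[6]          # e.g. "pp8", "140.87 ± 7.21"
            mean, err = (float(x) for x in tps.split("±"))
            results[test] = (mean, err)
    return results

with open("llamabench.md", encoding="utf-8") as f:
    print(parse_llamabench_md(f.read()))   # {'pp8': (140.87, 7.21), 'tg128': (47.97, 0.28)}
```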
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19988 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q5_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.07 GiB (4.52 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
+ llama_context: n_ctx = 2048
+ llama_context: n_ctx_seq = 2048
+ llama_context: n_batch = 2048
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = auto
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
+ llama_context: CPU output buffer size = 0.58 MiB
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
+ llama_context: Flash Attention was auto, set to enabled
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
+ llama_context: graph nodes = 3031
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
+ common_init_from_params: added <|endoftext|> logit bias = -inf
+ common_init_from_params: added <|im_end|> logit bias = -inf
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
+ common_init_from_params: added <|repo_name|> logit bias = -inf
+ common_init_from_params: added <|file_sep|> logit bias = -inf
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
+
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+ perplexity: tokenizing the input ..
+ perplexity: tokenization took 117.855 ms
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
+ perplexity: 3.36 seconds per pass - ETA 2.45 minutes
+ [1]1.6588,[2]1.5142,[3]1.3187,[4]1.2709,[5]1.3556,[6]1.4189,[7]1.4174,[8]1.4150,[9]1.3739,[10]1.3505,[11]1.3343,[12]1.3362,[13]1.3203,[14]1.3102,[15]1.3065,[16]1.2944,[17]1.2878,[18]1.2867,[19]1.2794,[20]1.2691,[21]1.2657,[22]1.2656,[23]1.2826,[24]1.2757,[25]1.2743,[26]1.2657,[27]1.2602,[28]1.2588,[29]1.2720,[30]1.2733,[31]1.2668,[32]1.2617,[33]1.2624,[34]1.2617,[35]1.2601,[36]1.2816,[37]1.2916,[38]1.2967,[39]1.3033,[40]1.3042,[41]1.3010,[42]1.3143,[43]1.3141,[44]1.3147,
+ Final estimate: PPL = 1.3147 +/- 0.00744
+
+ llama_perf_context_print: load time = 2663.26 ms
+ llama_perf_context_print: prompt eval time = 123334.59 ms / 90112 tokens ( 1.37 ms per token, 730.63 tokens per second)
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+ llama_perf_context_print: total time = 124567.69 ms / 90113 tokens
+ llama_perf_context_print: graphs reused = 0
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15982 + (3896 = 3351 + 40 + 504) + 4236 |
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
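
The avg_prec_loss_pct field in bench_metrics.json presumably compares the per-corpus PPL values from these logs against the unquantized (BF16) run that is also part of this upload; that baseline's numbers are not shown in this view. A hedged sketch of what such a comparison could look like, with placeholder baseline values that are not real measurements from this repository:

```python
# Assumption: "precision loss" is the mean percent increase in perplexity versus the
# BF16 baseline across the three corpora. The baseline numbers below are placeholders,
# NOT values taken from this repository.
quant_ppl    = {"code": 1.3147, "general": 6.3651, "math": 5.7430}   # from the logs above
baseline_ppl = {"code": 1.30,   "general": 6.30,   "math": 5.70}     # hypothetical BF16 values

loss_pct = [100.0 * (quant_ppl[c] / baseline_ppl[c] - 1.0) for c in quant_ppl]
print(f"avg precision loss ~ {sum(loss_pct) / len(loss_pct):.4f} %")
```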
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19983 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q5_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.07 GiB (4.52 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
+ llama_context: n_ctx = 2048
+ llama_context: n_ctx_seq = 2048
+ llama_context: n_batch = 2048
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = auto
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
+ llama_context: CPU output buffer size = 0.58 MiB
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
+ llama_context: Flash Attention was auto, set to enabled
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
+ llama_context: graph nodes = 3031
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
+ common_init_from_params: added <|endoftext|> logit bias = -inf
+ common_init_from_params: added <|im_end|> logit bias = -inf
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
+ common_init_from_params: added <|repo_name|> logit bias = -inf
+ common_init_from_params: added <|file_sep|> logit bias = -inf
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
+
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+ perplexity: tokenizing the input ..
+ perplexity: tokenization took 52.329 ms
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
+ perplexity: 3.29 seconds per pass - ETA 0.82 minutes
+ [1]5.3556,[2]6.4118,[3]6.8430,[4]6.7803,[5]6.6687,[6]5.7520,[7]5.2324,[8]5.2639,[9]5.5506,[10]5.6965,[11]5.7688,[12]6.0850,[13]6.1602,[14]6.2917,[15]6.3651,
+ Final estimate: PPL = 6.3651 +/- 0.13030
+
+ llama_perf_context_print: load time = 2647.55 ms
+ llama_perf_context_print: prompt eval time = 45441.66 ms / 30720 tokens ( 1.48 ms per token, 676.03 tokens per second)
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+ llama_perf_context_print: total time = 45875.75 ms / 30721 tokens
+ llama_perf_context_print: graphs reused = 0
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15984 + (3896 = 3351 + 40 + 504) + 4234 |
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19984 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q5_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.07 GiB (4.52 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
+ llama_context: n_ctx = 2048
+ llama_context: n_ctx_seq = 2048
+ llama_context: n_batch = 2048
+ llama_context: n_ubatch = 512
+ llama_context: causal_attn = 1
+ llama_context: flash_attn = auto
+ llama_context: kv_unified = false
+ llama_context: freq_base = 10000000.0
+ llama_context: freq_scale = 1
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
+ llama_context: CPU output buffer size = 0.58 MiB
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
+ llama_context: Flash Attention was auto, set to enabled
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
+ llama_context: graph nodes = 3031
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
+ common_init_from_params: added <|endoftext|> logit bias = -inf
+ common_init_from_params: added <|im_end|> logit bias = -inf
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
+ common_init_from_params: added <|repo_name|> logit bias = -inf
+ common_init_from_params: added <|file_sep|> logit bias = -inf
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
+
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
+ perplexity: tokenizing the input ..
+ perplexity: tokenization took 45.284 ms
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
+ perplexity: 3.37 seconds per pass - ETA 0.88 minutes
+ [1]4.6757,[2]5.0631,[3]5.3547,[4]5.4818,[5]5.6760,[6]5.6852,[7]5.6674,[8]5.6224,[9]5.6617,[10]5.6423,[11]5.6595,[12]5.6533,[13]5.7380,[14]5.7461,[15]5.7330,[16]5.7430,
+ Final estimate: PPL = 5.7430 +/- 0.10634
+
+ llama_perf_context_print: load time = 2432.27 ms
+ llama_perf_context_print: prompt eval time = 49391.86 ms / 32768 tokens ( 1.51 ms per token, 663.43 tokens per second)
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
+ llama_perf_context_print: total time = 49835.53 ms / 32769 tokens
+ llama_perf_context_print: graphs reused = 0
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15981 + (3896 = 3351 + 40 + 504) + 4237 |
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "raw_metrics": {
+ "llamabench": {
+ "backend": "CUDA",
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md",
+ "ngl": "35",
+ "raw_row": {
+ "backend": "CUDA",
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
+ "ngl": "35",
+ "params": "30.53 B",
+ "size": "16.11 GiB",
+ "t/s": "135.74 \u00b1 5.23",
+ "test": "pp8",
+ "tps_value": 135.74
+ },
+ "test": "pp8",
+ "tps": 135.74
+ },
+ "perplexity": {
+ "code": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log",
+ "ppl": 1.3142,
+ "ppl_error": 0.00744
+ },
+ "general": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log",
+ "ppl": 6.319,
+ "ppl_error": 0.12895
+ },
+ "math": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log",
+ "ppl": 5.7257,
+ "ppl_error": 0.10573
+ }
+ }
+ },
+ "summary": {
+ "avg_prec_loss_pct": 0.8341,
+ "bench_tps": 135.74,
+ "file_size_bytes": 17304489984,
+ "file_size_gb": 16.12
+ }
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ | model | size | params | backend | ngl | test | t/s |
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | pp8 | 135.74 ± 5.23 |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | tg128 | 48.32 ± 0.46 |
+
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19989 MiB free
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
+ llama_model_loader: - kv 1: general.type str = model
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
+ llama_model_loader: - kv 3: general.version str = 2507
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
+ llama_model_loader: - kv 5: general.basename str = Qwen3
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
+ llama_model_loader: - kv 7: general.license str = apache-2.0
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
+ llama_model_loader: - kv 38: general.file_type u32 = 25
+ llama_model_loader: - type f32: 241 tensors
+ llama_model_loader: - type q6_K: 1 tensors
+ llama_model_loader: - type iq4_nl: 337 tensors
+ print_info: file format = GGUF V3 (latest)
+ print_info: file type = IQ4_NL - 4.5 bpw
+ print_info: file size = 16.11 GiB (4.53 BPW)
+ load: printing all EOG tokens:
+ load: - 151643 ('<|endoftext|>')
+ load: - 151645 ('<|im_end|>')
+ load: - 151662 ('<|fim_pad|>')
+ load: - 151663 ('<|repo_name|>')
+ load: - 151664 ('<|file_sep|>')
+ load: special tokens cache size = 26
+ load: token to piece cache size = 0.9311 MB
+ print_info: arch = qwen3moe
+ print_info: vocab_only = 0
+ print_info: n_ctx_train = 262144
+ print_info: n_embd = 2048
+ print_info: n_embd_inp = 2048
+ print_info: n_layer = 48
+ print_info: n_head = 32
+ print_info: n_head_kv = 4
+ print_info: n_rot = 128
+ print_info: n_swa = 0
+ print_info: is_swa_any = 0
+ print_info: n_embd_head_k = 128
+ print_info: n_embd_head_v = 128
+ print_info: n_gqa = 8
+ print_info: n_embd_k_gqa = 512
+ print_info: n_embd_v_gqa = 512
+ print_info: f_norm_eps = 0.0e+00
+ print_info: f_norm_rms_eps = 1.0e-06
+ print_info: f_clamp_kqv = 0.0e+00
+ print_info: f_max_alibi_bias = 0.0e+00
+ print_info: f_logit_scale = 0.0e+00
+ print_info: f_attn_scale = 0.0e+00
+ print_info: n_ff = 6144
+ print_info: n_expert = 128
+ print_info: n_expert_used = 8
+ print_info: n_expert_groups = 0
+ print_info: n_group_used = 0
+ print_info: causal attn = 1
+ print_info: pooling type = 0
+ print_info: rope type = 2
+ print_info: rope scaling = linear
+ print_info: freq_base_train = 10000000.0
+ print_info: freq_scale_train = 1
+ print_info: n_ctx_orig_yarn = 262144
+ print_info: rope_finetuned = unknown
+ print_info: model type = 30B.A3B
+ print_info: model params = 30.53 B
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
+ print_info: n_ff_exp = 768
+ print_info: vocab type = BPE
+ print_info: n_vocab = 151936
+ print_info: n_merges = 151387
+ print_info: BOS token = 11 ','
+ print_info: EOS token = 151645 '<|im_end|>'
+ print_info: EOT token = 151645 '<|im_end|>'
+ print_info: PAD token = 151654 '<|vision_pad|>'
+ print_info: LF token = 198 'Ċ'
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
+ print_info: FIM REP token = 151663 '<|repo_name|>'
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
+ print_info: EOG token = 151643 '<|endoftext|>'
+ print_info: EOG token = 151645 '<|im_end|>'
+ print_info: EOG token = 151662 '<|fim_pad|>'
+ print_info: EOG token = 151663 '<|repo_name|>'
+ print_info: EOG token = 151664 '<|file_sep|>'
+ print_info: max token length = 256
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
+ load_tensors: offloading 20 repeating layers to GPU
+ load_tensors: offloaded 20/49 layers to GPU
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
+ ....................................................................................................
+ llama_context: constructing llama_context
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 115.464 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.30 seconds per pass - ETA 2.42 minutes
166
+ [1]1.6533,[2]1.5118,[3]1.3173,[4]1.2701,[5]1.3540,[6]1.4175,[7]1.4151,[8]1.4125,[9]1.3718,[10]1.3486,[11]1.3324,[12]1.3343,[13]1.3186,[14]1.3087,[15]1.3051,[16]1.2930,[17]1.2865,[18]1.2855,[19]1.2783,[20]1.2680,[21]1.2647,[22]1.2647,[23]1.2815,[24]1.2747,[25]1.2734,[26]1.2648,[27]1.2592,[28]1.2579,[29]1.2711,[30]1.2724,[31]1.2659,[32]1.2609,[33]1.2616,[34]1.2611,[35]1.2595,[36]1.2810,[37]1.2910,[38]1.2960,[39]1.3026,[40]1.3036,[41]1.3005,[42]1.3137,[43]1.3136,[44]1.3142,
167
+ Final estimate: PPL = 1.3142 +/- 0.00744
168
+
169
+ llama_perf_context_print: load time = 4365.32 ms
170
+ llama_perf_context_print: prompt eval time = 122274.24 ms / 90112 tokens ( 1.36 ms per token, 736.97 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 123498.16 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15940 + (3935 = 3351 + 40 + 544) + 4239 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19985 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 50.79 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.29 seconds per pass - ETA 0.82 minutes
166
+ [1]5.3106,[2]6.3757,[3]6.7987,[4]6.7346,[5]6.6164,[6]5.7094,[7]5.1946,[8]5.2234,[9]5.5077,[10]5.6519,[11]5.7246,[12]6.0393,[13]6.1152,[14]6.2474,[15]6.3190,
167
+ Final estimate: PPL = 6.3190 +/- 0.12895
168
+
169
+ llama_perf_context_print: load time = 2484.87 ms
170
+ llama_perf_context_print: prompt eval time = 45222.21 ms / 30720 tokens ( 1.47 ms per token, 679.31 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45651.06 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15944 + (3935 = 3351 + 40 + 544) + 4235 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19981 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 43.338 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.35 seconds per pass - ETA 0.88 minutes
166
+ [1]4.6798,[2]5.0540,[3]5.3400,[4]5.4661,[5]5.6520,[6]5.6638,[7]5.6465,[8]5.6008,[9]5.6432,[10]5.6246,[11]5.6416,[12]5.6352,[13]5.7182,[14]5.7245,[15]5.7141,[16]5.7257,
167
+ Final estimate: PPL = 5.7257 +/- 0.10573
168
+
169
+ llama_perf_context_print: load time = 2467.87 ms
170
+ llama_perf_context_print: prompt eval time = 49118.45 ms / 32768 tokens ( 1.50 ms per token, 667.12 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49563.63 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15980 + (3935 = 3351 + 40 + 544) + 4199 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.18 GiB",
13
+ "t/s": "144.56 \u00b1 9.14",
14
+ "test": "pp8",
15
+ "tps_value": 144.56
16
+ },
17
+ "test": "pp8",
18
+ "tps": 144.56
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log",
23
+ "ppl": 1.3142,
24
+ "ppl_error": 0.00744
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log",
28
+ "ppl": 6.3155,
29
+ "ppl_error": 0.12881
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log",
33
+ "ppl": 5.7194,
34
+ "ppl_error": 0.10557
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 0.7787,
40
+ "bench_tps": 144.56,
41
+ "file_size_bytes": 17379850240,
42
+ "file_size_gb": 16.19
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | pp8 | 144.56 ± 9.14 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | tg128 | 44.01 ± 0.27 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20015 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 114.349 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.33 seconds per pass - ETA 2.43 minutes
166
+ [1]1.6565,[2]1.5134,[3]1.3182,[4]1.2707,[5]1.3548,[6]1.4179,[7]1.4156,[8]1.4129,[9]1.3722,[10]1.3488,[11]1.3327,[12]1.3346,[13]1.3190,[14]1.3089,[15]1.3053,[16]1.2932,[17]1.2867,[18]1.2856,[19]1.2785,[20]1.2682,[21]1.2649,[22]1.2648,[23]1.2817,[24]1.2749,[25]1.2736,[26]1.2650,[27]1.2594,[28]1.2582,[29]1.2713,[30]1.2727,[31]1.2661,[32]1.2611,[33]1.2619,[34]1.2613,[35]1.2597,[36]1.2812,[37]1.2911,[38]1.2962,[39]1.3027,[40]1.3037,[41]1.3005,[42]1.3137,[43]1.3136,[44]1.3142,
167
+ Final estimate: PPL = 1.3142 +/- 0.00744
168
+
169
+ llama_perf_context_print: load time = 4820.37 ms
170
+ llama_perf_context_print: prompt eval time = 122876.25 ms / 90112 tokens ( 1.36 ms per token, 733.36 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 124103.45 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15901 + (4007 = 3351 + 40 + 616) + 4206 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20015 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 47.786 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.31 seconds per pass - ETA 0.82 minutes
166
+ [1]5.3084,[2]6.3611,[3]6.7890,[4]6.7222,[5]6.6072,[6]5.7012,[7]5.1893,[8]5.2196,[9]5.5058,[10]5.6496,[11]5.7213,[12]6.0346,[13]6.1112,[14]6.2425,[15]6.3155,
167
+ Final estimate: PPL = 6.3155 +/- 0.12881
168
+
169
+ llama_perf_context_print: load time = 2465.82 ms
170
+ llama_perf_context_print: prompt eval time = 45681.14 ms / 30720 tokens ( 1.49 ms per token, 672.49 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 46106.99 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15894 + (4007 = 3351 + 40 + 616) + 4213 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20017 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q8_0.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 616.05 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 46.047 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.40 seconds per pass - ETA 0.90 minutes
166
+ [1]4.6668,[2]5.0427,[3]5.3293,[4]5.4538,[5]5.6404,[6]5.6524,[7]5.6372,[8]5.5917,[9]5.6348,[10]5.6157,[11]5.6336,[12]5.6289,[13]5.7113,[14]5.7188,[15]5.7084,[16]5.7194,
167
+ Final estimate: PPL = 5.7194 +/- 0.10557
168
+
169
+ llama_perf_context_print: load time = 2673.47 ms
170
+ llama_perf_context_print: prompt eval time = 49732.47 ms / 32768 tokens ( 1.52 ms per token, 658.89 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 50182.87 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15901 + (4007 = 3351 + 40 + 616) + 4206 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.07 GiB",
13
+ "t/s": "148.92 \u00b1 13.32",
14
+ "test": "pp8",
15
+ "tps_value": 148.92
16
+ },
17
+ "test": "pp8",
18
+ "tps": 148.92
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.3169,
24
+ "ppl_error": 0.00749
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.4867,
29
+ "ppl_error": 0.13379
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8717,
34
+ "ppl_error": 0.11
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.6491,
40
+ "bench_tps": 148.92,
41
+ "file_size_bytes": 17263163392,
42
+ "file_size_gb": 16.08
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | pp8 | 148.92 ± 13.32 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.07 GiB | 30.53 B | CUDA | 35 | tg128 | 53.67 ± 0.94 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19983 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.07 GiB (4.52 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 110.015 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.38 seconds per pass - ETA 2.47 minutes
166
+ [1]1.6598,[2]1.5146,[3]1.3189,[4]1.2705,[5]1.3552,[6]1.4178,[7]1.4158,[8]1.4153,[9]1.3743,[10]1.3513,[11]1.3351,[12]1.3367,[13]1.3210,[14]1.3110,[15]1.3074,[16]1.2952,[17]1.2884,[18]1.2874,[19]1.2802,[20]1.2698,[21]1.2664,[22]1.2665,[23]1.2834,[24]1.2765,[25]1.2750,[26]1.2665,[27]1.2609,[28]1.2598,[29]1.2733,[30]1.2745,[31]1.2677,[32]1.2628,[33]1.2637,[34]1.2632,[35]1.2617,[36]1.2836,[37]1.2935,[38]1.2986,[39]1.3053,[40]1.3065,[41]1.3033,[42]1.3167,[43]1.3164,[44]1.3169,
167
+ Final estimate: PPL = 1.3169 +/- 0.00749
168
+
169
+ llama_perf_context_print: load time = 2440.91 ms
170
+ llama_perf_context_print: prompt eval time = 122582.01 ms / 90112 tokens ( 1.36 ms per token, 735.12 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 123977.87 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16028 + (3859 = 3351 + 40 + 467) + 4228 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19987 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.07 GiB (4.52 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 49.148 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.28 seconds per pass - ETA 0.82 minutes
166
+ [1]5.3929,[2]6.4305,[3]6.9142,[4]6.8317,[5]6.7305,[6]5.8036,[7]5.2903,[8]5.3328,[9]5.6317,[10]5.7865,[11]5.8716,[12]6.1944,[13]6.2677,[14]6.4056,[15]6.4867,
167
+ Final estimate: PPL = 6.4867 +/- 0.13379
168
+
169
+ llama_perf_context_print: load time = 2446.56 ms
170
+ llama_perf_context_print: prompt eval time = 45273.88 ms / 30720 tokens ( 1.47 ms per token, 678.54 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45820.18 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16017 + (3859 = 3351 + 40 + 467) + 4239 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19994 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.07 GiB (4.52 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9754.91 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 43.928 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.33 seconds per pass - ETA 0.88 minutes
166
+ [1]4.7319,[2]5.1657,[3]5.4640,[4]5.5884,[5]5.7779,[6]5.7941,[7]5.8003,[8]5.7493,[9]5.7908,[10]5.7746,[11]5.7839,[12]5.7817,[13]5.8662,[14]5.8718,[15]5.8606,[16]5.8717,
167
+ Final estimate: PPL = 5.8717 +/- 0.11000
168
+
169
+ llama_perf_context_print: load time = 2432.02 ms
170
+ llama_perf_context_print: prompt eval time = 48851.56 ms / 32768 tokens ( 1.49 ms per token, 670.77 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49299.22 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16016 + (3859 = 3351 + 40 + 467) + 4239 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9874 = 9754 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.11 GiB",
13
+ "t/s": "149.72 \u00b1 9.10",
14
+ "test": "pp8",
15
+ "tps_value": 149.72
16
+ },
17
+ "test": "pp8",
18
+ "tps": 149.72
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.3168,
24
+ "ppl_error": 0.00749
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.4899,
29
+ "ppl_error": 0.13391
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8703,
34
+ "ppl_error": 0.10999
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.6554,
40
+ "bench_tps": 149.72,
41
+ "file_size_bytes": 17304489984,
42
+ "file_size_gb": 16.12
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | pp8 | 149.72 ± 9.10 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | tg128 | 53.07 ± 0.88 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20030 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 111.732 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.39 seconds per pass - ETA 2.48 minutes
166
+ [1]1.6523,[2]1.5131,[3]1.3180,[4]1.2703,[5]1.3554,[6]1.4181,[7]1.4162,[8]1.4153,[9]1.3745,[10]1.3520,[11]1.3355,[12]1.3371,[13]1.3213,[14]1.3113,[15]1.3079,[16]1.2957,[17]1.2886,[18]1.2876,[19]1.2804,[20]1.2701,[21]1.2667,[22]1.2667,[23]1.2835,[24]1.2765,[25]1.2750,[26]1.2665,[27]1.2608,[28]1.2595,[29]1.2729,[30]1.2744,[31]1.2677,[32]1.2626,[33]1.2636,[34]1.2631,[35]1.2616,[36]1.2837,[37]1.2936,[38]1.2986,[39]1.3053,[40]1.3065,[41]1.3032,[42]1.3165,[43]1.3162,[44]1.3168,
167
+ Final estimate: PPL = 1.3168 +/- 0.00749
168
+
169
+ llama_perf_context_print: load time = 3331.49 ms
170
+ llama_perf_context_print: prompt eval time = 122240.38 ms / 90112 tokens ( 1.36 ms per token, 737.17 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 123445.05 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16013 + (3859 = 3351 + 40 + 467) + 4242 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20033 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 47.638 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.23 seconds per pass - ETA 0.80 minutes
166
+ [1]5.4006,[2]6.4533,[3]6.9294,[4]6.8403,[5]6.7427,[6]5.8140,[7]5.2989,[8]5.3361,[9]5.6330,[10]5.7846,[11]5.8710,[12]6.1947,[13]6.2702,[14]6.4074,[15]6.4899,
167
+ Final estimate: PPL = 6.4899 +/- 0.13391
168
+
169
+ llama_perf_context_print: load time = 2509.43 ms
170
+ llama_perf_context_print: prompt eval time = 44905.08 ms / 30720 tokens ( 1.46 ms per token, 684.11 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45329.65 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16063 + (3859 = 3351 + 40 + 467) + 4193 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19975 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9794.32 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 44.601 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.35 seconds per pass - ETA 0.88 minutes
166
+ [1]4.7230,[2]5.1625,[3]5.4537,[4]5.5804,[5]5.7711,[6]5.7913,[7]5.7991,[8]5.7491,[9]5.7912,[10]5.7743,[11]5.7827,[12]5.7809,[13]5.8656,[14]5.8711,[15]5.8586,[16]5.8703,
167
+ Final estimate: PPL = 5.8703 +/- 0.10999
168
+
169
+ llama_perf_context_print: load time = 2446.50 ms
170
+ llama_perf_context_print: prompt eval time = 49294.99 ms / 32768 tokens ( 1.50 ms per token, 664.73 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49739.97 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16009 + (3859 = 3351 + 40 + 467) + 4246 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9914 = 9794 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
+ {
+ "raw_metrics": {
+ "llamabench": {
+ "backend": "CUDA",
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
+ "ngl": "35",
+ "raw_row": {
+ "backend": "CUDA",
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
+ "ngl": "35",
+ "params": "30.53 B",
+ "size": "16.18 GiB",
+ "t/s": "136.79 \u00b1 3.81",
+ "test": "pp8",
+ "tps_value": 136.79
+ },
+ "test": "pp8",
+ "tps": 136.79
+ },
+ "perplexity": {
+ "code": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
+ "ppl": 1.316,
+ "ppl_error": 0.00747
+ },
+ "general": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
+ "ppl": 6.4962,
+ "ppl_error": 0.13413
+ },
+ "math": {
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
+ "ppl": 5.866,
+ "ppl_error": 0.10985
+ }
+ }
+ },
+ "summary": {
+ "avg_prec_loss_pct": 2.6434,
+ "bench_tps": 136.79,
+ "file_size_bytes": 17379850240,
+ "file_size_gb": 16.19
+ }
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
+ ggml_cuda_init: found 2 CUDA devices:
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
+ | model | size | params | backend | ngl | test | t/s |
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | pp8 | 136.79 ± 3.81 |
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.18 GiB | 30.53 B | CUDA | 35 | tg128 | 53.55 ± 0.95 |
+
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20020 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 114.79 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.45 seconds per pass - ETA 2.52 minutes
166
+ [1]1.6426,[2]1.5078,[3]1.3150,[4]1.2668,[5]1.3514,[6]1.4145,[7]1.4129,[8]1.4130,[9]1.3730,[10]1.3502,[11]1.3340,[12]1.3358,[13]1.3200,[14]1.3102,[15]1.3062,[16]1.2941,[17]1.2873,[18]1.2862,[19]1.2791,[20]1.2689,[21]1.2656,[22]1.2656,[23]1.2825,[24]1.2756,[25]1.2740,[26]1.2656,[27]1.2599,[28]1.2587,[29]1.2720,[30]1.2735,[31]1.2668,[32]1.2618,[33]1.2628,[34]1.2622,[35]1.2607,[36]1.2826,[37]1.2925,[38]1.2976,[39]1.3043,[40]1.3054,[41]1.3022,[42]1.3154,[43]1.3154,[44]1.3160,
167
+ Final estimate: PPL = 1.3160 +/- 0.00747
168
+
169
+ llama_perf_context_print: load time = 2446.33 ms
170
+ llama_perf_context_print: prompt eval time = 121716.77 ms / 90112 tokens ( 1.35 ms per token, 740.34 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 122927.34 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16052 + (3859 = 3351 + 40 + 467) + 4203 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20013 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 50.642 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.23 seconds per pass - ETA 0.80 minutes
166
+ [1]5.4169,[2]6.4513,[3]6.9292,[4]6.8455,[5]6.7441,[6]5.8162,[7]5.3007,[8]5.3377,[9]5.6366,[10]5.7897,[11]5.8763,[12]6.2005,[13]6.2764,[14]6.4136,[15]6.4962,
167
+ Final estimate: PPL = 6.4962 +/- 0.13413
168
+
169
+ llama_perf_context_print: load time = 2436.70 ms
170
+ llama_perf_context_print: prompt eval time = 44879.31 ms / 30720 tokens ( 1.46 ms per token, 684.50 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45304.13 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16059 + (3859 = 3351 + 40 + 467) + 4196 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20021 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-IQ4_NL-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q8_0-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q8_0: 1 tensors
52
+ llama_model_loader: - type iq4_nl: 337 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.18 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9866.19 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 46.252 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.36 seconds per pass - ETA 0.88 minutes
166
+ [1]4.7155,[2]5.1535,[3]5.4445,[4]5.5709,[5]5.7586,[6]5.7786,[7]5.7883,[8]5.7388,[9]5.7821,[10]5.7648,[11]5.7739,[12]5.7729,[13]5.8580,[14]5.8646,[15]5.8540,[16]5.8660,
167
+ Final estimate: PPL = 5.8660 +/- 0.10985
168
+
169
+ llama_perf_context_print: load time = 12329.82 ms
170
+ llama_perf_context_print: prompt eval time = 49263.01 ms / 32768 tokens ( 1.50 ms per token, 665.16 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49704.06 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16050 + (3859 = 3351 + 40 + 467) + 4205 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9986 = 9866 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/llamabench.md",
6
+ "ngl": "30",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "30",
11
+ "params": "30.53 B",
12
+ "size": "56.89 GiB",
13
+ "t/s": "50.77 \u00b1 2.28",
14
+ "test": "pp8",
15
+ "tps_value": 50.77
16
+ },
17
+ "test": "pp8",
18
+ "tps": 50.77
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_code.log",
23
+ "ppl": 1.2981,
24
+ "ppl_error": 0.00721
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_general.log",
28
+ "ppl": 6.2581,
29
+ "ppl_error": 0.12787
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_math.log",
33
+ "ppl": 5.7092,
34
+ "ppl_error": 0.10643
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 0.0,
40
+ "bench_tps": 50.77,
41
+ "file_size_bytes": 61095802880,
42
+ "file_size_gb": 56.9
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 56.89 GiB | 30.53 B | CUDA | 30 | pp8 | 50.77 ± 2.28 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 56.89 GiB | 30.53 B | CUDA | 30 | tg128 | 16.29 ± 0.05 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_code.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19998 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type bf16: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 56.89 GiB (16.01 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 34479.47 MiB
126
+ load_tensors: CUDA0 model buffer size = 11890.17 MiB
127
+ load_tensors: CUDA1 model buffer size = 11890.17 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 894.25 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 109.623 ms
163
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 9.86 seconds per pass - ETA 7.22 minutes
165
+ [1]1.5220,[2]1.4172,[3]1.2617,[4]1.2254,[5]1.3143,[6]1.3791,[7]1.3822,[8]1.3823,[9]1.3445,[10]1.3242,[11]1.3088,[12]1.3113,[13]1.2968,[14]1.2887,[15]1.2836,[16]1.2728,[17]1.2662,[18]1.2645,[19]1.2584,[20]1.2488,[21]1.2466,[22]1.2469,[23]1.2641,[24]1.2576,[25]1.2557,[26]1.2481,[27]1.2429,[28]1.2420,[29]1.2550,[30]1.2563,[31]1.2503,[32]1.2456,[33]1.2465,[34]1.2463,[35]1.2453,[36]1.2661,[37]1.2759,[38]1.2806,[39]1.2870,[40]1.2876,[41]1.2846,[42]1.2976,[43]1.2977,[44]1.2981,
166
+ Final estimate: PPL = 1.2981 +/- 0.00721
167
+
168
+ llama_perf_context_print: load time = 7427.57 ms
169
+ llama_perf_context_print: prompt eval time = 382844.28 ms / 90112 tokens ( 4.25 ms per token, 235.38 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 384456.73 ms / 90113 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 6972 + (12824 = 11890 + 40 + 894) + 4317 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 11454 + (12012 = 11890 + 40 + 82) + 657 |
176
+ llama_memory_breakdown_print: | - Host | 34599 = 34479 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_general.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19995 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type bf16: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 56.89 GiB (16.01 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 34479.47 MiB
126
+ load_tensors: CUDA0 model buffer size = 11890.17 MiB
127
+ load_tensors: CUDA1 model buffer size = 11890.17 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 894.25 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 48.239 ms
163
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 9.79 seconds per pass - ETA 2.43 minutes
165
+ [1]5.2211,[2]6.2740,[3]6.6780,[4]6.6452,[5]6.5500,[6]5.6565,[7]5.1561,[8]5.1786,[9]5.4630,[10]5.6137,[11]5.6793,[12]5.9848,[13]6.0595,[14]6.1889,[15]6.2581,
166
+ Final estimate: PPL = 6.2581 +/- 0.12787
167
+
168
+ llama_perf_context_print: load time = 7637.09 ms
169
+ llama_perf_context_print: prompt eval time = 141916.79 ms / 30720 tokens ( 4.62 ms per token, 216.46 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 142341.06 ms / 30721 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 6970 + (12824 = 11890 + 40 + 894) + 4320 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 11454 + (12012 = 11890 + 40 + 82) + 657 |
176
+ llama_memory_breakdown_print: | - Host | 34599 = 34479 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16/perplexity_math.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20033 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_BF16-attn_output_BF16-attn_q_BF16-embeddings_BF16-ffn_down_BF16-ffn_up_gate_BF16-lm_head_BF16.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type bf16: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 56.89 GiB (16.01 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 34479.47 MiB
126
+ load_tensors: CUDA0 model buffer size = 11890.17 MiB
127
+ load_tensors: CUDA1 model buffer size = 11890.17 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 894.25 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 45.968 ms
163
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 10.02 seconds per pass - ETA 2.67 minutes
165
+ [1]4.6596,[2]5.0312,[3]5.3327,[4]5.4646,[5]5.6536,[6]5.6505,[7]5.6284,[8]5.5859,[9]5.6357,[10]5.6152,[11]5.6296,[12]5.6274,[13]5.6995,[14]5.7048,[15]5.6972,[16]5.7092,
166
+ Final estimate: PPL = 5.7092 +/- 0.10643
167
+
168
+ llama_perf_context_print: load time = 8705.98 ms
169
+ llama_perf_context_print: prompt eval time = 154163.15 ms / 32768 tokens ( 4.70 ms per token, 212.55 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 154613.26 ms / 32769 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 7009 + (12824 = 11890 + 40 + 894) + 4281 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 11454 + (12012 = 11890 + 40 + 82) + 657 |
176
+ llama_memory_breakdown_print: | - Host | 34599 = 34479 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.04 GiB",
13
+ "t/s": "149.76 \u00b1 10.70",
14
+ "test": "pp8",
15
+ "tps_value": 149.76
16
+ },
17
+ "test": "pp8",
18
+ "tps": 149.76
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log",
23
+ "ppl": 1.317,
24
+ "ppl_error": 0.00748
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log",
28
+ "ppl": 6.4836,
29
+ "ppl_error": 0.13372
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log",
33
+ "ppl": 5.8712,
34
+ "ppl_error": 0.10993
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 2.6323,
40
+ "bench_tps": 149.76,
41
+ "file_size_bytes": 17224267776,
42
+ "file_size_gb": 16.04
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.04 GiB | 30.53 B | CUDA | 35 | pp8 | 149.76 ± 10.70 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.04 GiB | 30.53 B | CUDA | 35 | tg128 | 52.72 ± 0.50 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_code.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19990 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type iq4_nl: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 16.04 GiB (4.51 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 9717.82 MiB
126
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
127
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 111.405 ms
163
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 3.31 seconds per pass - ETA 2.42 minutes
165
+ [1]1.6579,[2]1.5180,[3]1.3209,[4]1.2720,[5]1.3570,[6]1.4195,[7]1.4171,[8]1.4164,[9]1.3755,[10]1.3526,[11]1.3359,[12]1.3375,[13]1.3215,[14]1.3115,[15]1.3078,[16]1.2957,[17]1.2891,[18]1.2880,[19]1.2809,[20]1.2706,[21]1.2672,[22]1.2672,[23]1.2841,[24]1.2771,[25]1.2756,[26]1.2670,[27]1.2614,[28]1.2601,[29]1.2735,[30]1.2748,[31]1.2681,[32]1.2631,[33]1.2640,[34]1.2636,[35]1.2619,[36]1.2839,[37]1.2938,[38]1.2989,[39]1.3057,[40]1.3068,[41]1.3035,[42]1.3168,[43]1.3165,[44]1.3170,
166
+ Final estimate: PPL = 1.3170 +/- 0.00748
167
+
168
+ llama_perf_context_print: load time = 18846.62 ms
169
+ llama_perf_context_print: prompt eval time = 121891.20 ms / 90112 tokens ( 1.35 ms per token, 739.28 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 123116.76 ms / 90113 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16023 + (3859 = 3351 + 40 + 467) + 4232 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
176
+ llama_memory_breakdown_print: | - Host | 9837 = 9717 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_general.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19984 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type iq4_nl: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 16.04 GiB (4.51 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 9717.82 MiB
126
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
127
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 49.295 ms
163
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 3.27 seconds per pass - ETA 0.80 minutes
165
+ [1]5.4075,[2]6.4591,[3]6.9472,[4]6.8525,[5]6.7446,[6]5.8142,[7]5.2960,[8]5.3340,[9]5.6299,[10]5.7805,[11]5.8647,[12]6.1884,[13]6.2628,[14]6.4011,[15]6.4836,
166
+ Final estimate: PPL = 6.4836 +/- 0.13372
167
+
168
+ llama_perf_context_print: load time = 2441.68 ms
169
+ llama_perf_context_print: prompt eval time = 45325.67 ms / 30720 tokens ( 1.48 ms per token, 677.76 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 45752.51 ms / 30721 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16025 + (3859 = 3351 + 40 + 467) + 4230 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
176
+ llama_memory_breakdown_print: | - Host | 9837 = 9717 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL/perplexity_math.log ADDED
@@ -0,0 +1,176 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 19988 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_IQ4_NL-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_IQ4_NL.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type iq4_nl: 338 tensors
52
+ print_info: file format = GGUF V3 (latest)
53
+ print_info: file type = IQ4_NL - 4.5 bpw
54
+ print_info: file size = 16.04 GiB (4.51 BPW)
55
+ load: printing all EOG tokens:
56
+ load: - 151643 ('<|endoftext|>')
57
+ load: - 151645 ('<|im_end|>')
58
+ load: - 151662 ('<|fim_pad|>')
59
+ load: - 151663 ('<|repo_name|>')
60
+ load: - 151664 ('<|file_sep|>')
61
+ load: special tokens cache size = 26
62
+ load: token to piece cache size = 0.9311 MB
63
+ print_info: arch = qwen3moe
64
+ print_info: vocab_only = 0
65
+ print_info: n_ctx_train = 262144
66
+ print_info: n_embd = 2048
67
+ print_info: n_embd_inp = 2048
68
+ print_info: n_layer = 48
69
+ print_info: n_head = 32
70
+ print_info: n_head_kv = 4
71
+ print_info: n_rot = 128
72
+ print_info: n_swa = 0
73
+ print_info: is_swa_any = 0
74
+ print_info: n_embd_head_k = 128
75
+ print_info: n_embd_head_v = 128
76
+ print_info: n_gqa = 8
77
+ print_info: n_embd_k_gqa = 512
78
+ print_info: n_embd_v_gqa = 512
79
+ print_info: f_norm_eps = 0.0e+00
80
+ print_info: f_norm_rms_eps = 1.0e-06
81
+ print_info: f_clamp_kqv = 0.0e+00
82
+ print_info: f_max_alibi_bias = 0.0e+00
83
+ print_info: f_logit_scale = 0.0e+00
84
+ print_info: f_attn_scale = 0.0e+00
85
+ print_info: n_ff = 6144
86
+ print_info: n_expert = 128
87
+ print_info: n_expert_used = 8
88
+ print_info: n_expert_groups = 0
89
+ print_info: n_group_used = 0
90
+ print_info: causal attn = 1
91
+ print_info: pooling type = 0
92
+ print_info: rope type = 2
93
+ print_info: rope scaling = linear
94
+ print_info: freq_base_train = 10000000.0
95
+ print_info: freq_scale_train = 1
96
+ print_info: n_ctx_orig_yarn = 262144
97
+ print_info: rope_finetuned = unknown
98
+ print_info: model type = 30B.A3B
99
+ print_info: model params = 30.53 B
100
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
101
+ print_info: n_ff_exp = 768
102
+ print_info: vocab type = BPE
103
+ print_info: n_vocab = 151936
104
+ print_info: n_merges = 151387
105
+ print_info: BOS token = 11 ','
106
+ print_info: EOS token = 151645 '<|im_end|>'
107
+ print_info: EOT token = 151645 '<|im_end|>'
108
+ print_info: PAD token = 151654 '<|vision_pad|>'
109
+ print_info: LF token = 198 'Ċ'
110
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
111
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
112
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
113
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
114
+ print_info: FIM REP token = 151663 '<|repo_name|>'
115
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
116
+ print_info: EOG token = 151643 '<|endoftext|>'
117
+ print_info: EOG token = 151645 '<|im_end|>'
118
+ print_info: EOG token = 151662 '<|fim_pad|>'
119
+ print_info: EOG token = 151663 '<|repo_name|>'
120
+ print_info: EOG token = 151664 '<|file_sep|>'
121
+ print_info: max token length = 256
122
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
123
+ load_tensors: offloading 20 repeating layers to GPU
124
+ load_tensors: offloaded 20/49 layers to GPU
125
+ load_tensors: CPU_Mapped model buffer size = 9717.82 MiB
126
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
127
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
128
+ ....................................................................................................
129
+ llama_context: constructing llama_context
130
+ llama_context: n_seq_max = 1
131
+ llama_context: n_ctx = 2048
132
+ llama_context: n_ctx_seq = 2048
133
+ llama_context: n_batch = 2048
134
+ llama_context: n_ubatch = 512
135
+ llama_context: causal_attn = 1
136
+ llama_context: flash_attn = auto
137
+ llama_context: kv_unified = false
138
+ llama_context: freq_base = 10000000.0
139
+ llama_context: freq_scale = 1
140
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
141
+ llama_context: CPU output buffer size = 0.58 MiB
142
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
143
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
144
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
146
+ llama_context: Flash Attention was auto, set to enabled
147
+ llama_context: CUDA0 compute buffer size = 467.67 MiB
148
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
149
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
150
+ llama_context: graph nodes = 3031
151
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
152
+ common_init_from_params: added <|endoftext|> logit bias = -inf
153
+ common_init_from_params: added <|im_end|> logit bias = -inf
154
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
155
+ common_init_from_params: added <|repo_name|> logit bias = -inf
156
+ common_init_from_params: added <|file_sep|> logit bias = -inf
157
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
158
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
159
+
160
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
161
+ perplexity: tokenizing the input ..
162
+ perplexity: tokenization took 46.365 ms
163
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
164
+ perplexity: 3.33 seconds per pass - ETA 0.88 minutes
165
+ [1]4.7198,[2]5.1598,[3]5.4569,[4]5.5862,[5]5.7761,[6]5.7932,[7]5.8007,[8]5.7482,[9]5.7891,[10]5.7722,[11]5.7809,[12]5.7795,[13]5.8647,[14]5.8702,[15]5.8592,[16]5.8712,
166
+ Final estimate: PPL = 5.8712 +/- 0.10993
167
+
168
+ llama_perf_context_print: load time = 5771.72 ms
169
+ llama_perf_context_print: prompt eval time = 48900.16 ms / 32768 tokens ( 1.49 ms per token, 670.10 tokens per second)
170
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
171
+ llama_perf_context_print: total time = 49448.91 ms / 32769 tokens
172
+ llama_perf_context_print: graphs reused = 0
173
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
174
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16058 + (3859 = 3351 + 40 + 467) + 4197 |
175
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
176
+ llama_memory_breakdown_print: | - Host | 9837 = 9717 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.11 GiB",
13
+ "t/s": "147.04 \u00b1 7.66",
14
+ "test": "pp8",
15
+ "tps_value": 147.04
16
+ },
17
+ "test": "pp8",
18
+ "tps": 147.04
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log",
23
+ "ppl": 1.3146,
24
+ "ppl_error": 0.00745
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log",
28
+ "ppl": 6.3693,
29
+ "ppl_error": 0.13041
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log",
33
+ "ppl": 5.744,
34
+ "ppl_error": 0.10641
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 1.2192,
40
+ "bench_tps": 147.04,
41
+ "file_size_bytes": 17302059008,
42
+ "file_size_gb": 16.11
43
+ }
44
+ }
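Note on the `summary` block above: `avg_prec_loss_pct` appears to be the mean relative perplexity increase of this quant against a higher-precision reference, averaged over the code, general and math runs; the reference PPLs themselves are not stored in this file. A minimal sketch of that calculation, using hypothetical baseline values purely for illustration:

```python
# Sketch only: how avg_prec_loss_pct could be derived from the per-domain PPLs in bench_metrics.json.
# The baseline numbers below are hypothetical placeholders, not measured values from this repo.
quant_ppl = {"code": 1.3146, "general": 6.3693, "math": 5.744}   # raw_metrics.perplexity above
baseline_ppl = {"code": 1.30, "general": 6.30, "math": 5.68}     # assumed full-precision reference

def avg_prec_loss_pct(quant: dict, base: dict) -> float:
    """Mean percentage increase in perplexity relative to the baseline, across domains."""
    losses = [(quant[k] - base[k]) / base[k] * 100.0 for k in quant]
    return round(sum(losses) / len(losses), 4)

print(avg_prec_loss_pct(quant_ppl, baseline_ppl))
```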
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | pp8 | 147.04 ± 7.66 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.11 GiB | 30.53 B | CUDA | 35 | tg128 | 50.99 ± 0.20 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20032 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 2 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9792.00 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 113.731 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.29 seconds per pass - ETA 2.40 minutes
166
+ [1]1.6594,[2]1.5100,[3]1.3163,[4]1.2690,[5]1.3534,[6]1.4169,[7]1.4158,[8]1.4136,[9]1.3725,[10]1.3490,[11]1.3332,[12]1.3352,[13]1.3196,[14]1.3096,[15]1.3059,[16]1.2938,[17]1.2871,[18]1.2861,[19]1.2787,[20]1.2682,[21]1.2648,[22]1.2648,[23]1.2818,[24]1.2750,[25]1.2736,[26]1.2652,[27]1.2596,[28]1.2584,[29]1.2716,[30]1.2729,[31]1.2663,[32]1.2613,[33]1.2620,[34]1.2614,[35]1.2598,[36]1.2813,[37]1.2912,[38]1.2963,[39]1.3028,[40]1.3038,[41]1.3008,[42]1.3141,[43]1.3140,[44]1.3146,
167
+ Final estimate: PPL = 1.3146 +/- 0.00745
168
+
169
+ llama_perf_context_print: load time = 6244.55 ms
170
+ llama_perf_context_print: prompt eval time = 122397.64 ms / 90112 tokens ( 1.36 ms per token, 736.22 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 123764.79 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16024 + (3896 = 3351 + 40 + 504) + 4195 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9912 = 9792 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20029 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 2 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9792.00 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 46.885 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.27 seconds per pass - ETA 0.82 minutes
166
+ [1]5.3423,[2]6.3850,[3]6.8131,[4]6.7616,[5]6.6560,[6]5.7421,[7]5.2266,[8]5.2623,[9]5.5530,[10]5.7034,[11]5.7767,[12]6.0915,[13]6.1660,[14]6.2968,[15]6.3693,
167
+ Final estimate: PPL = 6.3693 +/- 0.13041
168
+
169
+ llama_perf_context_print: load time = 2474.12 ms
170
+ llama_perf_context_print: prompt eval time = 45370.43 ms / 30720 tokens ( 1.48 ms per token, 677.09 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45789.79 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16024 + (3896 = 3351 + 40 + 504) + 4194 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9912 = 9792 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K/perplexity_math.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20033 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q5_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q5_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q5_K: 2 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.11 GiB (4.53 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9792.00 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 504.77 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 46.221 ms
164
+ perplexity: calculating perplexity over 16 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.38 seconds per pass - ETA 0.90 minutes
166
+ [1]4.6880,[2]5.0684,[3]5.3607,[4]5.4838,[5]5.6770,[6]5.6853,[7]5.6675,[8]5.6233,[9]5.6632,[10]5.6445,[11]5.6627,[12]5.6554,[13]5.7393,[14]5.7476,[15]5.7348,[16]5.7440,
167
+ Final estimate: PPL = 5.7440 +/- 0.10641
168
+
169
+ llama_perf_context_print: load time = 3935.39 ms
170
+ llama_perf_context_print: prompt eval time = 49215.66 ms / 32768 tokens ( 1.50 ms per token, 665.80 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 49746.10 ms / 32769 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 16030 + (3896 = 3351 + 40 + 504) + 4188 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9912 = 9792 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/bench_metrics.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "raw_metrics": {
3
+ "llamabench": {
4
+ "backend": "CUDA",
5
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md",
6
+ "ngl": "35",
7
+ "raw_row": {
8
+ "backend": "CUDA",
9
+ "model": "qwen3moe 30B.A3B IQ4_NL - 4.5 bpw",
10
+ "ngl": "35",
11
+ "params": "30.53 B",
12
+ "size": "16.19 GiB",
13
+ "t/s": "141.45 \u00b1 4.77",
14
+ "test": "pp8",
15
+ "tps_value": 141.45
16
+ },
17
+ "test": "pp8",
18
+ "tps": 141.45
19
+ },
20
+ "perplexity": {
21
+ "code": {
22
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log",
23
+ "ppl": 1.3139,
24
+ "ppl_error": 0.00744
25
+ },
26
+ "general": {
27
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log",
28
+ "ppl": 6.3259,
29
+ "ppl_error": 0.12917
30
+ },
31
+ "math": {
32
+ "log_path": "Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_math.log",
33
+ "ppl": 5.7252,
34
+ "ppl_error": 0.1058
35
+ }
36
+ }
37
+ },
38
+ "summary": {
39
+ "avg_prec_loss_pct": 0.8603,
40
+ "bench_tps": 141.45,
41
+ "file_size_bytes": 17384712192,
42
+ "file_size_gb": 16.19
43
+ }
44
+ }
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/llamabench.md ADDED
@@ -0,0 +1,11 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ | model | size | params | backend | ngl | test | t/s |
7
+ | ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: |
8
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.19 GiB | 30.53 B | CUDA | 35 | pp8 | 141.45 ± 4.77 |
9
+ | qwen3moe 30B.A3B IQ4_NL - 4.5 bpw | 16.19 GiB | 30.53 B | CUDA | 35 | tg128 | 49.00 ± 0.61 |
10
+
11
+ build: 92bb442ad (7040)
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_code.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20036 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 2 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.19 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9870.83 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 111.997 ms
164
+ perplexity: calculating perplexity over 44 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.30 seconds per pass - ETA 2.42 minutes
166
+ [1]1.6475,[2]1.5069,[3]1.3144,[4]1.2683,[5]1.3524,[6]1.4161,[7]1.4142,[8]1.4114,[9]1.3709,[10]1.3481,[11]1.3319,[12]1.3339,[13]1.3185,[14]1.3086,[15]1.3052,[16]1.2930,[17]1.2861,[18]1.2851,[19]1.2779,[20]1.2675,[21]1.2642,[22]1.2643,[23]1.2809,[24]1.2741,[25]1.2728,[26]1.2643,[27]1.2586,[28]1.2574,[29]1.2704,[30]1.2720,[31]1.2654,[32]1.2604,[33]1.2612,[34]1.2606,[35]1.2591,[36]1.2808,[37]1.2907,[38]1.2957,[39]1.3022,[40]1.3033,[41]1.3001,[42]1.3134,[43]1.3133,[44]1.3139,
167
+ Final estimate: PPL = 1.3139 +/- 0.00744
168
+
169
+ llama_perf_context_print: load time = 2995.21 ms
170
+ llama_perf_context_print: prompt eval time = 122323.71 ms / 90112 tokens ( 1.36 ms per token, 736.67 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 123529.78 ms / 90113 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15995 + (3935 = 3351 + 40 + 544) + 4184 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9990 = 9870 + 112 + 8 |
Benchmarks/DataCollection/Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K/perplexity_general.log ADDED
@@ -0,0 +1,177 @@
1
+ ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
2
+ ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
3
+ ggml_cuda_init: found 2 CUDA devices:
4
+ Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
5
+ Device 1: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes
6
+ build: 7040 (92bb442ad) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu
7
+ llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 3090) (0000:01:00.0) - 20033 MiB free
8
+ llama_model_load_from_file_impl: using device CUDA1 (NVIDIA GeForce RTX 3090) (0000:03:00.0) - 23582 MiB free
9
+ llama_model_loader: loaded meta data with 39 key-value pairs and 579 tensors from /mnt/world8/AI/Models/Qwen3-30B-A3B-Instruct-2507-unsloth/Magic_Quant/GGUF/dc_round0_Qwen3-30B-A3B-Instruct-2507-unsloth-iq4_nl-attn_kv_IQ4_NL-attn_output_IQ4_NL-attn_q_IQ4_NL-embeddings_Q6_K-ffn_down_IQ4_NL-ffn_up_gate_IQ4_NL-lm_head_Q6_K.gguf (version GGUF V3 (latest))
10
+ llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
11
+ llama_model_loader: - kv 0: general.architecture str = qwen3moe
12
+ llama_model_loader: - kv 1: general.type str = model
13
+ llama_model_loader: - kv 2: general.name str = Qwen3 30B A3B Instruct 2507 Unsloth
14
+ llama_model_loader: - kv 3: general.version str = 2507
15
+ llama_model_loader: - kv 4: general.finetune str = Instruct-unsloth
16
+ llama_model_loader: - kv 5: general.basename str = Qwen3
17
+ llama_model_loader: - kv 6: general.size_label str = 30B-A3B
18
+ llama_model_loader: - kv 7: general.license str = apache-2.0
19
+ llama_model_loader: - kv 8: general.license.link str = https://huggingface.co/Qwen/Qwen3-30B...
20
+ llama_model_loader: - kv 9: general.base_model.count u32 = 1
21
+ llama_model_loader: - kv 10: general.base_model.0.name str = Qwen3 30B A3B Instruct 2507
22
+ llama_model_loader: - kv 11: general.base_model.0.version str = 2507
23
+ llama_model_loader: - kv 12: general.base_model.0.organization str = Qwen
24
+ llama_model_loader: - kv 13: general.base_model.0.repo_url str = https://huggingface.co/Qwen/Qwen3-30B...
25
+ llama_model_loader: - kv 14: general.tags arr[str,2] = ["unsloth", "text-generation"]
26
+ llama_model_loader: - kv 15: qwen3moe.block_count u32 = 48
27
+ llama_model_loader: - kv 16: qwen3moe.context_length u32 = 262144
28
+ llama_model_loader: - kv 17: qwen3moe.embedding_length u32 = 2048
29
+ llama_model_loader: - kv 18: qwen3moe.feed_forward_length u32 = 6144
30
+ llama_model_loader: - kv 19: qwen3moe.attention.head_count u32 = 32
31
+ llama_model_loader: - kv 20: qwen3moe.attention.head_count_kv u32 = 4
32
+ llama_model_loader: - kv 21: qwen3moe.rope.freq_base f32 = 10000000.000000
33
+ llama_model_loader: - kv 22: qwen3moe.attention.layer_norm_rms_epsilon f32 = 0.000001
34
+ llama_model_loader: - kv 23: qwen3moe.expert_used_count u32 = 8
35
+ llama_model_loader: - kv 24: qwen3moe.attention.key_length u32 = 128
36
+ llama_model_loader: - kv 25: qwen3moe.attention.value_length u32 = 128
37
+ llama_model_loader: - kv 26: qwen3moe.expert_count u32 = 128
38
+ llama_model_loader: - kv 27: qwen3moe.expert_feed_forward_length u32 = 768
39
+ llama_model_loader: - kv 28: tokenizer.ggml.model str = gpt2
40
+ llama_model_loader: - kv 29: tokenizer.ggml.pre str = qwen2
41
+ llama_model_loader: - kv 30: tokenizer.ggml.tokens arr[str,151936] = ["!", "\"", "#", "$", "%", "&", "'", ...
42
+ llama_model_loader: - kv 31: tokenizer.ggml.token_type arr[i32,151936] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
43
+ llama_model_loader: - kv 32: tokenizer.ggml.merges arr[str,151387] = ["Ġ Ġ", "ĠĠ ĠĠ", "i n", "Ġ t",...
44
+ llama_model_loader: - kv 33: tokenizer.ggml.eos_token_id u32 = 151645
45
+ llama_model_loader: - kv 34: tokenizer.ggml.padding_token_id u32 = 151654
46
+ llama_model_loader: - kv 35: tokenizer.ggml.add_bos_token bool = false
47
+ llama_model_loader: - kv 36: tokenizer.chat_template str = {%- if tools %}\n {{- '<|im_start|>...
48
+ llama_model_loader: - kv 37: general.quantization_version u32 = 2
49
+ llama_model_loader: - kv 38: general.file_type u32 = 25
50
+ llama_model_loader: - type f32: 241 tensors
51
+ llama_model_loader: - type q6_K: 2 tensors
52
+ llama_model_loader: - type iq4_nl: 336 tensors
53
+ print_info: file format = GGUF V3 (latest)
54
+ print_info: file type = IQ4_NL - 4.5 bpw
55
+ print_info: file size = 16.19 GiB (4.55 BPW)
56
+ load: printing all EOG tokens:
57
+ load: - 151643 ('<|endoftext|>')
58
+ load: - 151645 ('<|im_end|>')
59
+ load: - 151662 ('<|fim_pad|>')
60
+ load: - 151663 ('<|repo_name|>')
61
+ load: - 151664 ('<|file_sep|>')
62
+ load: special tokens cache size = 26
63
+ load: token to piece cache size = 0.9311 MB
64
+ print_info: arch = qwen3moe
65
+ print_info: vocab_only = 0
66
+ print_info: n_ctx_train = 262144
67
+ print_info: n_embd = 2048
68
+ print_info: n_embd_inp = 2048
69
+ print_info: n_layer = 48
70
+ print_info: n_head = 32
71
+ print_info: n_head_kv = 4
72
+ print_info: n_rot = 128
73
+ print_info: n_swa = 0
74
+ print_info: is_swa_any = 0
75
+ print_info: n_embd_head_k = 128
76
+ print_info: n_embd_head_v = 128
77
+ print_info: n_gqa = 8
78
+ print_info: n_embd_k_gqa = 512
79
+ print_info: n_embd_v_gqa = 512
80
+ print_info: f_norm_eps = 0.0e+00
81
+ print_info: f_norm_rms_eps = 1.0e-06
82
+ print_info: f_clamp_kqv = 0.0e+00
83
+ print_info: f_max_alibi_bias = 0.0e+00
84
+ print_info: f_logit_scale = 0.0e+00
85
+ print_info: f_attn_scale = 0.0e+00
86
+ print_info: n_ff = 6144
87
+ print_info: n_expert = 128
88
+ print_info: n_expert_used = 8
89
+ print_info: n_expert_groups = 0
90
+ print_info: n_group_used = 0
91
+ print_info: causal attn = 1
92
+ print_info: pooling type = 0
93
+ print_info: rope type = 2
94
+ print_info: rope scaling = linear
95
+ print_info: freq_base_train = 10000000.0
96
+ print_info: freq_scale_train = 1
97
+ print_info: n_ctx_orig_yarn = 262144
98
+ print_info: rope_finetuned = unknown
99
+ print_info: model type = 30B.A3B
100
+ print_info: model params = 30.53 B
101
+ print_info: general.name = Qwen3 30B A3B Instruct 2507 Unsloth
102
+ print_info: n_ff_exp = 768
103
+ print_info: vocab type = BPE
104
+ print_info: n_vocab = 151936
105
+ print_info: n_merges = 151387
106
+ print_info: BOS token = 11 ','
107
+ print_info: EOS token = 151645 '<|im_end|>'
108
+ print_info: EOT token = 151645 '<|im_end|>'
109
+ print_info: PAD token = 151654 '<|vision_pad|>'
110
+ print_info: LF token = 198 'Ċ'
111
+ print_info: FIM PRE token = 151659 '<|fim_prefix|>'
112
+ print_info: FIM SUF token = 151661 '<|fim_suffix|>'
113
+ print_info: FIM MID token = 151660 '<|fim_middle|>'
114
+ print_info: FIM PAD token = 151662 '<|fim_pad|>'
115
+ print_info: FIM REP token = 151663 '<|repo_name|>'
116
+ print_info: FIM SEP token = 151664 '<|file_sep|>'
117
+ print_info: EOG token = 151643 '<|endoftext|>'
118
+ print_info: EOG token = 151645 '<|im_end|>'
119
+ print_info: EOG token = 151662 '<|fim_pad|>'
120
+ print_info: EOG token = 151663 '<|repo_name|>'
121
+ print_info: EOG token = 151664 '<|file_sep|>'
122
+ print_info: max token length = 256
123
+ load_tensors: loading model tensors, this can take a while... (mmap = true)
124
+ load_tensors: offloading 20 repeating layers to GPU
125
+ load_tensors: offloaded 20/49 layers to GPU
126
+ load_tensors: CPU_Mapped model buffer size = 9870.83 MiB
127
+ load_tensors: CUDA0 model buffer size = 3351.42 MiB
128
+ load_tensors: CUDA1 model buffer size = 3351.42 MiB
129
+ ....................................................................................................
130
+ llama_context: constructing llama_context
131
+ llama_context: n_seq_max = 1
132
+ llama_context: n_ctx = 2048
133
+ llama_context: n_ctx_seq = 2048
134
+ llama_context: n_batch = 2048
135
+ llama_context: n_ubatch = 512
136
+ llama_context: causal_attn = 1
137
+ llama_context: flash_attn = auto
138
+ llama_context: kv_unified = false
139
+ llama_context: freq_base = 10000000.0
140
+ llama_context: freq_scale = 1
141
+ llama_context: n_ctx_seq (2048) < n_ctx_train (262144) -- the full capacity of the model will not be utilized
142
+ llama_context: CPU output buffer size = 0.58 MiB
143
+ llama_kv_cache: CPU KV buffer size = 112.00 MiB
144
+ llama_kv_cache: CUDA0 KV buffer size = 40.00 MiB
145
+ llama_kv_cache: CUDA1 KV buffer size = 40.00 MiB
146
+ llama_kv_cache: size = 192.00 MiB ( 2048 cells, 48 layers, 1/1 seqs), K (f16): 96.00 MiB, V (f16): 96.00 MiB
147
+ llama_context: Flash Attention was auto, set to enabled
148
+ llama_context: CUDA0 compute buffer size = 544.18 MiB
149
+ llama_context: CUDA1 compute buffer size = 82.01 MiB
150
+ llama_context: CUDA_Host compute buffer size = 8.01 MiB
151
+ llama_context: graph nodes = 3031
152
+ llama_context: graph splits = 397 (with bs=512), 88 (with bs=1)
153
+ common_init_from_params: added <|endoftext|> logit bias = -inf
154
+ common_init_from_params: added <|im_end|> logit bias = -inf
155
+ common_init_from_params: added <|fim_pad|> logit bias = -inf
156
+ common_init_from_params: added <|repo_name|> logit bias = -inf
157
+ common_init_from_params: added <|file_sep|> logit bias = -inf
158
+ common_init_from_params: setting dry_penalty_last_n to ctx_size = 2048
159
+ common_init_from_params: warming up the model with an empty run - please wait ... (--no-warmup to disable)
160
+
161
+ system_info: n_threads = 16 (n_threads_batch = 16) / 32 | CUDA : ARCHS = 860 | USE_GRAPHS = 1 | PEER_MAX_BATCH_SIZE = 128 | CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | BMI2 = 1 | AVX512 = 1 | AVX512_VBMI = 1 | AVX512_VNNI = 1 | AVX512_BF16 = 1 | LLAMAFILE = 1 | OPENMP = 1 | REPACK = 1 |
162
+ perplexity: tokenizing the input ..
163
+ perplexity: tokenization took 46.053 ms
164
+ perplexity: calculating perplexity over 15 chunks, n_ctx=2048, batch_size=2048, n_seq=1
165
+ perplexity: 3.26 seconds per pass - ETA 0.80 minutes
166
+ [1]5.3047,[2]6.3712,[3]6.7826,[4]6.7231,[5]6.6149,[6]5.7094,[7]5.1974,[8]5.2258,[9]5.5114,[10]5.6566,[11]5.7313,[12]6.0462,[13]6.1231,[14]6.2544,[15]6.3259,
167
+ Final estimate: PPL = 6.3259 +/- 0.12917
168
+
169
+ llama_perf_context_print: load time = 2446.78 ms
170
+ llama_perf_context_print: prompt eval time = 45163.19 ms / 30720 tokens ( 1.47 ms per token, 680.20 tokens per second)
171
+ llama_perf_context_print: eval time = 0.00 ms / 1 runs ( 0.00 ms per token, inf tokens per second)
172
+ llama_perf_context_print: total time = 45582.25 ms / 30721 tokens
173
+ llama_perf_context_print: graphs reused = 0
174
+ llama_memory_breakdown_print: | memory breakdown [MiB] | total free self model context compute unaccounted |
175
+ llama_memory_breakdown_print: | - CUDA0 (RTX 3090) | 24115 = 15992 + (3935 = 3351 + 40 + 544) + 4187 |
176
+ llama_memory_breakdown_print: | - CUDA1 (RTX 3090) | 24124 = 20000 + (3473 = 3351 + 40 + 82) + 650 |
177
+ llama_memory_breakdown_print: | - Host | 9990 = 9870 + 112 + 8 |