UnstableLlama committed on 26 days ago

Commit

d7000b1

verified ·

1 Parent(s): 1707f48

Upload 17 files

Browse files

Files changed (18) hide show

.gitattributes +2 -0
README.md +546 -187
args.json +467 -0
chat_template.jinja +103 -0
config.json +55 -0
generation_config.json +10 -0
model-00001-of-00007.safetensors +3 -0
model-00002-of-00007.safetensors +3 -0
model-00003-of-00007.safetensors +3 -0
model-00004-of-00007.safetensors +3 -0
model-00005-of-00007.safetensors +3 -0
model-00006-of-00007.safetensors +3 -0
model-00007-of-00007.safetensors +3 -0
model.safetensors.index.json +0 -0
quantization_config.json +3 -0
special_tokens_map.json +40 -0
tokenizer.json +3 -0
tokenizer_config.json +325 -0

.gitattributes CHANGED Viewed

@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 repoGraph.png filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 repoGraph.png filter=lfs diff=lfs merge=lfs -text
+quantization_config.json filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,212 +1,571 @@
 ---
-base_model: zerofata/GLM-4.5-Iceblink-v2-106B-A12B
-base_model_relation: quantized
-quantized_by: UnstableLlama
 license: mit
 datasets:
 - zerofata/Instruct-Anime
 - zerofata/Roleplay-Anime-Characters
 - zerofata/Instruct-Anime-CreativeWriting
 - zerofata/Summaries-Anime-FandomPages
-tags:
-- exl3
 ---
 <style>
-  @import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;700&display=swap');
-  .test-container {
-    font-family: 'JetBrains Mono', 'Fira Code', monospace;
-    background-color: #0d0d0d;
-    color: #00ff9f; /* Neon Green text base */
-    padding: 25px;
-    border: 1px solid #333;
-    border-radius: 4px;
-  }
-  .test-header {
-    border-bottom: 2px solid #ff00ff; /* Magenta accent */
-    margin-bottom: 30px;
-    padding-bottom: 15px;
-  }
-  .test-header h1 {
-    color: #ff00ff;
-    font-size: 1.8em;
-    text-transform: uppercase;
-    letter-spacing: 1px;
-    margin: 0 0 10px 0;
-    text-shadow: 3px 3px 0px rgba(0, 255, 159, 0.2);
-  }
-  .test-meta {
-    display: flex;
-    gap: 15px;
-    font-size: 0.9em;
-    opacity: 0.8;
-  }
-  .test-card {
-    border: 1px solid #00ff9f;
-    margin-bottom: 25px;
-    background-color: #111;
-    box-shadow: 5px 5px 0px #ff00ff; /* Hard magenta shadow for depth */
-  }
-  .test-card h2 {
-    background-color: #00ff9f;
-    color: #0d0d0d;
-    margin: 0;
-    padding: 8px 15px;
-    text-transform: uppercase;
-    font-size: 1.1em;
-    font-weight: 800;
-    letter-spacing: 1px;
-  }
-  .test-content {
-    padding: 20px;
-  }
-  /* Glitch Table Styling */
-  .glitch-table-wrapper {
-    width: fit-content;                /* Shrinks box to fit content */
-    min-width: 400px;                  /* keeps it from getting too small */
-    border: 1px solid rgba(0, 255, 159, 0.4); /* slightly brighter border base */
-    border-radius: 6px;
-    padding: 15px;
-    margin-top: 20px;
-}
-  .glitch-table th {
-    border-bottom: 2px solid #ff00ff;
-    color: #ff00ff;
-    padding: 12px;
-    font-weight: 700;
-    text-transform: uppercase;
-    font-size: 0.9em;
-  }
-  .glitch-table td {
-    border-bottom: 1px dashed #333;
-    padding: 12px;
-    transition: background 0.2s;
-  }
-  .glitch-table tr:hover td {
-    background-color: #1a1a1a;
-    color: #fff;
-  }
-  /* Link Styling */
-  a.test-link {
-    color: #00ff9f;
-    text-decoration: none;
-    font-weight: 700;
-    border-bottom: 1px dotted #00ff9f;
-    transition: all 0.2s;
-  }
-  a.test-link:hover {
-    background-color: #ff00ff;
-    color: #0d0d0d;
-    border-bottom: none;
-    box-shadow: 3px 3px 0px #00ff9f;
-  }
-  /* Code Block Styling */
-  .cmd-block {
-    background: #000;
-    border: 1px solid #333;
-    border-left: 3px solid #ff00ff;
-    color: #e0e0e0;
-    padding: 15px;
-    font-size: 0.9em;
-    overflow-x: auto;
-    white-space: pre-wrap;
-  }
 </style>
-<div class="test-container">
-  <div class="test-header">
-    <h1>>> zerofata's GLM-4.5-Iceblink-v2-106B-A12B-exl3</h1>
-    <div class="test-meta">
-      <div>[BASE :: GLM-4.5-AIR]</div>
-      <div>[TUNE :: ZEROFATA]</div>
-      <div>[QUANT :: UNSTABLELLAMA]</div>
     </div>
   </div>
-  <div class="test-card">
-    <h2>// REPO</h2>
-    <div class="test-content">
-      EXL3 quantization of <b><a class="test-link" href="https://huggingface.co/zerofata/GLM-4.5-Iceblink-v2-106B-A12B" target="_blank">zerofata's ICEBLINK-v2</a></b>.
-      <br><br>
-      Quantized with <b><a class="test-link" href="https://github.com/turboderp-org/exllamav3" target="_blank">exllamav3 0.0.12</a></b>.
     </div>
   </div>
-  <div class="test-card">
-    <h2>// QUANTS</h2>
-    <div class="test-content">
-      <table class="glitch-table">
-        <thead>
-          <tr>
-            <th>[BRANCH]</th>
-            <th>[GiB]</th>
-            <th>[K/L_DIV]</th>
-            <th>[PPL]</th>
-          </tr>
-        </thead>
-        <tbody>
-          <tr>
-            <td><a class="test-link" href="https://huggingface.co/UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3/tree/2.05" target="_blank">2.05bpw</a></td>
-            <td>26.8</td>
-            <td>0.883</td>
-            <td>5.676</td>
-          </tr>
-          <tr>
-            <td><a class="test-link" href="https://huggingface.co/UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3/tree/2.5bpw" target="_blank">2.5bpw</a></td>
-            <td>32.3</td>
-            <td>0.591</td>
-            <td>5.261</td>
-          </tr>
-           <tr>
-            <td><a class="test-link" href="https://huggingface.co/UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3/tree/3.05bpw" target="_blank">3.05bpw</a></td>
-            <td>39.1</td>
-            <td>0.199</td>
-            <td>4.513</td>
-          </tr>
-           <tr>
-            <td><a class="test-link" href="https://huggingface.co/UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3/tree/4.0bpw" target="_blank">4.0bpw</a></td>
-            <td>51</td>
-            <td>0.069</td>
-            <td>4.289</td>
-          </tr>
-           <tr>
-            <td><a class="test-link" href="https://huggingface.co/UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3/tree/5.0bpw" target="_blank">5.0bpw</a></td>
-            <td>63.3</td>
-            <td>0.026</td>
-            <td>4.183</td>
-          </tr>
-           <tr>
-            <td><a class="test-link" href="https://huggingface.co/zerofata/GLM-4.5-Iceblink-v2-106B-A12B/tree/main" target="_blank">bf16</td>
-            <td>205.8</td>
-            <td>0</td>
-            <td>4.132</td>
-          </tr>
-        </tbody>
-      </table>
-    <div style="margin-top: 25px; text-align: center;">
-      <img src="https://huggingface.co/UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3/resolve/main/repoGraph1.svg"
-           alt="EXL3 Quantization Results"
-           style="max-width: 90%; border: 1px solid #333; box-shadow: 0 0 15px #ff00ff;">
     </div>
     </div>
   </div>
-  <div class="test-card">
-    <h2>// DOWNLOAD</h2>
-    <div class="test-content">
-      Use HF-CLI to pull specific branches to your local machine:
-      <div class="cmd-block">huggingface-cli download UnstableLlama/zerofata_GLM-4.5-Iceblink-v2-106B-A12B-exl3 --revision "3.05bpw" --local-dir ./</div>
     </div>
   </div>
-</div>

 ---
 license: mit
 datasets:
 - zerofata/Instruct-Anime
 - zerofata/Roleplay-Anime-Characters
 - zerofata/Instruct-Anime-CreativeWriting
 - zerofata/Summaries-Anime-FandomPages
+base_model:
+- zai-org/GLM-4.5-Air
 ---
 <style>
+.container {
+  --primary-accent: #87CEEB;
+  --secondary-accent: #B0E5F5;
+  --tertiary-accent: #5FA8D3;
+  --ice-accent: #E0F4FF;
+  --silver-accent: #C8D8E4;
+  --glow-primary: rgba(135, 206, 235, 0.6);
+  --glow-secondary: rgba(176, 229, 245, 0.7);
+  --bg-main: #0a1628;
+  --bg-container: #0f1e35;
+  --bg-card: rgba(15, 30, 53, 0.95);
+  --bg-elevated: #162840;
+  --text-main: #E8F4F8;
+  --text-muted: #9BC4E2;
+  --text-bright: #FFFFFF;
+  --white: #FFFFFF;
+  --border-color: #2B4F76;
+  --border-ice: #B0E5F5;
+  --font-title: 'Inter', sans-serif;
+  --font-body: 'Source Sans Pro', sans-serif;
+  --font-code: 'JetBrains Mono', monospace;
+  font-family: var(--font-body);
+  color: var(--text-main);
+  line-height: 1.6;
+  font-weight: 400;
+  max-width: 1100px;
+  margin: 20px auto;
+  padding: 60px;
+  background:
+    linear-gradient(135deg, #0a1628 0%, #0f1e35 50%, #0a1628 100%);
+  min-height: calc(100vh - 40px);
+  position: relative;
+  border: 1px solid var(--border-ice);
+  box-shadow:
+    0 0 0 3px var(--border-color),
+    0 0 0 5px var(--border-ice),
+    0 0 0 8px var(--border-color),
+    0 0 60px rgba(135, 206, 235, 0.4),
+    inset 0 0 100px rgba(135, 206, 235, 0.15);
+}
+.container .title-container {
+  background: linear-gradient(135deg, var(--bg-elevated), var(--bg-card));
+  margin-bottom: 50px;
+  border: 2px solid var(--border-ice);
+  padding: 50px;
+  text-align: center;
+  position: relative;
+  box-shadow:
+    0 0 0 1px var(--border-color),
+    0 0 0 4px var(--border-ice),
+    0 0 0 6px var(--border-color),
+    0 0 40px var(--glow-primary),
+    inset 0 0 60px rgba(135, 206, 235, 0.2);
+  overflow: visible;
+}
+.container .title-container::before {
+  content: '';
+  position: absolute;
+  top: 0;
+  left: 0;
+  right: 0;
+  height: 3px;
+  background: linear-gradient(90deg, transparent, var(--ice-accent), transparent);
+  box-shadow: 0 0 10px var(--ice-accent);
+}
+.container .title-container::after {
+  content: '';
+  position: absolute;
+  bottom: 0;
+  left: 0;
+  right: 0;
+  height: 3px;
+  background: linear-gradient(90deg, transparent, var(--ice-accent), transparent);
+  box-shadow: 0 0 10px var(--ice-accent);
+}
+.container .title-container .title-wrapper {
+  position: relative;
+  z-index: 2;
+}
+.container .title-main {
+  color: var(--text-bright);
+  font-size: 3rem;
+  font-weight: 900;
+  margin: 0;
+  letter-spacing: 4px;
+  display: block;
+  text-transform: uppercase;
+  background: linear-gradient(90deg, var(--ice-accent), var(--text-bright), var(--ice-accent));
+  background-clip: text;
+  -webkit-background-clip: text;
+  -webkit-text-fill-color: transparent;
+  font-family: var(--font-title);
+  text-shadow:
+    0 0 30px var(--ice-accent),
+    0 0 60px rgba(176, 229, 245, 0.5),
+    0 4px 8px rgba(135, 206, 235, 0.6);
+  position: relative;
+}
+.container .lemonade-text {
+    background: linear-gradient(135deg, var(--silver-accent), var(--ice-accent));
+    background-clip: text;
+    -webkit-background-clip: text;
+    -webkit-text-fill-color: transparent;
+}
+.container .version-indicator {
+  color: var(--text-muted);
+  font-size: 0.85rem;
+  font-weight: 600;
+  letter-spacing: 3px;
+  margin-top: 15px;
+  text-transform: uppercase;
+  font-family: var(--font-title);
+  opacity: 0.7;
+}
+.container .title-subtitle {
+  padding: 20px;
+  margin-top: 25px;
+  border: 1px solid var(--border-ice);
+  box-shadow: 0 0 20px rgba(135, 206, 235, 0.3);
+}
+.container .subtitle-text {
+  color: var(--text-muted);
+  font-size: 1.3rem;
+  font-family: var(--font-body);
+  font-style: italic;
+  font-weight: 400;
+  letter-spacing: 3px;
+  text-transform: uppercase;
+}
+.container img {
+  max-width: 100%;
+  border: 3px solid var(--border-ice);
+  margin-bottom: 40px;
+  box-shadow:
+    0 0 0 1px var(--border-color),
+    0 0 0 5px var(--border-ice),
+    0 12px 24px rgba(135, 206, 235, 0.3);
+}
+.container .section-container {
+  margin-bottom: 40px;
+  padding: 40px;
+  background: linear-gradient(135deg, var(--bg-card), var(--bg-elevated));
+  border: 2px solid var(--border-ice);
+  box-shadow:
+    0 0 0 1px var(--border-color),
+    0 0 0 5px var(--border-ice),
+    0 8px 24px rgba(135, 206, 235, 0.3),
+    inset 0 0 40px rgba(135, 206, 235, 0.1);
+}
+.container .section-container:last-of-type {
+    margin-bottom: 0;
+}
+.container .section-header {
+  display: flex;
+  align-items: center;
+  padding: 20px;
+  border: 1px solid var(--border-ice);
+  margin-bottom: 30px;
+  background: rgba(43, 79, 118, 0.2);
+  box-shadow: 0 0 20px rgba(135, 206, 235, 0.2);
+}
+.container .section-title {
+  font-family: var(--font-title);
+  background: linear-gradient(90deg, var(--ice-accent), var(--text-bright), var(--ice-accent));
+  background-clip: text;
+  -webkit-background-clip: text;
+  -webkit-text-fill-color: transparent;
+  font-size: 1.8rem;
+  margin: 0 !important;
+  padding: 0 !important;
+  letter-spacing: 4px;
+  font-weight: 800;
+  text-transform: uppercase;
+  border: none !important;
+  display: inline-block;
+  text-shadow: 0 0 20px var(--ice-accent);
+}
+.container .section-content {
+    padding: 0;
+}
+.container .subheading {
+  color: var(--text-bright);
+  font-size: 1.4rem;
+  margin-top: 30px;
+  margin-bottom: 20px;
+  font-weight: 700;
+  display: block;
+  text-transform: uppercase;
+  letter-spacing: 3px;
+  font-family: var(--font-title);
+  border-bottom: 2px solid var(--border-ice);
+  padding-bottom: 12px;
+  text-shadow: 0 0 15px var(--ice-accent);
+}
+.container .data-box {
+  background: linear-gradient(135deg, var(--bg-card), rgba(22, 40, 64, 0.8));
+  padding: 25px;
+  border: 2px solid var(--border-ice);
+  border-left: 5px solid var(--primary-accent);
+  margin-bottom: 25px;
+  box-shadow:
+    0 0 20px rgba(135, 206, 235, 0.3),
+    inset 0 0 20px rgba(135, 206, 235, 0.1);
+  font-size: 1rem;
+}
+.container .data-row {
+  display: flex;
+  align-items: center;
+  margin-bottom: 12px;
+  padding: 10px 0;
+  border-bottom: 1px solid rgba(176, 229, 245, 0.2);
+}
+.container .data-row:last-child {
+  margin-bottom: 0;
+}
+.container .data-arrow {
+  color: var(--ice-accent);
+  font-weight: bold;
+  margin-right: 15px;
+  font-family: var(--font-code);
+  font-size: 1.2rem;
+  text-shadow: 0 0 10px var(--ice-accent);
+}
+.container .data-label {
+  color: var(--text-muted);
+  font-weight: 700;
+  font-family: var(--font-body);
+  margin-right: 15px;
+  min-width: 120px;
+  text-transform: uppercase;
+  letter-spacing: 1px;
+}
+.container a {
+  color: var(--text-bright);
+  text-decoration: none;
+  font-weight: 600;
+  transition: all .2s;
+}
+.container .data-row a {
+  border-bottom: 1px dotted var(--ice-accent);
+}
+.container a:hover {
+  color: var(--ice-accent);
+  text-shadow: 0 0 10px var(--ice-accent);
+}
+.container .data-row a:hover {
+  border-bottom-style: solid;
+}
+.container .dropdown-container {
+  margin-top: 30px;
+}
+.container .dropdown-summary {
+  cursor: pointer;
+  padding: 15px 20px;
+  color: var(--text-muted);
+  font-size: 1.2rem;
+  font-weight: 700;
+  text-transform: uppercase;
+  font-family: var(--font-title);
+  letter-spacing: 2px;
+  list-style: none;
+  transition: all 0.2s ease;
+  border: 1px solid var(--border-ice);
+  background: rgba(43, 79, 118, 0.2);
+  box-shadow: 0 0 15px rgba(135, 206, 235, 0.2);
+}
+.container .dropdown-summary:hover {
+    color: var(--ice-accent);
+    background: rgba(43, 79, 118, 0.3);
+    box-shadow: 0 0 25px rgba(135, 206, 235, 0.3);
+}
+.container .dropdown-summary::-webkit-details-marker {
+  display: none;
+}
+.container .dropdown-arrow {
+  color: var(--ice-accent);
+  margin-right: 15px;
+  transition: transform 0.2s ease;
+  text-shadow: 0 0 10px var(--ice-accent);
+}
+.container details[open] .dropdown-arrow {
+  transform: rotate(90deg);
+}
+.container .dropdown-content {
+  margin-top: 20px;
+  padding: 20px 15px;
+  background: linear-gradient(135deg, var(--bg-card), rgba(22, 40, 64, 0.9));
+  border: 2px solid var(--border-ice);
+  box-shadow:
+    0 0 20px rgba(135, 206, 235, 0.3),
+    inset 0 0 30px rgba(135, 206, 235, 0.1);
+}
+.container .config-title {
+  color: var(--text-bright);
+  font-size: 1.1rem;
+  margin-bottom: 10px;
+  font-family: var(--font-body);
+  text-transform: uppercase;
+  letter-spacing: 2px;
+  font-weight: 700;
+  text-shadow: 0 0 10px var(--ice-accent);
+}
+.container pre {
+  background: #050a14;
+  padding: 8px 16px;
+  margin: 0;
+  border: 2px solid var(--border-ice);
+  white-space: pre;
+  overflow-x: auto;
+  color: var(--text-main);
+  box-shadow:
+    0 0 20px rgba(135, 206, 235, 0.2),
+    inset 0 0 20px rgba(135, 206, 235, 0.15);
+}
+.container pre code {
+  background: none;
+  color: inherit;
+  padding: 0;
+  margin: 0;
+  display: block;
+  border: none;
+  outline: none;
+}
+.container code {
+  font-family: var(--font-code);
+  color: var(--ice-accent);
+  background: rgba(176, 229, 245, 0.15);
+  padding: 4px 8px;
+  border: 1px solid rgba(135, 206, 235, 0.3);
+}
 </style>
+<html lang="en">
+<head>
+  <meta charset="UTF-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  <title>Iceblink</title>
+  <link rel="preconnect" href="https://fonts.googleapis.com">
+  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
+  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700;800&family=Source+Sans+Pro:ital,wght@0,400;0,600;1,400&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
+</head>
+<body>
+<div class="container">
+  <div class="title-container">
+    <div class="glitchy-overlay"></div>
+    <div class="title-wrapper">
+      <h1 class="title-main">
+        <span class="title-prefix">ICEBLINK</span>
+      </h1>
+      <div class="version-indicator">VERSION 2</div>
+    </div>
+  </div>
+![image](https://cdn-uploads.huggingface.co/production/uploads/65b19c6c638328850e12d38c/zA2QsjdJNOC-62-bZiOun.png)
+  <div class="section-container">
+    <div class="section-header">
+      <div class="section-indicator"></div>
+      <h2 class="section-title">Overview</h2>
+    </div>
+    <div class="section-content">
+      <p></p>
+      <p>Another re-attempt at GLM 4.5 Air. This time using a different training framework, some updated data and better hyperparameters.</p>
+      <p>This model is a creative writing and RP model. It's pretty verbose. The intent is to keep the behavior of the original model, but to improve writing, dialogue & creativity.</p>
+      <p>Compared to the original Iceblink, the effect on this one is more pronounced, with hopefully minimal impact on the intelligence.</p>
     </div>
   </div>
+  <div class="section-container">
+    <div class="section-header">
+      <div class="section-indicator"></div>
+      <h2 class="section-title">SillyTavern Settings</h2>
+    </div>
+    <div class="section-content">
+      <h3 class="subheading">Recommended Roleplay Format</h3>
+      <div class="data-box">
+        <div class="data-row">
+            <span class="data-arrow">></span>
+            <span class="data-label">Actions:</span>
+            <span>In plaintext</span>
+        </div>
+      <div class="data-row">
+            <span class="data-arrow">></span>
+            <span class="data-label">Dialogue:</span>
+            <span>"In quotes"</span>
+      </div>
+      <div class="data-row">
+            <span class="data-arrow">></span>
+            <span class="data-label">Thoughts:</span>
+            <span>*In asterisks*</span>
+      </div>
+      </div>
+      <h3 class="subheading">Recommended Samplers</h3>
+      <div class="data-box">
+        <div class="data-row">
+          <span class="data-arrow">></span>
+          <span class="data-label">Temp:</span>
+          <span>0.8 - 0.9</span>
+        </div>
+        <div class="data-row">
+          <span class="data-arrow">></span>
+          <span class="data-label">MinP:</span>
+          <span>0.05</span>
+        </div>
+        <div class="data-row">
+          <span class="data-arrow">></span>
+          <span class="data-label">TopP:</span>
+          <span>0.95 - 1.00</span>
+        </div>
+      </div>
+      <h3 class="subheading">Instruct</h3>
+      <div class="data-box">
+        <p style="margin: 0;">GLM4.5 (no thinking): <a href="https://huggingface.co/zerofata/GLM-4.5-Iceblink-106B-A12B/raw/main/GLM45-NoThink-SillyTavern-Preset.json">SillyTavern Preset</a></p>
+      </div>
     </div>
   </div>
+  <div class="section-container">
+    <div class="section-header">
+      <div class="section-indicator"></div>
+      <h2 class="section-title">Quantizations</h2>
     </div>
+    <div class="section-content">
+      <div style="margin-bottom: 20px;">
+        <h3 class="subheading">GGUF</h3>
+        <div class="data-box">
+          <div class="data-row">
+            <span class="data-arrow">></span>
+            <a href="https://huggingface.co/ddh0/Iceblink-v3-SFT-3-GGUF">iMatrix (ddh0)</a>
+          </div>
+        </div>
+      </div>
     </div>
   </div>
+  <div class="section-container">
+    <div class="section-header">
+      <div class="section-indicator"></div>
+      <h2 class="section-title">Creation Process</h2>
+    </div>
+    <div class="section-content">
+      <p>Creation Process: SFT</p>
+      <p>SFT on approximately 13 million tokens of SFW / NSFW RP, stories, creative instruct & chat data. Some of the SFW datasets are public and can be found in the model datasets list.</p>
+      <p>I've switched over from Axolotl to MS-Swift w/ Megatron to train MoE models now. There's a roughly 5-10x speedup in training the models, thanks to escaping the naive MoE implementation in TRL. Training for this run took only 40 minutes, excluding environment setup time.</p>
+      <p>A low LR for GLM Air appears to be king. I've found that going any higher makes it extremely easy to begin overcooking the model.</p>
+        <div class="dropdown-container">
+          <details>
+            <summary class="dropdown-summary">
+              <span class="dropdown-arrow">></span>
+              MS-Swift config
+            </summary>
+          <div class="dropdown-content">
+            <p>Not optimized for cost / performance efficiency, YMMV.</p>
+            <div class="config-title">SFT (8*H200)</div>
+            <pre><code>PYTORCH_CUDA_ALLOC_CONF='expandable_segments:True' \
+NPROC_PER_NODE=8 \
+WANDB_API_KEY=wandb_key \
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
+megatron sft \
+    --load '/workspace/glm-4.5-air-mcore' \
+    --dataset '/workspace/joined_dataset_cleaned_modified.jsonl' \
+    --load_from_cache_file true \
+    --train_type lora \
+    --lora_rank 256 \
+    --lora_alpha 16 \
+    --use-rslora true \
+    --target_modules all-linear \
+    --split_dataset_ratio 0.01 \
+    --moe_permute_fusion true \
+    --tensor_model_parallel_size 8 \
+    --expert_tensor_parallel_size 1 \
+    --expert_model_parallel_size 8 \
+    --moe_grouped_gemm true \
+    --moe_shared_expert_overlap true \
+    --moe_aux_loss_coeff 6e-5 \
+    --micro_batch_size 4 \
+    --global_batch_size 32 \
+    --recompute_granularity full \
+    --recompute_method uniform \
+    --recompute_num_layers 1 \
+    --max_epochs 2 \
+    --cross_entropy_loss_fusion true \
+    --lr 6e-6 \
+    --lr_warmup_fraction 0.05 \
+    --min_lr 6e-7 \
+    --save megatron_output/Iceblink-v3-SFT-3 \
+    --eval_interval 20 \
+    --save_interval 25 \
+    --finetune true \
+    --packing true \
+    --max_length 10280 \
+    --num_workers 8 \
+    --dataset_num_proc 8 \
+    --no_save_optim true \
+    --no_save_rng true \
+    --sequence_parallel true \
+    --wandb_project Megatron-Air-SFT \
+    --wandb_exp_name Iceblink-v3-SFT-3 \
+    --attention_backend flash</code></pre>
+          </div>
+        </details>
+      </div>
     </div>
   </div>
+  <div class="section-container">
+    <div class="section-header">
+      <div class="section-indicator"></div>
+      <h2 class="section-title">Special Thanks</h2>
+    </div>
+    <div class="section-content">
+      <p>A shoutout to the people in the BeaverAI Discord who helped me test this model and my intermediate versions.</p>
+      <p>ddh0 (Madison), Ambius, Dysfunctional & my dude.</p>
+    </div>
+  </div>
+</div>
+</body>
+</html>

args.json ADDED Viewed

	@@ -0,0 +1,467 @@

+{
+  "use_ray": false,
+  "ray_exp_name": null,
+  "device_groups": null,
+  "model": "zai-org/GLM-4.5-Air",
+  "model_type": "glm4_5",
+  "model_revision": null,
+  "task_type": "causal_lm",
+  "torch_dtype": "bfloat16",
+  "attn_impl": null,
+  "new_special_tokens": [],
+  "num_labels": null,
+  "problem_type": null,
+  "rope_scaling": null,
+  "device_map": null,
+  "max_memory": {},
+  "max_model_len": null,
+  "local_repo_path": null,
+  "init_strategy": null,
+  "template": "glm4_5",
+  "system": null,
+  "max_length": 10280,
+  "truncation_strategy": "delete",
+  "max_pixels": null,
+  "agent_template": null,
+  "norm_bbox": null,
+  "use_chat_template": true,
+  "padding_free": true,
+  "padding_side": "right",
+  "loss_scale": "default",
+  "sequence_parallel_size": 1,
+  "response_prefix": null,
+  "template_backend": "swift",
+  "dataset": [
+    "/workspace/joined_dataset_cleaned_modified.jsonl"
+  ],
+  "val_dataset": [],
+  "split_dataset_ratio": 0.01,
+  "data_seed": 42,
+  "dataset_num_proc": 8,
+  "load_from_cache_file": true,
+  "dataset_shuffle": true,
+  "val_dataset_shuffle": false,
+  "streaming": false,
+  "interleave_prob": null,
+  "stopping_strategy": "first_exhausted",
+  "shuffle_buffer_size": 1000,
+  "download_mode": "reuse_dataset_if_exists",
+  "columns": {},
+  "strict": false,
+  "remove_unused_columns": true,
+  "model_name": null,
+  "model_author": null,
+  "custom_dataset_info": [],
+  "quant_method": null,
+  "quant_bits": null,
+  "hqq_axis": null,
+  "bnb_4bit_compute_dtype": "bfloat16",
+  "bnb_4bit_quant_type": "nf4",
+  "bnb_4bit_use_double_quant": true,
+  "bnb_4bit_quant_storage": null,
+  "max_new_tokens": null,
+  "temperature": null,
+  "top_k": null,
+  "top_p": null,
+  "repetition_penalty": null,
+  "num_beams": 1,
+  "stream": false,
+  "stop_words": [],
+  "logprobs": false,
+  "top_logprobs": null,
+  "ckpt_dir": "/workspace/glm-4.5-air-mcore",
+  "lora_modules": [],
+  "tuner_backend": "peft",
+  "train_type": "lora",
+  "adapters": [],
+  "external_plugins": [],
+  "seed": 42,
+  "model_kwargs": {},
+  "load_args": false,
+  "load_data_args": false,
+  "packing": true,
+  "packing_length": 10280,
+  "lazy_tokenize": false,
+  "cached_dataset": [],
+  "custom_register_path": [],
+  "use_hf": false,
+  "hub_token": null,
+  "ddp_timeout": 18000000,
+  "ddp_backend": null,
+  "ignore_args_error": false,
+  "use_swift_lora": false,
+  "freeze_llm": false,
+  "freeze_vit": true,
+  "freeze_aligner": true,
+  "freeze_parameters": [],
+  "freeze_parameters_regex": null,
+  "freeze_parameters_ratio": 0.0,
+  "trainable_parameters": [],
+  "trainable_parameters_regex": null,
+  "adapter_load": null,
+  "target_modules": [
+    "all-linear"
+  ],
+  "target_regex": null,
+  "modules_to_save": [],
+  "lora_rank": 256,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "lora_bias": "none",
+  "lora_dtype": null,
+  "use_rslora": true,
+  "rlhf_type": null,
+  "ref_load": null,
+  "ref_adapter_load": null,
+  "beta": 0.1,
+  "rpo_alpha": null,
+  "reference_free": false,
+  "label_smoothing": 0.0,
+  "f_divergence_type": "reverse_kl",
+  "loss_type": null,
+  "desirable_weight": 1.0,
+  "undesirable_weight": 1.0,
+  "calculate_KL": null,
+  "center_rewards_coefficient": null,
+  "padded_vocab_size": 151552,
+  "initialize_embedding": false,
+  "mlp_padding_free": false,
+  "dataloader_persistent_workers": true,
+  "dataloader_prefetch_factor": 10,
+  "architectures": "Glm4MoeForCausalLM",
+  "llm_architectures": null,
+  "max_epochs": 2,
+  "enable_dft_loss": false,
+  "enable_channel_loss": false,
+  "original_max_position_embeddings": null,
+  "partial_rotary_factor": 0.5,
+  "use_shared_expert_gate": false,
+  "vit_gradient_checkpointing": true,
+  "gradient_checkpointing_kwargs": null,
+  "linear_num_value_heads": null,
+  "linear_num_key_heads": null,
+  "linear_key_head_dim": null,
+  "linear_value_head_dim": null,
+  "linear_conv_kernel_dim": null,
+  "layer_types": null,
+  "mrope_interleaved": false,
+  "micro_batch_size": 4,
+  "global_batch_size": 32,
+  "recompute_granularity": "full",
+  "recompute_method": "uniform",
+  "recompute_num_layers": 1,
+  "recompute_modules": [
+    "core_attn"
+  ],
+  "use_cpu_initialization": false,
+  "deterministic_mode": false,
+  "train_iters": null,
+  "log_interval": 5,
+  "tensorboard_dir": "/workspace/megatron_output/Iceblink-v3-SFT-3/v0-20251101-214719/runs",
+  "no_masked_softmax_fusion": false,
+  "no_bias_dropout_fusion": false,
+  "no_bias_swiglu_fusion": false,
+  "no_rope_fusion": false,
+  "no_gradient_accumulation_fusion": false,
+  "cross_entropy_loss_fusion": true,
+  "cross_entropy_fusion_impl": "native",
+  "calculate_per_token_loss": true,
+  "use_flash_attn": false,
+  "attention_backend": "flash",
+  "optimizer": "adam",
+  "optimizer_cpu_offload": false,
+  "optimizer_offload_fraction": 1.0,
+  "use_precision_aware_optimizer": false,
+  "main_grads_dtype": "fp32",
+  "main_params_dtype": "fp32",
+  "exp_avg_dtype": "fp32",
+  "exp_avg_sq_dtype": "fp32",
+  "dataloader_type": "cyclic",
+  "manual_gc": false,
+  "manual_gc_interval": 0,
+  "lr": 6e-06,
+  "lr_decay_style": "cosine",
+  "lr_decay_iters": null,
+  "lr_warmup_iters": 0,
+  "lr_warmup_fraction": 0.05,
+  "min_lr": 6e-07,
+  "weight_decay": 0.1,
+  "clip_grad": 1.0,
+  "adam_beta1": 0.9,
+  "adam_beta2": 0.95,
+  "adam_eps": 1e-08,
+  "sgd_momentum": 0.9,
+  "save": "/workspace/megatron_output/Iceblink-v3-SFT-3/v0-20251101-214719",
+  "save_interval": 25,
+  "save_retain_interval": null,
+  "no_save_optim": true,
+  "no_save_rng": true,
+  "load": "/workspace/glm-4.5-air-mcore",
+  "no_load_optim": false,
+  "no_load_rng": false,
+  "finetune": true,
+  "ckpt_format": "torch_dist",
+  "no_initialization": true,
+  "auto_detect_ckpt_format": true,
+  "exit_on_missing_checkpoint": true,
+  "async_save": false,
+  "use_persistent_ckpt_worker": false,
+  "ckpt_fully_parallel_load": false,
+  "ckpt_assume_constant_structure": false,
+  "distributed_backend": "nccl",
+  "local_rank": 0,
+  "use_distributed_optimizer": true,
+  "tensor_model_parallel_size": 8,
+  "pipeline_model_parallel_size": 1,
+  "decoder_first_pipeline_num_layers": null,
+  "decoder_last_pipeline_num_layers": null,
+  "sequence_parallel": true,
+  "context_parallel_size": 1,
+  "tp_comm_overlap": false,
+  "overlap_grad_reduce": false,
+  "overlap_param_gather": false,
+  "distributed_timeout_minutes": 300000,
+  "num_layers_per_virtual_pipeline_stage": null,
+  "num_virtual_stages_per_pipeline_rank": null,
+  "microbatch_group_size_per_virtual_pipeline_stage": null,
+  "pipeline_model_parallel_layout": null,
+  "num_layers": 46,
+  "hidden_size": 4096,
+  "ffn_hidden_size": 10944,
+  "num_attention_heads": 96,
+  "group_query_attention": true,
+  "num_query_groups": 8,
+  "max_position_embeddings": 131072,
+  "position_embedding_type": "rope",
+  "mrope_section": null,
+  "rotary_base": 1000000,
+  "rotary_percent": 1.0,
+  "rotary_interleaved": false,
+  "normalization": "RMSNorm",
+  "norm_epsilon": 1e-05,
+  "swiglu": true,
+  "untie_embeddings_and_output_weights": true,
+  "disable_bias_linear": true,
+  "add_qkv_bias": true,
+  "attention_dropout": 0.0,
+  "hidden_dropout": 0.0,
+  "kv_channels": 128,
+  "qk_layernorm": false,
+  "transformer_impl": "transformer_engine",
+  "num_experts": 128,
+  "moe_layer_freq": "[0]*1+[1]*45",
+  "moe_ffn_hidden_size": 1408,
+  "moe_shared_expert_intermediate_size": 1408,
+  "moe_router_topk": 8,
+  "moe_router_pre_softmax": false,
+  "moe_router_dtype": "fp32",
+  "moe_router_score_function": "sigmoid",
+  "moe_router_bias_update_rate": 0.001,
+  "moe_router_enable_expert_bias": true,
+  "moe_router_topk_scaling_factor": 1.0,
+  "moe_router_load_balancing_type": "aux_loss",
+  "expert_model_parallel_size": 8,
+  "expert_tensor_parallel_size": 1,
+  "moe_token_dispatcher_type": null,
+  "moe_enable_deepep": false,
+  "moe_grouped_gemm": true,
+  "moe_permute_fusion": true,
+  "moe_aux_loss_coeff": 6e-05,
+  "moe_z_loss_coeff": null,
+  "moe_shared_expert_overlap": true,
+  "moe_layer_recompute": false,
+  "moe_expert_capacity_factor": null,
+  "moe_pad_expert_input_to_capacity": false,
+  "moe_token_drop_policy": null,
+  "multi_latent_attention": false,
+  "q_lora_rank": null,
+  "kv_lora_rank": 32,
+  "qk_head_dim": 128,
+  "qk_pos_emb_head_dim": 64,
+  "fp8_format": null,
+  "fp8_recipe": "delayed",
+  "fp8_amax_history_len": 1024,
+  "fp8_amax_compute_algo": "max",
+  "fp8_param_gather": false,
+  "fp16": false,
+  "bf16": true,
+  "apply_query_key_layer_scaling": false,
+  "attention_softmax_in_fp32": true,
+  "log_params_norm": false,
+  "log_throughput": false,
+  "tensorboard_log_interval": 1,
+  "tensorboard_queue_size": 50,
+  "log_timers_to_tensorboard": true,
+  "no_log_learning_rate_to_tensorboard": false,
+  "log_validation_ppl_to_tensorboard": true,
+  "log_memory_to_tensorboard": true,
+  "logging_level": null,
+  "wandb_project": "Megatron-Air-SFT",
+  "wandb_exp_name": "Iceblink-v3-SFT-3",
+  "wandb_save_dir": null,
+  "eval_iters": -1,
+  "eval_interval": 20,
+  "seq_length": 10280,
+  "num_workers": 8,
+  "megatron_extra_kwargs": {},
+  "add_version": true,
+  "rank": 0,
+  "global_world_size": 8,
+  "local_world_size": 8,
+  "model_suffix": "GLM-4.5-Air",
+  "model_info": "ModelInfo(model_type='glm4_5', model_dir='/root/.cache/modelscope/hub/models/ZhipuAI/GLM-4___5-Air', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=True, config=None, task_type='causal_lm', num_labels=None)",
+  "model_meta": "ModelMeta(model_type='glm4_5', model_groups=[ModelGroup(models=[Model(ms_model_id='ZhipuAI/GLM-4.5-Air-Base', hf_model_id='zai-org/GLM-4.5-Air-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-Air', hf_model_id='zai-org/GLM-4.5-Air', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-Air-FP8', hf_model_id='zai-org/GLM-4.5-Air-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-Base', hf_model_id='zai-org/GLM-4.5-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5', hf_model_id='zai-org/GLM-4.5', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-FP8', hf_model_id='zai-org/GLM-4.5-FP8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='ZhipuAI/GLM-4.6', hf_model_id='zai-org/GLM-4.6', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='glm4_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x76c9ab052520>, model_arch=None, architectures=['Glm4MoeForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.54'], tags=[])",
+  "model_dir": "/root/.cache/modelscope/hub/models/ZhipuAI/GLM-4___5-Air",
+  "hub": "<class 'swift.hub.hub.MSHub'>",
+  "megatron_model_meta": "MegatronModelMeta(megatron_model_type='gpt', model_types=['qwen2', 'qwen2_5', 'qwq', 'qwq_preview', 'qwen2_5_math', 'llama', 'llama3', 'llama3_1', 'llama3_2', 'longwriter_llama3_1', 'codefuse_codellama', 'marco_o1', 'deepseek', 'deepseek_r1_distill', 'yi', 'yi_coder', 'sus', 'skywork_o1', 'openbuddy_llama', 'openbuddy_llama3', 'megrez', 'reflection', 'numina', 'ziya', 'mengzi3', 'qwen3', 'qwen3_thinking', 'qwen3_nothinking', 'qwen2_moe', 'qwen3_moe', 'qwen3_moe_thinking', 'qwen3_coder', 'internlm3', 'mimo', 'mimo_rl', 'moonlight', 'deepseek_moe', 'deepseek_v2', 'deepseek_v2_5', 'deepseek_r1', 'dots1', 'ernie', 'glm4_5', 'deepseek_v3_1', 'ernie_thinking'], convert_mcore2hf=<function convert_mcore2hf at 0x76c91dcb0fe0>, convert_hf2mcore=<function convert_hf2mcore at 0x76c91dcb0b80>, model_cls=<class 'swift.megatron.model.gpt_model.GPTModel'>, convert_hf_config=<function convert_gpt_hf_config at 0x76c91de6c9a0>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x76c91dddb880>, visual_cls=None, extra_args_provider=None)",
+  "extra_args": {
+    "use_ray": false,
+    "ray_exp_name": null,
+    "device_groups": null,
+    "model": "ZhipuAI/GLM-4.5-Air",
+    "model_type": "glm4_5",
+    "model_revision": null,
+    "task_type": "causal_lm",
+    "torch_dtype": "bfloat16",
+    "attn_impl": null,
+    "new_special_tokens": [],
+    "num_labels": null,
+    "problem_type": null,
+    "rope_scaling": null,
+    "device_map": null,
+    "max_memory": {},
+    "max_model_len": null,
+    "local_repo_path": null,
+    "init_strategy": null,
+    "template": "glm4_5",
+    "system": null,
+    "max_length": 10280,
+    "truncation_strategy": "delete",
+    "max_pixels": null,
+    "agent_template": null,
+    "norm_bbox": null,
+    "use_chat_template": true,
+    "padding_free": true,
+    "padding_side": "right",
+    "sequence_parallel_size": 1,
+    "response_prefix": null,
+    "template_backend": "swift",
+    "dataset": [
+      "/workspace/joined_dataset_cleaned_modified.jsonl"
+    ],
+    "val_dataset": [],
+    "split_dataset_ratio": 0.01,
+    "data_seed": 42,
+    "dataset_num_proc": 8,
+    "load_from_cache_file": true,
+    "dataset_shuffle": true,
+    "val_dataset_shuffle": false,
+    "streaming": false,
+    "interleave_prob": null,
+    "stopping_strategy": "first_exhausted",
+    "shuffle_buffer_size": 1000,
+    "download_mode": "reuse_dataset_if_exists",
+    "columns": {},
+    "strict": false,
+    "remove_unused_columns": true,
+    "model_name": null,
+    "model_author": null,
+    "custom_dataset_info": [],
+    "quant_method": null,
+    "quant_bits": null,
+    "hqq_axis": null,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "bnb_4bit_quant_storage": null,
+    "max_new_tokens": null,
+    "temperature": null,
+    "top_k": null,
+    "top_p": null,
+    "repetition_penalty": null,
+    "num_beams": 1,
+    "stream": false,
+    "stop_words": [],
+    "logprobs": false,
+    "top_logprobs": null,
+    "ckpt_dir": "/workspace/glm-4.5-air-mcore",
+    "lora_modules": [],
+    "tuner_backend": "peft",
+    "train_type": "lora",
+    "adapters": [],
+    "external_plugins": [],
+    "model_kwargs": {},
+    "load_args": false,
+    "load_data_args": false,
+    "packing": true,
+    "packing_length": 10280,
+    "lazy_tokenize": false,
+    "cached_dataset": [],
+    "custom_register_path": [],
+    "use_hf": false,
+    "hub_token": null,
+    "ddp_timeout": 18000000,
+    "ddp_backend": null,
+    "ignore_args_error": false,
+    "use_swift_lora": false,
+    "freeze_llm": false,
+    "freeze_vit": true,
+    "freeze_aligner": true,
+    "freeze_parameters": [],
+    "freeze_parameters_regex": null,
+    "freeze_parameters_ratio": 0.0,
+    "trainable_parameters": [],
+    "trainable_parameters_regex": null,
+    "adapter_load": null,
+    "target_modules": [
+      "all-linear"
+    ],
+    "target_regex": null,
+    "modules_to_save": [],
+    "lora_rank": 256,
+    "lora_alpha": 16,
+    "lora_dropout": 0.05,
+    "lora_bias": "none",
+    "lora_dtype": null,
+    "use_rslora": true,
+    "rlhf_type": null,
+    "ref_load": null,
+    "ref_adapter_load": null,
+    "beta": 0.1,
+    "rpo_alpha": null,
+    "reference_free": false,
+    "label_smoothing": 0.0,
+    "f_divergence_type": "reverse_kl",
+    "loss_type": null,
+    "desirable_weight": 1.0,
+    "undesirable_weight": 1.0,
+    "calculate_KL": null,
+    "center_rewards_coefficient": null,
+    "padded_vocab_size": 151552,
+    "initialize_embedding": false,
+    "mlp_padding_free": false,
+    "dataloader_persistent_workers": true,
+    "dataloader_prefetch_factor": 10,
+    "architectures": "Glm4MoeForCausalLM",
+    "llm_architectures": null,
+    "max_epochs": 2,
+    "enable_dft_loss": false,
+    "enable_channel_loss": false,
+    "original_max_position_embeddings": null,
+    "partial_rotary_factor": 0.5,
+    "use_shared_expert_gate": false,
+    "vit_gradient_checkpointing": true,
+    "gradient_checkpointing_kwargs": null,
+    "linear_num_value_heads": null,
+    "linear_num_key_heads": null,
+    "linear_key_head_dim": null,
+    "linear_value_head_dim": null,
+    "linear_conv_kernel_dim": null,
+    "layer_types": null,
+    "mrope_interleaved": false,
+    "add_version": true,
+    "model_info": "ModelInfo(model_type='glm4_5', model_dir='/root/.cache/modelscope/hub/models/ZhipuAI/GLM-4___5-Air', torch_dtype=torch.bfloat16, max_model_len=131072, quant_method=None, quant_bits=None, rope_scaling=None, is_moe_model=True, config=None, task_type='causal_lm', num_labels=None)",
+    "model_meta": "ModelMeta(model_type='glm4_5', model_groups=[ModelGroup(models=[Model(ms_model_id='ZhipuAI/GLM-4.5-Air-Base', hf_model_id='zai-org/GLM-4.5-Air-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-Air', hf_model_id='zai-org/GLM-4.5-Air', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-Air-FP8', hf_model_id='zai-org/GLM-4.5-Air-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-Base', hf_model_id='zai-org/GLM-4.5-Base', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5', hf_model_id='zai-org/GLM-4.5', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='ZhipuAI/GLM-4.5-FP8', hf_model_id='zai-org/GLM-4.5-FP8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[]), ModelGroup(models=[Model(ms_model_id='ZhipuAI/GLM-4.6', hf_model_id='zai-org/GLM-4.6', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='glm4_5', get_function=<function get_model_tokenizer_with_flash_attn at 0x76c9ab052520>, model_arch=None, architectures=['Glm4MoeForCausalLM'], additional_saved_files=[], torch_dtype=None, is_multimodal=False, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.54'], tags=[])",
+    "megatron_model_meta": "MegatronModelMeta(megatron_model_type='gpt', model_types=['qwen2', 'qwen2_5', 'qwq', 'qwq_preview', 'qwen2_5_math', 'llama', 'llama3', 'llama3_1', 'llama3_2', 'longwriter_llama3_1', 'codefuse_codellama', 'marco_o1', 'deepseek', 'deepseek_r1_distill', 'yi', 'yi_coder', 'sus', 'skywork_o1', 'openbuddy_llama', 'openbuddy_llama3', 'megrez', 'reflection', 'numina', 'ziya', 'mengzi3', 'qwen3', 'qwen3_thinking', 'qwen3_nothinking', 'qwen2_moe', 'qwen3_moe', 'qwen3_moe_thinking', 'qwen3_coder', 'internlm3', 'mimo', 'mimo_rl', 'moonlight', 'deepseek_moe', 'deepseek_v2', 'deepseek_v2_5', 'deepseek_r1', 'dots1', 'ernie', 'glm4_5', 'deepseek_v3_1', 'ernie_thinking'], convert_mcore2hf=<function convert_mcore2hf at 0x76c91dcb0fe0>, convert_hf2mcore=<function convert_hf2mcore at 0x76c91dcb0b80>, model_cls=<class 'swift.megatron.model.gpt_model.GPTModel'>, convert_hf_config=<function convert_gpt_hf_config at 0x76c91de6c9a0>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x76c91dddb880>, visual_cls=None, extra_args_provider=None)"
+  }
+}

chat_template.jinja ADDED Viewed

	@@ -0,0 +1,103 @@

+[gMASK]<sop>
+{#- The literal "[gMASK]<sop>" prefix above is emitted unconditionally at the start of the prompt. -#}
+{#- When `tools` is truthy, a <|system|> turn follows: each tool is serialized with tojson inside <tools></tools>, -#}
+{#- followed by a description of the <tool_call>/<arg_key>/<arg_value> XML format the model is asked to produce. -#}
+{%- if tools -%}
+<|system|>
+# Tools
+You may call one or more functions to assist with the user query.
+You are provided with function signatures within <tools></tools> XML tags:
+<tools>
+{% for tool in tools %}
+{{ tool | tojson(ensure_ascii=False) }}
+{% endfor %}
+</tools>
+For each function call, output the function name and arguments within the following XML format:
+<tool_call>{function-name}
+<arg_key>{arg-key-1}</arg_key>
+<arg_value>{arg-value-1}</arg_value>
+<arg_key>{arg-key-2}</arg_key>
+<arg_value>{arg-value-2}</arg_value>
+...
+</tool_call>{%- endif -%}
+{%- macro visible_text(content) -%}
+    {#- Render the textual part of a message's `content`. -#}
+    {#- Case 1: a plain string is emitted unchanged. -#}
+    {%- if content is string -%}
+        {{- content }}
+    {#- Case 2: a non-mapping iterable (e.g. a list of content parts) is walked item by item. -#}
+    {%- elif content is iterable and content is not mapping -%}
+        {%- for item in content -%}
+            {#- Only mappings with type == 'text' and bare strings contribute; any other item is skipped. -#}
+            {%- if item is mapping and item.type == 'text' -%}
+                {{- item.text }}
+            {%- elif item is string -%}
+                {{- item }}
+            {%- endif -%}
+        {%- endfor -%}
+    {#- Case 3: anything else (including a mapping) is emitted as-is. -#}
+    {%- else -%}
+        {{- content }}
+    {%- endif -%}
+{%- endmacro -%}
+{#- Find the zero-based index of the last message whose role is 'user'; it stays -1 when there is none. -#}
+{#- A namespace object is used so that the assignment made inside the loop is still visible after the loop ends. -#}
+{#- The assistant branch of the message loop compares loop.index0 against ns.last_user_index. -#}
+{%- set ns = namespace(last_user_index=-1) %}
+{%- for m in messages %}
+    {%- if m.role == 'user' %}
+        {% set ns.last_user_index = loop.index0 -%}
+    {%- endif %}
+{%- endfor %}
+{% for m in messages %}
+{#- Render each message according to its role: user, assistant, tool or system. Other roles produce no output. -#}
+{#- user: <|user|> plus the visible text; '/nothink' is appended when enable_thinking is defined and false and the text does not already end with "/nothink". -#}
+{%- if m.role == 'user' -%}<|user|>
+{{ visible_text(m.content) }}
+{{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
+{#- assistant: <|assistant|>, then a think block, then the answer text, then any tool calls. -#}
+{%- elif m.role == 'assistant' -%}
+<|assistant|>
+{%- set reasoning_content = '' %}
+{%- set content = visible_text(m.content) %}
+{#- Prefer an explicit string m.reasoning_content. Otherwise, if the text contains '</think>', -#}
+{#- the reasoning is what lies before the first '</think>' and after the last '<think>' in that part, -#}
+{#- and the answer is what follows the last '</think>' (surrounding newlines stripped). -#}
+{%- if m.reasoning_content is string %}
+    {%- set reasoning_content = m.reasoning_content %}
+{%- else %}
+    {%- if '</think>' in content %}
+        {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+        {%- set content = content.split('</think>')[-1].lstrip('\n') %}
+    {%- endif %}
+{%- endif %}
+{#- Reasoning is only replayed for assistant messages located after the last user message and only when non-empty; -#}
+{#- every other assistant message gets an empty <think></think> pair. -#}
+{%- if loop.index0 > ns.last_user_index and reasoning_content -%}
+{{ '\n<think>' + reasoning_content.strip() +  '</think>'}}
+{%- else -%}
+{{ '\n<think></think>' }}
+{%- endif -%}
+{#- The answer text is emitted only when it is non-empty after stripping. -#}
+{%- if content.strip() -%}
+{{ '\n' + content.strip() }}
+{%- endif -%}
+{#- Tool calls: each call is unwrapped from tc.function when that attribute is truthy, then written as -#}
+{#- <tool_call>name followed by one <arg_key>/<arg_value> pair per argument; string values are emitted raw, other values via tojson. -#}
+{#- NOTE(review): .items() is called on tc.arguments, so this presumably expects arguments already parsed into a mapping rather than a JSON string - confirm against callers. -#}
+{% if m.tool_calls %}
+{% for tc in m.tool_calls %}
+{%- if tc.function %}
+    {%- set tc = tc.function %}
+{%- endif %}
+{{ '\n<tool_call>' + tc.name }}
+{% set _args = tc.arguments %}
+{% for k, v in _args.items() %}
+<arg_key>{{ k }}</arg_key>
+<arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
+{% endfor %}
+</tool_call>{% endfor %}
+{% endif %}
+{#- tool: results are wrapped in <tool_response> tags. -#}
+{%- elif m.role == 'tool' -%}
+{#- String content: <|observation|> is emitted only for the first message of the loop or when the previous message is not a tool message, -#}
+{#- so consecutive tool messages share a single <|observation|> marker. -#}
+{%- if m.content is string -%}
+{%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+    {{- '<|observation|>' }}
+{%- endif %}
+{{- '\n<tool_response>\n' }}
+{{- m.content }}
+{{- '\n</tool_response>' }}
+{#- Non-string content: <|observation|> is always emitted, then one <tool_response> per item, using tr.output when defined and the item itself otherwise. -#}
+{%- else -%}
+<|observation|>{% for tr in m.content %}
+<tool_response>
+{{ tr.output if tr.output is defined else tr }}
+</tool_response>{% endfor -%}
+{% endif -%}
+{#- system: <|system|> followed by the visible text of the message. -#}
+{%- elif m.role == 'system' -%}
+<|system|>
+{{ visible_text(m.content) }}
+{%- endif -%}
+{%- endfor -%}
+{%- if add_generation_prompt -%}
+    {#- Open the assistant turn; when enable_thinking is defined and false, pre-fill an empty <think></think> pair after it. -#}
+    <|assistant|>{{- '\n<think></think>' if (enable_thinking is defined and not enable_thinking) else '' -}}
+{%- endif -%}

config.json ADDED Viewed

	@@ -0,0 +1,55 @@

+{
+    "architectures": [
+        "Glm4MoeForCausalLM"
+    ],
+    "attention_bias": true,
+    "attention_dropout": 0.0,
+    "dtype": "bfloat16",
+    "eos_token_id": [
+        151329,
+        151336,
+        151338
+    ],
+    "first_k_dense_replace": 1,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 10944,
+    "max_position_embeddings": 131072,
+    "model_type": "glm4_moe",
+    "moe_intermediate_size": 1408,
+    "n_group": 1,
+    "n_routed_experts": 128,
+    "n_shared_experts": 1,
+    "norm_topk_prob": true,
+    "num_attention_heads": 96,
+    "num_experts_per_tok": 8,
+    "num_hidden_layers": 46,
+    "num_key_value_heads": 8,
+    "num_nextn_predict_layers": 1,
+    "pad_token_id": 151329,
+    "partial_rotary_factor": 0.5,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": null,
+    "rope_theta": 1000000,
+    "routed_scaling_factor": 1.0,
+    "tie_word_embeddings": false,
+    "topk_group": 1,
+    "transformers_version": "4.57.1",
+    "use_cache": true,
+    "use_qk_norm": false,
+    "vocab_size": 151552,
+    "quantization_config": {
+        "quant_method": "exl3",
+        "version": "0.0.12",
+        "bits": 4.0,
+        "head_bits": 6,
+        "calibration": {
+            "rows": 250,
+            "cols": 2048
+        },
+        "out_scales": "auto",
+        "codebook": "mcg"
+    }
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "_from_model_config": true,
+  "eos_token_id": [
+    151329,
+    151336,
+    151338
+  ],
+  "pad_token_id": 151329,
+  "transformers_version": "4.57.1"
+}

model-00001-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e30a37727a5dc40d464809aab2e6ecdb082965dcd86a45c128a1428c1b5797a
+size 8420456484

model-00002-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94e39e5f7caa47819de520dd01988e38484a94e6c3d3e103703f0afaec81b307
+size 8232631620

model-00003-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:41e8f12c5f6f1b65b3a58d6fb420b573ee8dc19768e6e6ab35fe7f7e0d50f0ec
+size 8232636332

model-00004-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f5ae04114158edac4649ac873fc39981db9602084f13af6c7986d1f4fbb1d6c2
+size 8232636332

model-00005-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e106425960171f5a5d1da1fa179514415d2004369a4b160286bc05d567268685
+size 8232636332

model-00006-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fec42ca6517b8787c5a599893bdc699346a93e2e4e61fdaf88b61b01009976d0
+size 8232636332

model-00007-of-00007.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a71cf142e438e1e8284ba4a6790eaaff7397e9840cdbceb3c3a5c6a7bf2acbca
+size 5170249436

model.safetensors.index.json ADDED Viewed

The diff for this file is too large to render. See raw diff

quantization_config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e456e629ce44b1c4ec718089ad334f641e9c6b50c068d11974e95747bc1c371b
+size 21625412

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,40 @@

+{
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "[MASK]",
+    "[gMASK]",
+    "[sMASK]",
+    "<sop>",
+    "<eop>",
+    "<|system|>",
+    "<|user|>",
+    "<|assistant|>",
+    "<|observation|>",
+    "<|begin_of_image|>",
+    "<|end_of_image|>",
+    "<|begin_of_video|>",
+    "<|end_of_video|>",
+    "<|begin_of_audio|>",
+    "<|end_of_audio|>",
+    "<|begin_of_transcription|>",
+    "<|end_of_transcription|>",
+    "<|code_prefix|>",
+    "<|code_middle|>",
+    "<|code_suffix|>",
+    "/nothink"
+  ],
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bda8e2146c3bb7b7e0fc96dcc4f0aeff041c6c27952e3ace0665663ebff346ba
+size 19970700

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,325 @@

+{
+  "added_tokens_decoder": {
+    "151329": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151330": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151331": {
+      "content": "[gMASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151332": {
+      "content": "[sMASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151333": {
+      "content": "<sop>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151334": {
+      "content": "<eop>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151335": {
+      "content": "<|system|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151336": {
+      "content": "<|user|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151337": {
+      "content": "<|assistant|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151338": {
+      "content": "<|observation|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151339": {
+      "content": "<|begin_of_image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151340": {
+      "content": "<|end_of_image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151341": {
+      "content": "<|begin_of_video|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151342": {
+      "content": "<|end_of_video|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151343": {
+      "content": "<|begin_of_audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151344": {
+      "content": "<|end_of_audio|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151345": {
+      "content": "<|begin_of_transcription|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151346": {
+      "content": "<|end_of_transcription|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151347": {
+      "content": "<|code_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151348": {
+      "content": "<|code_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151349": {
+      "content": "<|code_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151350": {
+      "content": "<think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151351": {
+      "content": "</think>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151352": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151353": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151354": {
+      "content": "<tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151355": {
+      "content": "</tool_response>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151356": {
+      "content": "<arg_key>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151357": {
+      "content": "</arg_key>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151358": {
+      "content": "<arg_value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151359": {
+      "content": "</arg_value>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151360": {
+      "content": "/nothink",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151361": {
+      "content": "<|begin_of_box|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151362": {
+      "content": "<|end_of_box|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151363": {
+      "content": "<|image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151364": {
+      "content": "<|video|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|endoftext|>",
+    "[MASK]",
+    "[gMASK]",
+    "[sMASK]",
+    "<sop>",
+    "<eop>",
+    "<|system|>",
+    "<|user|>",
+    "<|assistant|>",
+    "<|observation|>",
+    "<|begin_of_image|>",
+    "<|end_of_image|>",
+    "<|begin_of_video|>",
+    "<|end_of_video|>",
+    "<|begin_of_audio|>",
+    "<|end_of_audio|>",
+    "<|begin_of_transcription|>",
+    "<|end_of_transcription|>",
+    "<|code_prefix|>",
+    "<|code_middle|>",
+    "<|code_suffix|>",
+    "/nothink"
+  ],
+  "clean_up_tokenization_spaces": false,
+  "do_lower_case": false,
+  "eos_token": "<|endoftext|>",
+  "extra_special_tokens": {},
+  "model_max_length": 128000,
+  "pad_token": "<|endoftext|>",
+  "padding_side": "left",
+  "remove_space": false,
+  "tokenizer_class": "PreTrainedTokenizerFast"
+}