zhang jiejing
committed on
update readme for fp8
Browse files
README.md
CHANGED
|
@@ -153,7 +153,7 @@ docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
|
|
| 153 |
|
| 154 |
- Download Model file:
|
| 155 |
- Huggingface: will be downloaded automatically by vLLM.
|
| 156 |
-
- ModelScope: `modelscope download --model Tencent-Hunyuan/Hunyuan-A13B-Instruct`
|
| 157 |
|
| 158 |
|
| 159 |
- Start the API server:
|
|
@@ -165,7 +165,7 @@ docker run --privileged --user root --net=host --ipc=host \
|
|
| 165 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
|
| 166 |
\
|
| 167 |
-m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
|
| 168 |
-
--tensor-parallel-size
|
| 169 |
|
| 170 |
```
|
| 171 |
|
|
@@ -174,8 +174,9 @@ model downloaded by modelscope:
|
|
| 174 |
docker run --privileged --user root --net=host --ipc=host \
|
| 175 |
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
| 176 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm \
|
| 177 |
-
-m vllm.entrypoints.openai.api_server --host 0.0.0.0
|
| 178 |
-
|
|
|
|
| 179 |
```
|
| 180 |
|
| 181 |
|
|
@@ -190,7 +191,13 @@ To get started:
|
|
| 190 |
- Pull the Docker image
|
| 191 |
|
| 192 |
```
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
```
|
| 195 |
|
| 196 |
- Start the API server:
|
|
@@ -200,8 +207,8 @@ docker run --gpus all \
|
|
| 200 |
--shm-size 32g \
|
| 201 |
-p 30000:30000 \
|
| 202 |
--ipc=host \
|
| 203 |
-
|
| 204 |
-
-m sglang.launch_server --model-path hunyuan/
|
| 205 |
```
|
| 206 |
|
| 207 |
|
|
|
|
| 153 |
|
| 154 |
- Download Model file:
|
| 155 |
- Huggingface: will be downloaded automatically by vLLM.
|
| 156 |
+
- ModelScope: `modelscope download --model Tencent-Hunyuan/Hunyuan-A13B-Instruct-FP8`
|
| 157 |
|
| 158 |
|
| 159 |
- Start the API server:
|
|
|
|
| 165 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm
|
| 166 |
\
|
| 167 |
-m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
|
| 168 |
+
--tensor-parallel-size 2 --dtype bfloat16 --kv-cache-dtype fp8 --model tencent/Hunyuan-A13B-Instruct-FP8 --trust-remote-code
|
| 169 |
|
| 170 |
```
|
| 171 |
|
|
|
|
| 174 |
docker run --privileged --user root --net=host --ipc=host \
|
| 175 |
-v ~/.cache/modelscope:/root/.cache/modelscope \
|
| 176 |
--gpus=all -it --entrypoint python hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-vllm \
|
| 177 |
+
-m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 \
|
| 178 |
+
--tensor-parallel-size 2 --dtype bfloat16 --kv-cache-dtype fp8 \
|
| 179 |
+
--model /root/.cache/modelscope/hub/models/Tencent-Hunyuan/Hunyuan-A13B-Instruct-FP8 --trust-remote-code
|
| 180 |
```
|
| 181 |
|
| 182 |
|
|
|
|
| 191 |
- Pull the Docker image
|
| 192 |
|
| 193 |
```
|
| 194 |
+
|
| 195 |
+
# china mirror
|
| 196 |
+
docker pull docker.cnb.cool/tencent/hunyuan/hunyuan-a13b:hunyuan-moe-A13B-sglang
|
| 197 |
+
|
| 198 |
+
# docker hub:
|
| 199 |
+
docker pull hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-sglang
|
| 200 |
+
|
| 201 |
```
|
| 202 |
|
| 203 |
- Start the API server:
|
|
|
|
| 207 |
--shm-size 32g \
|
| 208 |
-p 30000:30000 \
|
| 209 |
--ipc=host \
|
| 210 |
+
hunyuaninfer/hunyuan-a13b:hunyuan-moe-A13B-sglang \
|
| 211 |
+
-m sglang.launch_server --model-path hunyuan/Hunyuan-A13B-Instruct-FP8 --tp 2 --trust-remote-code --host 0.0.0.0 --port 30000
|
| 212 |
```
|
| 213 |
|
| 214 |
|