Spaces:
Sleeping
Sleeping
Nursing Citizen Development
committed on
Commit
·
9af128b
1
Parent(s):
99d19a9
Fix: Use greedy decoding to avoid CUDA NaN probability error
Browse files
- medgemma_client.py +1 -2
medgemma_client.py
CHANGED
|
@@ -139,8 +139,7 @@ def _local_inference(messages: list, max_tokens: int = 2048) -> str:
|
|
| 139 |
outputs = model.generate(
|
| 140 |
**inputs,
|
| 141 |
max_new_tokens=max_tokens,
|
| 142 |
-
do_sample=
|
| 143 |
-
temperature=0.1
|
| 144 |
)
|
| 145 |
|
| 146 |
# Decode only the new tokens (skip input)
|
|
|
|
| 139 |
outputs = model.generate(
|
| 140 |
**inputs,
|
| 141 |
max_new_tokens=max_tokens,
|
| 142 |
+
do_sample=False # Greedy decoding - more stable with quantized models
|
|
|
|
| 143 |
)
|
| 144 |
|
| 145 |
# Decode only the new tokens (skip input)
|