Nursing Citizen Development committed on
Commit
9af128b
·
1 Parent(s): 99d19a9

Fix: Use greedy decoding to avoid CUDA NaN probability error

Browse files
Files changed (1) hide show
  1. medgemma_client.py +1 -2
medgemma_client.py CHANGED
@@ -139,8 +139,7 @@ def _local_inference(messages: list, max_tokens: int = 2048) -> str:
139
  outputs = model.generate(
140
  **inputs,
141
  max_new_tokens=max_tokens,
142
- do_sample=True,
143
- temperature=0.1
144
  )
145
 
146
  # Decode only the new tokens (skip input)
 
139
  outputs = model.generate(
140
  **inputs,
141
  max_new_tokens=max_tokens,
142
+ do_sample=False # Greedy decoding - more stable with quantized models
 
143
  )
144
 
145
  # Decode only the new tokens (skip input)