Batch processing
Can anyone share the batch processing code or refer me to it?
Thanks
Interested as well!
# (inside MyIterableDataset.__iter__) process one sample and yield un-batched tensors
inputs = self.processor.apply_chat_template(
    promote_t,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    padding=True,
)
input_ids = inputs.input_ids.squeeze(0)
attention_mask = inputs.attention_mask.squeeze(0)
pixel_values = inputs.pixel_values.squeeze(0) if 'pixel_values' in inputs else None
image_grid_thw = inputs.image_grid_thw.squeeze(0) if 'image_grid_thw' in inputs else None
yield {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "pixel_values": pixel_values,
    "image_grid_thw": image_grid_thw,
    "meta": (rawid, name, pic_url),
}
The code above processes the data and yields samples from an iterator. `promote_t` can follow the same single-sample format as the official examples. Note: the images must be resized so that they all have the same size; a sketch of one sample is shown below.
Below is the batch collate function:
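For reference, a minimal sketch of what one `promote_t` sample might look like, assuming the standard single-turn chat format; the image path and target size are illustrative, and the point is simply that every image is resized to the same fixed size before processing:

from PIL import Image

# Illustrative sketch only: one sample in the single-turn chat format.
# "example.jpg" and TARGET_SIZE are assumptions; all images in a batch
# should be resized to the same fixed size.
TARGET_SIZE = (448, 448)
image = Image.open("example.jpg").convert("RGB").resize(TARGET_SIZE)

promote_t = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]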
def collate_fn(batch, pad_token_id):
    """
    Left-padding implementation: right-align each sample (padding on the left).
    batch: list of items yielded by the dataset
    Returns a dict(input_ids, attention_mask, pixel_values, image_grid_thw, metas).
    """
    batch_size = len(batch)
    lengths = [item["input_ids"].size(0) for item in batch]
    max_len = max(lengths)
    input_ids_padded = torch.full((batch_size, max_len), pad_token_id, dtype=torch.long)
    attention_mask_padded = torch.zeros((batch_size, max_len), dtype=torch.long)
    pixel_values_list = []
    image_grid_thw_list = []
    metas = []
    for i, item in enumerate(batch):
        seq = item["input_ids"]
        L = seq.size(0)
        input_ids_padded[i, max_len - L:] = seq  # right-aligned, pad on the left
        attention_mask_padded[i, max_len - L:] = item["attention_mask"]
        pixel_values_list.append(item["pixel_values"])
        image_grid_thw_list.append(item["image_grid_thw"])
        metas.append(item["meta"])
    # stacking requires every sample's pixel_values to have the same shape (hence the fixed resize)
    pixel_values = torch.stack(pixel_values_list, dim=0)
    image_grid_thw = torch.stack(image_grid_thw_list, dim=0)
    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask_padded,
        "pixel_values": pixel_values,
        "image_grid_thw": image_grid_thw,
        "metas": metas,
    }
The key code of MyIterableDataset was provided above.
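For completeness, a rough skeleton of how that yield block could sit inside the dataset class; the JSONL parsing, the rank/world_size sharding, and the `build_messages` helper are assumptions, not the original code:

import json
from torch.utils.data import IterableDataset

class MyIterableDataset(IterableDataset):
    # Sketch only: shard an input file across ranks and yield the dict shown above.
    # The file format and field names are assumptions.
    def __init__(self, input_file, processor, rank, world_size):
        self.input_file = input_file
        self.processor = processor
        self.rank = rank
        self.world_size = world_size

    def __iter__(self):
        with open(self.input_file, "r", encoding="utf-8") as f:
            for idx, line in enumerate(f):
                if idx % self.world_size != self.rank:
                    continue  # simple round-robin sharding across ranks
                rec = json.loads(line)
                rawid, name, pic_url = rec["id"], rec["name"], rec["pic_url"]
                promote_t = build_messages(rec)  # hypothetical helper that builds the chat messages
                # ... followed by the apply_chat_template / yield block shown above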
dataset = MyIterableDataset(INPUT_FILE, processor, rank, world_size)
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    # sampler=sampler,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=lambda b: collate_fn(b, pad_token_id),
    persistent_workers=PERSISTENT_WORKERS if NUM_WORKERS > 0 else False,
)
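The snippets above also rely on a few names that are never defined in the thread (the model, processor, pad_token_id, and various constants). A minimal, assumed setup could look like this; the model id, input path, and constants are illustrative:

import torch
from tqdm import tqdm
from torch.utils.data import DataLoader
from transformers import AutoModelForImageTextToText, AutoProcessor

MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"   # assumed checkpoint
INPUT_FILE = "data.jsonl"                # assumed input path
NUM_WORKERS, PERSISTENT_WORKERS = 4, True
MAX_NEW_TOKENS, batch_size = 256, 8
rank, world_size = 0, 1                  # single-process example
device_str = f"cuda:{rank}"

processor = AutoProcessor.from_pretrained(MODEL_ID)
processor.tokenizer.padding_side = "left"         # matches the left-padding collate_fn
pad_token_id = processor.tokenizer.pad_token_id
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, dtype=torch.bfloat16, device_map=device_str
).eval()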
Inference code:
for batch in tqdm(dataloader, desc=f"Rank {rank} infer"):
    # Move inputs to the GPU with non_blocking=True (requires pin_memory=True)
    input_ids = batch["input_ids"].to(device_str, non_blocking=True)
    attention_mask = batch["attention_mask"].to(device_str, non_blocking=True)
    pixel_values = batch["pixel_values"].to(device_str, non_blocking=True)
    image_grid_thw = batch["image_grid_thw"].to(device_str, non_blocking=True)
    metas = batch["metas"]
    seq_len = input_ids.size(1)  # input sequence length (including left padding)
    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=pad_token_id,
            # use_cache=False,
        )
    # Keep only the generated suffix after seq_len
    batch_texts = []
    for i in range(generated.size(0)):
        gen_ids = generated[i, seq_len:].cpu().numpy().tolist()
        batch_texts.append(gen_ids)
    # Free the generation tensor if memory is tight
    # del generated
    # Decode (the processor is already available in each process)
    decoded_texts = processor.batch_decode(
        batch_texts, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
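    # (Assumed downstream step, not part of the original snippet: pair each decoded
    # text with its sample metadata before collecting or writing out the results.)
    for (rawid, name, pic_url), text in zip(metas, decoded_texts):
        print(rawid, name, pic_url, text[:80])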
Yes, I experience the same issue.
@longjun1118, unfortunately the problem persists with your code as well.
Description
I'm experiencing inconsistent outputs when comparing individual inference vs batch inference with Qwen3-VL-2B-Instruct. Despite using deterministic settings (temperature=0.0, do_sample=False, num_beams=1), one sample produces different results depending on whether it's processed individually or in a batch.
Issue Details
- Sample 1: Outputs differ between individual (1153 chars) and batch inference (1159 chars) ❌
- Sample 2: Outputs match perfectly ✓
- Using the same image and prompts in both modes
- Tried both left and right padding - neither resolves the issue
Environment
- Model: Qwen/Qwen3-VL-2B-Instruct
- Device: CUDA
- Precision: torch.bfloat16
- Attention: flash_attention_2
Reproducible Code
import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForImageTextToText, AutoProcessor
# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# Load model and processor
model_id = "Qwen/Qwen3-VL-2B-Instruct"
print(f"\nLoading model: {model_id}")
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "left" # Tried both "left" and "right"
messages1 = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
            },
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]
messages2 = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
            },
            {"type": "text", "text": "What do you see in this image? Please provide a comprehensive description."},
        ],
    }
]
# Combine messages for batch processing
messages = [messages1, messages2]
# Run each sample separately
print("\n=== INDIVIDUAL INFERENCE ===")
individual_outputs = []
for idx, msg in enumerate([messages1, messages2], 1):
    text = processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info([msg])
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    generated_ids = model.generate(**inputs, max_new_tokens=256, num_beams=1, do_sample=False, temperature=0.0)
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    individual_outputs.append(output_text[0])
    print(f"Sample {idx}: {output_text[0]}")
# Batch Inference
print("\n=== BATCH INFERENCE ===")
texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
image_inputs, _ = process_vision_info(messages)
inputs = processor(
    text=texts,
    images=image_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=256, num_beams=1, do_sample=False, temperature=0.0)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_texts = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
for idx, output in enumerate(output_texts, 1):
    print(f"Sample {idx}: {output}")
# Text Comparison
print("\n=== COMPARISON ===")
for idx in range(len(output_texts)):
    match = individual_outputs[idx] == output_texts[idx]
    status = "✓ MATCH" if match else "✗ MISMATCH"
    print(f"Sample {idx + 1}: {status}")
    if not match:
        print(f"  Length: {len(individual_outputs[idx])} vs {len(output_texts[idx])}")
all_match = all(individual_outputs[i] == output_texts[i] for i in range(len(output_texts)))
print(f"\nOverall: {'✓ All match' if all_match else '✗ Differences found'}")
Output
=== COMPARISON ===
Sample 1: ✗ MISMATCH
Length: 1153 vs 1159
Sample 2: ✓ MATCH
Overall: ✗ Differences found
What I've Tried
✅ Set temperature=0.0, do_sample=False, num_beams=1 for deterministic generation
✅ Tried padding_side="left" - still produces mismatches
✅ Tried padding_side="right" - still produces mismatches
✅ Used the same generation parameters in both individual and batch modes