Batch processing

#12
by cora-17 - opened

Can anyone share the batch processing code or refer me to it?

Thanks

interested as well!

# Inside the dataset's __iter__: build the processor inputs for one sample.
# promote_t follows the same single-sample message format as the official examples.
inputs = self.processor.apply_chat_template(
    promote_t,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    padding=True,
)
# Drop the batch dimension; collate_fn below re-batches the samples.
input_ids = inputs.input_ids.squeeze(0)
attention_mask = inputs.attention_mask.squeeze(0)
pixel_values = inputs.pixel_values.squeeze(0) if "pixel_values" in inputs else None
image_grid_thw = inputs.image_grid_thw.squeeze(0) if "image_grid_thw" in inputs else None

yield {
    "input_ids": input_ids,
    "attention_mask": attention_mask,
    "pixel_values": pixel_values,
    "image_grid_thw": image_grid_thw,
    "meta": (rawid, name, pic_url),
}

The code above processes the data and returns an iterator. promote_t can use the same single-sample format shown on the official model page. Note: the images must be resized first so they all have the same size.
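For reference, a minimal sketch of how one promote_t entry could be built (the helper name, the target size, and the fields are assumptions for illustration, not the original code; whether the processor accepts a PIL image directly or only a path/URL may depend on your transformers version):

from PIL import Image

TARGET_SIZE = (448, 448)  # assumed fixed size so that pixel_values can be stacked in collate_fn

def build_promote_t(pic_path, question):
    # Resize every image to the same resolution before it reaches the processor.
    image = Image.open(pic_path).convert("RGB").resize(TARGET_SIZE)
    return [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        }
    ]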
Below is the batch collate function:
import torch

def collate_fn(batch, pad_token_id):
    """
    Left-padding implementation: right-align each sample (padding goes on the left).
    batch: list of items yielded by the dataset.
    Returns a dict with input_ids, attention_mask, pixel_values, image_grid_thw, metas.
    """
    batch_size = len(batch)
    lengths = [item["input_ids"].size(0) for item in batch]
    max_len = max(lengths)
    input_ids_padded = torch.full((batch_size, max_len), pad_token_id, dtype=torch.long)
    attention_mask_padded = torch.zeros((batch_size, max_len), dtype=torch.long)
    pixel_values_list = []
    image_grid_thw_list = []
    metas = []
    for i, item in enumerate(batch):
        seq = item["input_ids"]
        L = seq.size(0)
        input_ids_padded[i, max_len - L:] = seq  # right-aligned, pad tokens on the left
        attention_mask_padded[i, max_len - L:] = item["attention_mask"]
        pixel_values_list.append(item["pixel_values"])
        image_grid_thw_list.append(item["image_grid_thw"])
        metas.append(item["meta"])
    # Stacking requires every image to have been resized to the same size.
    pixel_values = torch.stack(pixel_values_list, dim=0)
    image_grid_thw = torch.stack(image_grid_thw_list, dim=0)
    return {
        "input_ids": input_ids_padded,
        "attention_mask": attention_mask_padded,
        "pixel_values": pixel_values,
        "image_grid_thw": image_grid_thw,
        "metas": metas,
    }
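A quick way to sanity-check the left padding is to call collate_fn on two dummy samples (the tensor shapes below are placeholders, not real Qwen3-VL processor outputs):

dummy_batch = [
    {
        "input_ids": torch.tensor([1, 2, 3]),
        "attention_mask": torch.tensor([1, 1, 1]),
        "pixel_values": torch.zeros(4, 8),
        "image_grid_thw": torch.tensor([1, 2, 2]),
        "meta": ("id1", "name1", "url1"),
    },
    {
        "input_ids": torch.tensor([4, 5]),
        "attention_mask": torch.tensor([1, 1]),
        "pixel_values": torch.zeros(4, 8),
        "image_grid_thw": torch.tensor([1, 2, 2]),
        "meta": ("id2", "name2", "url2"),
    },
]
out = collate_fn(dummy_batch, pad_token_id=0)
print(out["input_ids"])       # tensor([[1, 2, 3], [0, 4, 5]])
print(out["attention_mask"])  # tensor([[1, 1, 1], [0, 1, 1]])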

The key code of MyIterableDataset was already provided above.

dataset = MyIterableDataset(INPUT_FILE, processor, rank, world_size)

dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    # sampler=sampler,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=lambda b: collate_fn(b, pad_token_id),
    persistent_workers=PERSISTENT_WORKERS if NUM_WORKERS > 0 else False,
)
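pad_token_id is not shown above; presumably it comes from the processor's tokenizer, e.g.:

pad_token_id = processor.tokenizer.pad_token_id
if pad_token_id is None:
    # Fall back to the EOS token if no dedicated pad token is defined.
    pad_token_id = processor.tokenizer.eos_token_id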

Inference code:
for batch in tqdm(dataloader, desc=f"Rank {rank} infer"):
    # Move the inputs to the GPU with non_blocking=True (requires pin_memory=True).
    input_ids = batch["input_ids"].to(device_str, non_blocking=True)
    attention_mask = batch["attention_mask"].to(device_str, non_blocking=True)
    pixel_values = batch["pixel_values"].to(device_str, non_blocking=True)
    image_grid_thw = batch["image_grid_thw"].to(device_str, non_blocking=True)
    metas = batch["metas"]

    seq_len = input_ids.size(1)  # length of the input sequence (including the left padding)

    with torch.no_grad():
        generated = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            image_grid_thw=image_grid_thw,
            max_new_tokens=MAX_NEW_TOKENS,
            pad_token_id=pad_token_id,
            # use_cache=False,
        )

    # Keep only the newly generated suffix after the (padded) prompt length.
    batch_texts = []
    for i in range(generated.size(0)):
        gen_ids = generated[i, seq_len:].cpu().numpy().tolist()
        batch_texts.append(gen_ids)
    # Optionally free the generated tensor:
    # del generated

    # Decode (the processor is already available in each process).
    decoded_texts = processor.batch_decode(
        batch_texts, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
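What to do with decoded_texts is not shown; one option, still inside the batch loop, is to pair each answer with its metadata and append it to a JSONL file (out_f and the json import are assumptions, not part of the original code):

    # Assumes `import json` and an open file handle `out_f` created before the loop.
    for (rawid, name, pic_url), text in zip(metas, decoded_texts):
        out_f.write(json.dumps(
            {"id": rawid, "name": name, "pic_url": pic_url, "answer": text},
            ensure_ascii=False,
        ) + "\n")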

Yes, I'm experiencing the same issue.
@longjun1118, unfortunately the problem persists with your code as well.

Description

I'm experiencing inconsistent outputs when comparing individual inference vs batch inference with Qwen3-VL-2B-Instruct. Despite using deterministic settings (temperature=0.0, do_sample=False, num_beams=1), one sample produces different results depending on whether it's processed individually or in a batch.

Issue Details

  • Sample 1: Outputs differ between individual (1153 chars) and batch inference (1159 chars) ❌
  • Sample 2: Outputs match perfectly ✓
  • Using the same image and prompts in both modes
  • Tried both left and right padding - neither resolves the issue

Environment

  • Model: Qwen/Qwen3-VL-2B-Instruct
  • Device: CUDA
  • Precision: torch.bfloat16
  • Attention: flash_attention_2

Reproducible Code

import torch
from qwen_vl_utils import process_vision_info
from transformers import AutoModelForImageTextToText, AutoProcessor

# Set device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load model and processor
model_id = "Qwen/Qwen3-VL-2B-Instruct"
print(f"\nLoading model: {model_id}")
model = AutoModelForImageTextToText.from_pretrained(
    model_id,
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "left"  # Tried both "left" and "right"


messages1 = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
            },
            {"type": "text", "text": "Describe this image in detail."},
        ],
    }
]
messages2 = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg",
            },
            {"type": "text", "text": "What do you see in this image? Please provide a comprehensive description."},
        ],
    }
]
# Combine messages for batch processing
messages = [messages1, messages2]

# Run each sample separately
print("\n=== INDIVIDUAL INFERENCE ===")

individual_outputs = []
for idx, msg in enumerate([messages1, messages2], 1):
    text = processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    image_inputs, _ = process_vision_info([msg])
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")

    generated_ids = model.generate(**inputs, max_new_tokens=256, num_beams=1, do_sample=False, temperature=0.0)
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    individual_outputs.append(output_text[0])
    print(f"Sample {idx}: {output_text[0]}")

# Batch Inference
print("\n=== BATCH INFERENCE ===")

texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True) for msg in messages]
image_inputs, _ = process_vision_info(messages)
inputs = processor(
    text=texts,
    images=image_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

generated_ids = model.generate(**inputs, max_new_tokens=256, num_beams=1, do_sample=False, temperature=0.0)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_texts = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
for idx, output in enumerate(output_texts, 1):
    print(f"Sample {idx}: {output}")

# Text Comparison
print("\n=== COMPARISON ===")
for idx in range(len(output_texts)):
    match = individual_outputs[idx] == output_texts[idx]
    status = "✓ MATCH" if match else "✗ MISMATCH"
    print(f"Sample {idx + 1}: {status}")
    if not match:
        print(f"  Length: {len(individual_outputs[idx])} vs {len(output_texts[idx])}")

all_match = all(individual_outputs[i] == output_texts[i] for i in range(len(output_texts)))
print(f"\nOverall: {'✓ All match' if all_match else '✗ Differences found'}")

Output

=== COMPARISON ===
Sample 1: ✗ MISMATCH
  Length: 1153 vs 1159
Sample 2: ✓ MATCH

Overall: ✗ Differences found

What I've Tried

✅ Set temperature=0.0, do_sample=False, num_beams=1 for deterministic generation
✅ Tried padding_side="left" - still produces mismatches
✅ Tried padding_side="right" - still produces mismatches
✅ Used the same generation parameters in both individual and batch modes
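One further check that might help localize the divergence (a hedged sketch reusing model, processor, texts, image_inputs, and the batched inputs from the script above; with padding_side="left", position -1 of the prefill is the last real prompt token for every sample): compare the last-position prefill logits of sample 1 processed alone against sample 1 inside the batch. If they already differ by more than bf16 noise, the mismatch originates in the batched forward pass rather than in the generation loop.

with torch.no_grad():
    # Sample 1 processed on its own.
    single_inputs = processor(
        text=[texts[0]], images=[image_inputs[0]], padding=True, return_tensors="pt"
    ).to("cuda")
    single_logits = model(**single_inputs).logits[0, -1].float()

    # Sample 1 as row 0 of the two-sample batch built above.
    batch_logits = model(**inputs).logits[0, -1].float()

print("max |logit diff| for sample 1:", (single_logits - batch_logits).abs().max().item())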

How do you run the Qwen3-VL model in batch mode with transformers?
