Upload folder using huggingface_hub
Files changed:
- .hfd/aria2c_urls.txt +0 -0
- .hfd/last_download_command +1 -0
- .hfd/repo_metadata.json +1 -0
- configuration_r.py +0 -3
- modeling_r.py +2 -84
- processing_r.py +4 -31
.hfd/aria2c_urls.txt
ADDED (empty file)
.hfd/last_download_command
ADDED
@@ -0,0 +1 @@
+REPO_ID=YannQi/R-4B TOOL=aria2c INCLUDE_PATTERNS= EXCLUDE_PATTERNS= DATASET=0 HF_USERNAME= HF_TOKEN= HF_TOKEN=https://huggingface.co REVISION=main
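The stored command records an aria2c-based hfd download of the repository at revision main. For reference, a roughly equivalent pull using only huggingface_hub (a hedged sketch: the repo id and revision are taken from the command above, the rest is standard library usage):

    from huggingface_hub import snapshot_download

    # Fetch YannQi/R-4B at revision "main" into the local cache
    # (no token is needed for this public repo).
    local_path = snapshot_download(repo_id="YannQi/R-4B", revision="main")
    print(local_path)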
.hfd/repo_metadata.json
ADDED
@@ -0,0 +1 @@
+{"_id":"6899c7b833b8a4a0398a0ed2","id":"YannQi/R-4B","private":false,"pipeline_tag":"visual-question-answering","tags":["safetensors","R","visual-question-answering","custom_code","en","base_model:Qwen/Qwen3-4B","base_model:finetune:Qwen/Qwen3-4B","license:apache-2.0","region:us"],"downloads":0,"likes":3,"modelId":"YannQi/R-4B","author":"YannQi","sha":"9fcd58d9d7b03add99ea92df619b24fa60a0e1ac","lastModified":"2025-08-11T11:55:23.000Z","gated":false,"disabled":false,"model-index":null,"config":{"auto_map":{"AutoConfig":"configuration_r.RConfig","AutoModel":"modeling_r.RForConditionalGeneration","AutoModelForCausalLM":"modeling_r.RForConditionalGeneration"},"architectures":["RForConditionalGeneration"],"model_type":"R","tokenizer_config":{"bos_token":null,"eos_token":"<|im_end|>","pad_token":"<|endoftext|>","unk_token":null},"chat_template_jinja":"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n'}}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all video then #}{% for content in message['content'] | selectattr('type', 'equalto', 'video') %}{{ '<video>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] }}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] }}{% endgeneration %}{% endfor %}{% endif %}{{'<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n<think>' }}{% endif %}{%- if add_generation_prompt %}{%- if thinking_mode is defined and thinking_mode == 'short' %}{{- '\n\n</think>\n\n' }}{%- endif %}{%- if thinking_mode is defined and thinking_mode == 'long' %}{{- '\n' }}{%- endif %}{%- endif %}\n"},"cardData":{"license":"apache-2.0","language":["en"],"base_model":["Qwen/Qwen3-4B"],"pipeline_tag":"visual-question-answering"},"siblings":[{"rfilename":".gitattributes"},{"rfilename":"README.md"},{"rfilename":"added_tokens.json"},{"rfilename":"asset/R-4B.png"},{"rfilename":"asset/performance.png"},{"rfilename":"chat_template.jinja"},{"rfilename":"config.json"},{"rfilename":"configuration_r.py"},{"rfilename":"generation_config.json"},{"rfilename":"image_processing_r.py"},{"rfilename":"image_processing_r_fast.py"},{"rfilename":"merges.txt"},{"rfilename":"model-00001-of-00003.safetensors"},{"rfilename":"model-00002-of-00003.safetensors"},{"rfilename":"model-00003-of-00003.safetensors"},{"rfilename":"model.safetensors.index.json"},{"rfilename":"modeling_r.py"},{"rfilename":"preprocessor_config.json"},{"rfilename":"processing_r.py"},{"rfilename":"processor_config.json"},{"rfilename":"special_tokens_map.json"},{"rfilename":"tokenizer.json"},{"rfilename":"tokenizer_config.json"},{"rfilename":"video_preprocessor_config.json"},{"rfilename":"vocab.json"}],"spaces":[],"createdAt":"2025-08-11T10:36:40.000Z","safetensors":{"parameters":{"BF16":4819012384},"total":4819012384},"usedStorage":9653302738}
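The metadata wires the custom classes through auto_map (configuration_r.RConfig, modeling_r.RForConditionalGeneration), and the embedded chat template switches between long and short reasoning via a thinking_mode template variable. A minimal loading sketch under those assumptions (the AutoProcessor mapping is not shown in this diff, so it is assumed here; thinking_mode is just an extra keyword forwarded to the Jinja template):

    import torch
    from transformers import AutoModelForCausalLM, AutoProcessor

    model_id = "YannQi/R-4B"

    # trust_remote_code is required: the architecture lives in the repo's
    # configuration_r.py / modeling_r.py rather than in transformers itself.
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, trust_remote_code=True
    )

    messages = [
        {"role": "user", "content": [
            {"type": "image"},
            {"type": "text", "text": "Describe this image."},
        ]},
    ]

    # Extra kwargs are exposed to the template: thinking_mode="long" keeps the
    # "<think>" block open, while "short" closes it immediately.
    prompt = processor.tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False, thinking_mode="long"
    )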
configuration_r.py
CHANGED
@@ -27,7 +27,6 @@ class RConfig(PretrainedConfig):
     model_type = "R"
     attribute_map = {
         "image_token_id": "image_token_index",
-        "video_token_id": "video_token_index",
     }
     sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}

@@ -36,7 +35,6 @@ class RConfig(PretrainedConfig):
         vision_config=None,
         text_config=None,
         image_token_index=151646,
-        video_token_index=151647,
         projector_hidden_act="gelu",
         vision_feature_select_strategy="full",
         vision_feature_layer=-1,
@@ -48,7 +46,6 @@ class RConfig(PretrainedConfig):
         **kwargs,
     ):
         self.image_token_index = image_token_index
-        self.video_token_index = video_token_index
         self.projector_hidden_act = projector_hidden_act
         self.multimodal_projector_bias = multimodal_projector_bias

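After this change the config keeps only the image-token mapping. A hedged sketch of the effect (the default value comes from the diff; loading assumes the remote-code setup shown above):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("YannQi/R-4B", trust_remote_code=True)

    # attribute_map still exposes image_token_id -> image_token_index (default 151646),
    # but the video_token_index attribute is gone after this commit.
    print(config.image_token_index)
    print(hasattr(config, "video_token_index"))  # expected: False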
modeling_r.py
CHANGED
@@ -44,8 +44,6 @@ class RModelOutputWithPast(BaseModelOutputWithPast):

     image_hidden_states: Optional[torch.FloatTensor] = None

-    video_hidden_states: Optional[torch.FloatTensor] = None
-

 @dataclass
 class RCausalLMOutputWithPast(ModelOutput):
@@ -57,8 +55,6 @@ class RCausalLMOutputWithPast(ModelOutput):
     attentions: Optional[tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None

-    video_hidden_states: Optional[torch.FloatTensor] = None
-

 class RPooler(nn.Module):
     def __init__(self, config):
@@ -364,8 +360,6 @@ class RModel(RPreTrainedModel):
         input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
         image_sizes: Optional[torch.LongTensor] = None,
-        pixel_values_videos: torch.FloatTensor = None,
-        image_sizes_videos: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[list[torch.FloatTensor]] = None,
@@ -403,9 +397,9 @@ class RModel(RPreTrainedModel):
         if (input_ids is None) ^ (inputs_embeds is not None):
             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

-        if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
+        if pixel_values is not None and inputs_embeds is not None:
             raise ValueError(
-                "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, "
+                "You cannot specify both `pixel_values` and `inputs_embeds` at the same time, "
                 "and must specify either one"
             )
         if inputs_embeds is None:
@@ -434,30 +428,6 @@ class RModel(RPreTrainedModel):
             image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
             inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

-        # Video are simply embedded and further pooled to decrease seq len
-        if pixel_values_videos is not None:
-            video_features = self.get_video_features(
-                pixel_values_videos,
-                vision_feature_layer=vision_feature_layer,
-                vision_feature_select_strategy=vision_feature_select_strategy,
-            )
-            image_newline = (
-                self.image_newline[None, None, :].repeat(video_features.shape[0], 1, 1).to(video_features.device)
-            )
-            video_features = torch.cat((video_features, image_newline), dim=1)
-            video_features = video_features.flatten(0, 1)
-
-            special_video_mask = (input_ids == self.config.video_token_id).unsqueeze(-1)
-            special_video_mask = special_video_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
-            if not is_torchdynamo_compiling() and inputs_embeds[special_video_mask].numel() != video_features.numel():
-                n_video_tokens = (input_ids == self.config.video_token_id).sum()
-                n_video_features = video_features.shape[0]
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                )
-            video_features = video_features.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
-
         outputs = self.language_model(
             attention_mask=attention_mask,
             position_ids=position_ids,
@@ -477,7 +447,6 @@ class RModel(RPreTrainedModel):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=image_features if pixel_values is not None else None,
-            video_hidden_states=video_features if pixel_values_videos is not None else None,
         )

     def apply_pooling(self, image_features):
@@ -494,36 +463,6 @@ class RModel(RPreTrainedModel):
         image_features = image_features.view(batch_frames, -1, dim)
         return image_features

-    def get_video_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Union[int, list[int]],
-        vision_feature_select_strategy: str,
-    ):
-        batch_size, frames, channels, height, width = pixel_values.shape
-        pixel_values = pixel_values.view(batch_size * frames, channels, height, width)
-        video_features = self.vision_tower(pixel_values, output_hidden_states=True)
-
-        # If we have one vision feature layer, return the corresponding hidden states,
-        # otherwise, select the hidden states of each feature layer and concatenate them
-        if isinstance(vision_feature_layer, int):
-            selected_video_feature = video_features.hidden_states[vision_feature_layer]
-        else:
-            hs_pool = [video_features.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
-            selected_video_feature = torch.cat(hs_pool, dim=-1)
-
-        if vision_feature_select_strategy == "default":
-            selected_video_feature = selected_video_feature[:, 1:]
-        elif vision_feature_select_strategy == "full":
-            selected_video_feature = selected_video_feature
-        video_features = self.multi_modal_projector(selected_video_feature)
-
-        video_features = self.apply_pooling(video_features)
-        video_features = video_features.reshape(batch_size, frames * video_features.shape[1], -1)
-
-        return video_features
-
-
 class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
     _checkpoint_conversion_mapping = {
         "^language_model.model": "model.language_model",
@@ -599,8 +538,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         input_ids: torch.LongTensor = None,
         pixel_values: torch.FloatTensor = None,
         image_sizes: Optional[torch.LongTensor] = None,
-        pixel_values_videos: torch.FloatTensor = None,
-        image_sizes_videos: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[list[torch.FloatTensor]] = None,
@@ -641,9 +578,7 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         outputs = self.model(
             input_ids=input_ids,
             pixel_values=pixel_values,
-            pixel_values_videos=pixel_values_videos,
             image_sizes=image_sizes,
-            image_sizes_videos=image_sizes_videos,
             vision_aspect_ratio=vision_aspect_ratio,
             vision_feature_layer=vision_feature_layer,
             vision_feature_select_strategy=vision_feature_select_strategy,
@@ -679,7 +614,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=outputs.image_hidden_states,
-            video_hidden_states=outputs.video_hidden_states,
         )

     def prepare_inputs_for_generation(
@@ -689,8 +623,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
         inputs_embeds=None,
         pixel_values=None,
         image_sizes=None,
-        pixel_values_videos=None,
-        image_sizes_videos=None,
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
@@ -713,8 +645,6 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):
             # Otherwise we need pixel values to be passed to model
             model_inputs["pixel_values"] = pixel_values
             model_inputs["image_sizes"] = image_sizes
-            model_inputs["pixel_values_videos"] = pixel_values_videos
-            model_inputs["image_sizes_videos"] = image_sizes_videos

         return model_inputs

@@ -754,17 +684,5 @@ class RForConditionalGeneration(RPreTrainedModel, GenerationMixin):

         return causal_mask

-    def get_video_features(
-        self,
-        pixel_values: torch.FloatTensor,
-        vision_feature_layer: Optional[Union[int, list[int]]] = None,
-        vision_feature_select_strategy: Optional[str] = None,
-    ):
-        return self.model.get_video_features(
-            pixel_values=pixel_values,
-            vision_feature_layer=vision_feature_layer,
-            vision_feature_select_strategy=vision_feature_select_strategy,
-        )
-

 __all__ = ["RModel", "RForConditionalGeneration", "RPreTrainedModel"]
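The surviving image path in RModel.forward places projected image features into the token embeddings with a boolean mask and masked_scatter. A self-contained toy illustration of that mechanism (shapes and batch values are made up; the placeholder id follows the config default, independent of the model weights):

    import torch

    # Hypothetical example: 1 sequence of 6 tokens, hidden size 4, image placeholder id 151646.
    image_token_id = 151646
    input_ids = torch.tensor([[1, 151646, 151646, 151646, 2, 3]])
    inputs_embeds = torch.zeros(1, 6, 4)

    # Three "image tokens" -> three rows of projected image features.
    image_features = torch.arange(12, dtype=torch.float32).reshape(3, 4)

    # Same pattern as the retained modeling code: expand the token mask to the
    # embedding shape, then scatter the flattened image features into those slots.
    special_image_mask = (input_ids == image_token_id).unsqueeze(-1)
    special_image_mask = special_image_mask.expand_as(inputs_embeds)
    inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

    print(inputs_embeds[0, 1:4])  # the three image-feature rows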
processing_r.py
CHANGED
@@ -36,61 +36,49 @@ class RProcessorKwargs(ProcessingKwargs, total=False):

         },
         "image_kwargs": {},
-        "videos_kwargs": {},
     }


 class RProcessor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer", "video_processor"]
+    attributes = ["image_processor", "tokenizer"]
     valid_kwargs = [
         "chat_template",
         "num_image_tokens",
         "image_processor_type",
         "vision_feature_select_strategy",
         "image_token",
-        "video_token",
         "vision_aspect_ratio",
     ]
     image_processor_class = "AutoImageProcessor"
     tokenizer_class = "AutoTokenizer"
-    video_processor_class = "AutoVideoProcessor"

     def __init__(
         self,
         image_processor=None,
         tokenizer=None,
-        video_processor=None,
         num_image_tokens=None,
         vision_feature_select_strategy=None,
         chat_template=None,
         image_token="<image>",
-        video_token="<video>",
         vision_aspect_ratio="anyres",
         **kwargs,
     ):
         self.num_image_tokens = num_image_tokens
         self.vision_feature_select_strategy = vision_feature_select_strategy
         self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
-        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
         self.image_token_id = (
             tokenizer.image_token_id
             if getattr(tokenizer, "image_token_id", None)
             else tokenizer.convert_tokens_to_ids(self.image_token)
         )
-        self.video_token_id = (
-            tokenizer.video_token_id
-            if getattr(tokenizer, "video_token_id", None)
-            else tokenizer.convert_tokens_to_ids(self.video_token)
-        )
         self.vision_aspect_ratio = vision_aspect_ratio
-        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)

     def __call__(
         self,
         images: ImageInput = None,
         text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
         audio=None,
-        videos=None,
         **kwargs: Unpack[RProcessorKwargs],
     ) -> BatchFeature:
         output_kwargs = self._merge_kwargs(
@@ -104,7 +92,7 @@ class RProcessor(ProcessorMixin):
         elif not isinstance(text, list) and not isinstance(text[0], str):
             raise ValueError("Invalid input text. Please provide a string, or a list of strings")

-        image_inputs = video_inputs = {}
+        image_inputs = {}

         if images is not None:
             image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
@@ -119,28 +107,13 @@ class RProcessor(ProcessorMixin):
                 text, image_sizes, height, width, self.image_token, batch_num_images
             )

-        if videos is not None:
-            video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
-
-            one_video = video_inputs.get("pixel_values_videos")[0]
-            if isinstance(video_inputs.get("pixel_values_videos")[0], (list, tuple)):
-                one_video = np.array(one_video)
-            else:
-                one_video = to_numpy_array(one_video)
-            height, width = get_image_size(one_video[0], channel_dim=output_kwargs["images_kwargs"].get("data_format"))
-            num_frames = one_video.shape[0]  # frame dim is always after batch dim
-            patches_height_width = int(math.sqrt(self.num_image_tokens))
-            pooled_height_width = math.ceil(patches_height_width / 2)
-            num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
-            text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
-
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)

         text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
         self._check_special_mm_tokens(text, text_inputs, modalities=["image"])


-        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)

     def _expand_image_tokens(
         self,
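With the video arguments gone, the processor accepts only text and images. A hedged usage sketch (assuming the processor loads through AutoProcessor with remote code, as above; the "<image>" placeholder matches the default image_token in the diff):

    from PIL import Image
    from transformers import AutoProcessor

    processor = AutoProcessor.from_pretrained("YannQi/R-4B", trust_remote_code=True)

    image = Image.open("example.jpg")
    text = "<image>\nWhat is shown in this picture?"

    # __call__ now takes images and text only; it returns a BatchFeature that
    # typically combines the tokenizer outputs (input_ids, attention_mask) with
    # the image processor outputs (pixel_values, image_sizes).
    inputs = processor(images=image, text=text, return_tensors="pt")
    print(inputs.keys())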