performance optimization
Files changed:
- generation_config.json         +1    -0
- modeling_ernie_45t_vl.py     +115   -28
- processing_ernie_45t_vl.py     +7  -298
- tokenization_ernie_45t_vl.py +322    -0
- tokenizer_config.json          +1    -1
generation_config.json (CHANGED)

@@ -1,6 +1,7 @@
 {
     "top_p": 0.8,
     "temperature": 0.2,
+    "do_sample": true,
     "pad_token_id": 0,
     "bos_token_id": 1,
     "eos_token_id": 2,
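The only functional change here is `do_sample: true`: with the default greedy search, `generate` does not apply `top_p` or `temperature`, so enabling sampling is what makes the two existing knobs take effect. A minimal check of the shipped defaults (the checkpoint id below is illustrative, not part of this commit):

    from transformers import GenerationConfig

    # Illustrative repo id; any checkpoint carrying the generation_config.json above behaves the same.
    cfg = GenerationConfig.from_pretrained("baidu/ERNIE-4.5-VL")
    print(cfg.do_sample, cfg.top_p, cfg.temperature)  # True 0.8 0.2
    # model.generate(**inputs) picks these defaults up automatically.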
    	
modeling_ernie_45t_vl.py (CHANGED)

@@ -27,6 +27,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn.attention import SDPBackend, sdpa_kernel
 
 from transformers.activations import ACT2FN
 from transformers.generation import GenerationMixin

@@ -321,6 +322,7 @@ class Ernie4_5_Attention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.is_gqa = (
             self.num_key_value_heads is not None

@@ -373,7 +375,10 @@ class Ernie4_5_Attention(nn.Module):
             freq_allocation=self.freq_allocation,
         )
         self.config = config
-
+        if self.config.use_flash_attention:
+            self.attn_func = self._flash_attention_wrapper
+        else:
+            self.attn_func = self.core_attn
 
     def forward(
         self,

@@ -446,6 +451,47 @@ class Ernie4_5_Attention(nn.Module):
         )
         return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
 
+    def _flash_attention_wrapper(
+        self,
+        q,
+        k,
+        v,
+        attention_mask=None,
+        attn_mask_start_row_indices=None,
+        seq_length=None,
+    ):
+        """Wrapper for flash attention implementation.
+        Args:
+            q (torch.Tensor): Query tensor
+            k (torch.Tensor): Key tensor
+            v (torch.Tensor): Value tensor
+            attention_mask (Optional[torch.Tensor]): Attention mask
+            attn_mask_start_row_indices (Optional[torch.Tensor]): Variable length indices
+            seq_length (Optional[int]): Sequence length
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: Attention output and weights
+        """
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+
+        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+            out = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                attn_mask=None,
+                dropout_p=self.config.attention_probs_dropout_prob,
+                is_causal=q.shape[-2] == k.shape[-2],
+                scale=1
+                / (getattr(self.config, "scale_qk_coeff", 1.0) * self.head_dim**0.5),
+                enable_gqa=self.is_gqa,
+            )
+        out = out.transpose(1, 2)
+        out = out.contiguous().view(out.size(0), out.size(1), -1)
+
+        return out, None
+
     def core_attn(
         self,
         q,

@@ -493,19 +539,13 @@ class Ernie4_5_Attention(nn.Module):
         if getattr(self.config, "scale_qk_coeff", 1.0) != 1.0:
             product = product * getattr(self.config, "scale_qk_coeff", 1.0)
 
-
-
-
-
-
-
-
-            mask = torch.triu(
-                torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
-                diagonal=1,
-            )
-            product = product.masked_fill(mask, float("-inf"))
-            weights = F.softmax(product, dim=-1)
+        seq_len = product.size(-1)
+        mask = torch.triu(
+            torch.ones((seq_len, seq_len), dtype=torch.bool, device=product.device),
+            diagonal=1,
+        )
+        product = product.masked_fill(mask, float("-inf"))
+        weights = F.softmax(product, dim=-1)
 
         weights = weights.to(origin_dtype)
 
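For reference, a standalone sketch of the kernel-selection pattern the new `_flash_attention_wrapper` is built on: `sdpa_kernel(SDPBackend.FLASH_ATTENTION)` restricts `scaled_dot_product_attention` to the FlashAttention backend, `is_causal` stands in for the explicit triangular mask, and `enable_gqa` lets grouped-query layouts run without repeating K/V tensors. Shapes, dtype, and device below are illustrative; `enable_gqa` needs a recent PyTorch (roughly 2.5+), and the flash backend expects fp16/bf16 tensors on CUDA.

    import torch
    import torch.nn.functional as F
    from torch.nn.attention import SDPBackend, sdpa_kernel

    batch, q_heads, kv_heads, seq, head_dim = 2, 8, 2, 128, 64
    q = torch.randn(batch, q_heads, seq, head_dim, device="cuda", dtype=torch.bfloat16)
    k = torch.randn(batch, kv_heads, seq, head_dim, device="cuda", dtype=torch.bfloat16)
    v = torch.randn(batch, kv_heads, seq, head_dim, device="cuda", dtype=torch.bfloat16)

    # Only the FlashAttention kernel is allowed inside this context; it raises an error
    # instead of silently falling back if the inputs are not flash-compatible.
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=True)

    print(out.shape)  # torch.Size([2, 8, 128, 64])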
@@ -1803,7 +1843,7 @@ class MOEAllGatherLayerV2(MOELayer):
         enable_reverse_token_drop=False,
         all_to_all_dropout=0,
         group_experts=False,
-        use_expert_out_alltoall=True,
+        use_expert_out_alltoall=True,
         use_expert_alltoall_overlap=False,
         use_padding=True,
         dense_token_type=3,  # considerd as dense tokens (no moe)

@@ -2729,7 +2769,6 @@ class Ernie4_5_PretrainedModel(PreTrainedModel):
     config_class = Ernie4_5_MoEConfig
     base_model_prefix = "ernie"
     _no_split_modules = ["Ernie4_5_DecoderLayer"]
-    # _keep_in_fp32_modules = ["mlp.gate", "e_score_correction_bias"]
 
 
 class Ernie4_5_Model(Ernie4_5_PretrainedModel):

@@ -2876,7 +2915,6 @@ class Ernie4_5_Model(Ernie4_5_PretrainedModel):
             past_key_value = (
                 past_key_values[idx] if past_key_values is not None else None
             )
-
             layer_outputs = decoder_layer(
                 hidden_states,
                 attention_mask,

@@ -3224,15 +3262,61 @@ class Ernie4_5_MoeForCausalLM(Ernie4_5_PretrainedModel, GenerationMixin):
         """
         return self.model
 
-
-
-
-
-
-
-
-
-
+    # @staticmethod
+    def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder=False):
+        """
+        Updates model kwargs for generation.
+
+        Args:
+            outputs (Any): Model outputs.
+            model_kwargs (dict): Current model kwargs.
+            is_encoder_decoder (bool): Whether using encoder-decoder architecture.
+
+        Returns:
+            dict: Updated model kwargs.
+        """
+        # update cache
+        if isinstance(outputs, tuple) and len(outputs) > 1 and not isinstance(outputs[1], torch.Tensor):
+            model_kwargs["past_key_values"] = outputs[1]
+
+        if isinstance(outputs, CausalLMOutputWithCrossAttentions) and "past_key_values" in outputs:
+            model_kwargs["past_key_values"] = outputs.past_key_values
+
+        # update token_type_ids with last value
+        if "token_type_ids" in model_kwargs and model_kwargs["token_type_ids"] is not None:
+            token_type_ids = model_kwargs["token_type_ids"]
+            model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1:]], dim=-1)
+
+        if not is_encoder_decoder and model_kwargs.get("attention_mask", None) is not None:
+            # update attention mask
+            attention_mask = model_kwargs["attention_mask"]
+            model_kwargs["attention_mask"] = torch.cat(
+                [
+                    attention_mask,
+                    torch.ones((attention_mask.shape[0], 1), dtype=torch.int64, device=attention_mask.device),
+                ],
+                dim=-1,
+            )
+
+        # update role_ids
+        if "role_ids" in model_kwargs and model_kwargs["role_ids"] is not None:
+            role_ids = model_kwargs["role_ids"]
+            model_kwargs["role_ids"] = torch.cat([role_ids, role_ids[:, -1:]], dim=-1)
+
+        if self.config.get('rope_3d', False):
+            assert "position_ids" in model_kwargs, "position_ids must be provided if rope_3d is on"
+            position_ids = model_kwargs["position_ids"]
+            bsz = position_ids.shape[0]
+
+            max_position = position_ids.max(dim=1, keepdim=True)[0]  # [batch_size, 1, hidden_dim]
+            new_positions = max_position + 1
+
+            model_kwargs["position_ids"] = torch.cat(
+                [position_ids, new_positions],
+                dim=1
+            )
+
+        return model_kwargs
 
 
 class VisionMlp(nn.Module):

@@ -3943,7 +4027,10 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
             image_type_ids[:, -1:] if image_type_ids is not None else None
         )
 
-
+        if self.config.use_flash_attention:
+            attention_mask = None
+        else:
+            attention_mask = kwargs.get("attention_mask", None)
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
         if inputs_embeds is not None and past_key_values is None:

@@ -4091,7 +4178,7 @@ class Ernie4_5_VLMoeForConditionalGeneration(Ernie4_5_MoeForCausalLM):
 
         outputs = self.model(
             position_ids=position_ids,
-            attention_mask=
+            attention_mask=attention_mask,
             token_type_ids=token_type_ids,
             inputs_embeds=inputs_embeds,
             use_cache=use_cache,
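The new `_update_model_kwargs_for_generation` grows `attention_mask`, `token_type_ids`, `role_ids`, and (with `rope_3d` enabled) `position_ids` by one slot per decoding step instead of rebuilding them. A toy reproduction of just the 3-D rope position update (shapes are illustrative):

    import torch

    # Toy [batch, seq, 3] position ids, one (t, h, w) triple per token.
    position_ids = torch.tensor([[[0, 0, 0], [1, 0, 0], [2, 1, 1]]])

    # One decoding step: next position = max over the sequence + 1, appended on dim 1,
    # mirroring the rope_3d branch in the hunk above.
    new_positions = position_ids.max(dim=1, keepdim=True)[0] + 1
    position_ids = torch.cat([position_ids, new_positions], dim=1)

    print(position_ids.shape)   # torch.Size([1, 4, 3])
    print(position_ids[0, -1])  # tensor([3, 2, 2])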
    	
processing_ernie_45t_vl.py (CHANGED)

@@ -17,7 +17,6 @@
 import copy
 import io
 import os
-import re
 import math
 import random
 import requests

@@ -27,15 +26,13 @@ import hashlib
 import threading
 import uuid
 import decord
-from
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
 from PIL import Image, ImageDraw, ImageFont
 from PIL.ExifTags import TAGS
 from collections import defaultdict
-from typing import Any, Dict, List, Union
 from pathlib import Path
 from tempfile import NamedTemporaryFile as ntf
 

@@ -46,13 +43,8 @@ except:
     # moviepy 2.0
     import moviepy as mp
 
-
-
-from transformers.tokenization_utils_base import (
-    PaddingStrategy,
-    TextInput,
-)
-from transformers.utils import logging
+from .tokenization_ernie_45t_vl import Ernie4_5_VLTokenizer
+
 from transformers.utils import TensorType, logging
 from transformers.video_utils import VideoInput
 from transformers.processing_utils import ProcessorMixin

@@ -618,298 +610,15 @@ class Ernie_45T_VLImageProcessor(BaseImageProcessor):
         return BatchFeature(data=data, tensor_type=return_tensors)
 
 
-class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
-    """
-    Ernie4_5_VLTokenizer
-    """
-
-    vocab_files_names = {
-        "vocab_file": "tokenizer.model",
-    }
-    # Model input names expected by the tokenizer
-    model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
-    # Padding side (where to add padding tokens)
-    padding_side = "right"
-
-    def __init__(
-        self,
-        vocab_file,
-        bos_token="<s>",
-        cls_token="<cls>",
-        eos_token="</s>",
-        mask_token="<mask:0>",
-        pad_token="<pad>",
-        sep_token="<sep>",
-        unk_token="<unk>",
-        additional_special_tokens=None,
-        **kwargs,
-    ):
-        """
-        Initialize the Ernie4_5_VLTokenizer
-
-        Args:
-            vocab_file (str): Path to the tokenizer vocabulary model.
-            bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
-            cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
-            eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
-            mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
-            pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
-            sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
-            unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
-            additional_special_tokens (List[str], optional): Additional special tokens to use.
-                Defaults to `["<mask:1>", "<mask:7>"]`.
-            **kwargs (dict): Additional keyword arguments passed along to the superclass.
-        """
-
-        # Store vocabulary file path
-        self.vocab_file = vocab_file
-        # Initialize SentencePiece processor
-        self.sp_model = spm.SentencePieceProcessor()
-        # Load the vocabulary model
-        self.sp_model.Load(vocab_file)
-
-        # Set default additional special tokens if none provided
-        if additional_special_tokens is None:
-            additional_special_tokens = ["<mask:1>", "<mask:7>"]
-        super().__init__(
-            bos_token=bos_token,
-            cls_token=cls_token,
-            eos_token=eos_token,
-            mask_token=mask_token,
-            pad_token=pad_token,
-            sep_token=sep_token,
-            unk_token=unk_token,
-            additional_special_tokens=additional_special_tokens,
-            **kwargs,
-        )
-
-    @property
-    def space_token(self):
-        """Return the space token"""
-        return "<mask:1>"
-
-    @property
-    def space_token_id(self):
-        """Return the ID of the space token"""
-        return self.sp_model.piece_to_id("<mask:1>")
-
-    @property
-    def gend_token(self):
-        """Return the gender token"""
-        return "<mask:7>"
-
-    @property
-    def gend_token_id(self):
-        """Return the ID of the gender token"""
-        return self.sp_model.piece_to_id("<mask:7>")
-
-    @property
-    def im_start_id(self):
-        """Return the ID of the image start token"""
-        return self.sp_model.piece_to_id("<|im_start|>")
-
-    @property
-    def im_end_id(self):
-        """Return the ID of the image end token"""
-        return self.sp_model.piece_to_id("<|im_end|>")
-
-    @property
-    def vocab_size(self):
-        """Return the size of the vocabulary"""
-        return self.sp_model.vocab_size()
-
-    def get_vocab(self):
-        """Return the vocabulary as a dictionary mapping tokens to IDs"""
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-
-    def _tokenize(self, text):
-        """Tokenize the input text into pieces"""
-        return self.sp_model.encode_as_pieces(text)
-
-    def _convert_token_to_id(self, token):
-        """Convert a token to its corresponding ID"""
-        return self.sp_model.piece_to_id(token)
-
-    def _convert_id_to_token(self, id):
-        """Convert an ID to its corresponding token"""
-        return self.sp_model.id_to_piece(id)
-
-    def convert_tokens_to_string(self, tokens):
-        """Convert a sequence of tokens back to a string"""
-        current_sub_tokens = []
-        out_string = ""
-
-        for token in tokens:
-            # Handle special tokens differently
-            if token in self.all_special_tokens:
-                out_string += self.sp_model.decode(current_sub_tokens) + token
-                current_sub_tokens = []
-            else:
-                current_sub_tokens.append(token)
-
-        # Add any remaining sub-tokens
-        out_string += self.sp_model.decode(current_sub_tokens)
-        return out_string
-
-    def prepare_for_model(self, *args, **kwargs):
-        """Prepare the tokenized inputs for the model"""
-        # Remove add_special_tokens if present (not supported)
-        if "add_special_tokens" in kwargs:
-            kwargs.pop("add_special_tokens")
-        return super().prepare_for_model(*args, **kwargs)
-
-    def save_vocabulary(
-        self, save_directory, filename_prefix: Optional[str] = None
-    ) -> Tuple[str]:
-        """
-        Save the vocabulary and special tokens file to a directory.
-
-        Args:
-            save_directory (`str`): The directory to save the vocabulary to
-            filename_prefix (`str`, optional): Prefix to add to the filename
-
-        Returns:
-            `Tuple(str)`: Paths to the saved files
-        """
-        if not os.path.isdir(save_directory):
-            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
-            return
-
-        # Construct output vocabulary file path
-        out_vocab_file = os.path.join(
-            save_directory,
-            (filename_prefix + "-" if filename_prefix else "")
-            + self.vocab_files_names["vocab_file"],
-        )
-
-        # Copy or create vocabulary file
-        if os.path.abspath(self.vocab_file) != os.path.abspath(
-            out_vocab_file
-        ) and os.path.isfile(self.vocab_file):
-            copyfile(self.vocab_file, out_vocab_file)
-        elif not os.path.isfile(self.vocab_file):
-            with open(out_vocab_file, "wb") as fi:
-                content_spiece_model = self.sp_model.serialized_model_proto()
-                fi.write(content_spiece_model)
-
-        return (out_vocab_file,)
-
-    def _decode(self, *args, **kwargs):
-        """Decode token_id back to text"""
-        # Remove some parameters that aren't used
-        kwargs.pop("clean_up_tokenization_spaces", None)
-        kwargs.pop("spaces_between_special_tokens", None)
-
-        # Call parent decode method with specific parameters
-        return super()._decode(
-            *args,
-            **kwargs,
-            clean_up_tokenization_spaces=False,
-            spaces_between_special_tokens=False,
-        )
-
-    def _pad(
-        self,
-        encoded_inputs: Dict,
-        max_length: Optional[int] = None,
-        padding_strategy=PaddingStrategy.DO_NOT_PAD,
-        pad_to_multiple_of: Optional[int] = None,
-        return_attention_mask: Optional[bool] = None,
-    ) -> dict:
-        """Pad the encoded inputs to the specified length"""
-        if return_attention_mask is None:
-            return_attention_mask = "attention_mask" in self.model_input_names
-        if return_attention_mask:
-            required_input = encoded_inputs[self.model_input_names[0]]
-            if padding_strategy == PaddingStrategy.LONGEST:
-                max_length = len(required_input)
-
-            # Adjust max_length if needed for multiple of padding
-            if (
-                max_length is not None
-                and pad_to_multiple_of is not None
-                and (max_length % pad_to_multiple_of != 0)
-            ):
-                max_length = (
-                    (max_length // pad_to_multiple_of) + 1
-                ) * pad_to_multiple_of
-
-            # Check if padding is needed
-            needs_to_be_padded = (
-                padding_strategy != PaddingStrategy.DO_NOT_PAD
-                and len(required_input) != max_length
-            )
-
-            # Handle attention mask if present
-            if (
-                "attention_mask" in encoded_inputs
-                and encoded_inputs["attention_mask"] is not None
-            ):
-                attention_mask = encoded_inputs.pop("attention_mask")
-                if isinstance(attention_mask, torch.Tensor):
-                    attention_mask = attention_mask.numpy()
-                elif isinstance(attention_mask, list):
-                    attention_mask = np.array(attention_mask)
-                elif not isinstance(attention_mask, np.ndarray):
-                    raise ValueError(
-                        f"Unexpected type {type(attention_mask)} of attention_mask, "
-                    )
-            else:
-                # Create default attention mask if none provided
-                attention_mask = np.tril(
-                    np.ones((len(required_input), len(required_input)), dtype=np.int64)
-                )
-                attention_mask = np.expand_dims(attention_mask, axis=0)
-
-            # Perform padding if needed
-            if needs_to_be_padded:
-                difference = max_length - len(required_input)
-                if self.padding_side == "right":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(0, difference)]
-                    else:
-                        pad_width = [(0, 0), (0, difference), (0, difference)]
-                elif self.padding_side == "left":
-                    if attention_mask.ndim == 1:
-                        pad_width = [(difference, 0)]
-                    else:
-                        pad_width = [(0, 0), (difference, 0), (difference, 0)]
-                else:
-                    raise ValueError(
-                        "Invalid padding strategy:" + str(self.padding_side)
-                    )
-
-                attention_mask = np.pad(
-                    attention_mask,
-                    pad_width=pad_width,
-                    mode="constant",
-                    constant_values=0,
-                )
-
-        # Call parent padding method
-        encoded_inputs = super()._pad(
-            encoded_inputs,
-            max_length,
-            padding_strategy=padding_strategy,
-            pad_to_multiple_of=pad_to_multiple_of,
-            return_attention_mask=False,
-        )
-
-        # Add attention mask back if needed
-        if return_attention_mask:
-            encoded_inputs["attention_mask"] = attention_mask.tolist()
-
-        return encoded_inputs
-
-
 RAW_VIDEO_DIR = "./download_tmp/raw_video/"
 RAW_IMAGE_DIR = "./download_tmp/raw_images/"
 EXTRACTED_FRAME_DIR = "./download_tmp/extracted_frames/"
 TMP_DIR = "./download_tmp/upload_tmp/"
 
 FONT_PATH = os.path.join(Path(__file__).parent.absolute(), "Roboto-Regular.ttf")
+if not os.path.exists(FONT_PATH):
+    ttf = requests.get("https://paddlenlp.bj.bcebos.com/vision-language-models/materials/Roboto-Regular.ttf")
+    open(FONT_PATH, "wb").write(ttf.content)
 
 
 def is_gif(data: bytes) -> bool:
@@ -1811,4 +1520,4 @@ class Ernie_45T_VLProcessor(ProcessorMixin):
         return list(tokenizer_input_names) + list(image_processor_input_names)
 
 
-__all__ = ["Ernie_45T_VLImageProcessor", "
+__all__ = ["Ernie_45T_VLImageProcessor", "Ernie_45T_VLProcessor"]
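With Ernie4_5_VLTokenizer moved into tokenization_ernie_45t_vl.py and re-imported here, the usual remote-code loading path is unchanged. A hedged sketch (the repo id is illustrative; tokenizer_config.json is also touched in this commit but its diff is not shown above, presumably to keep the auto classes pointed at the relocated module):

    from transformers import AutoTokenizer

    # trust_remote_code pulls the repo-local tokenization/processing modules.
    tok = AutoTokenizer.from_pretrained("baidu/ERNIE-4.5-VL", trust_remote_code=True)
    ids = tok("hello world")["input_ids"]
    print(ids, tok.decode(ids))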
    	
        tokenization_ernie_45t_vl.py
    ADDED
    
    | @@ -0,0 +1,322 @@ | |
| 1 | 
            +
            # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
         | 
| 2 | 
            +
            #
         | 
| 3 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 4 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 5 | 
            +
            # You may obtain a copy of the License at
         | 
| 6 | 
            +
            #
         | 
| 7 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 8 | 
            +
            #
         | 
| 9 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 10 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 11 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 12 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 13 | 
            +
            # limitations under the License.
         | 
| 14 | 
            +
             | 
| 15 | 
            +
            """Tokenization classes for Ernie_45T_VL."""
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            import os
         | 
| 18 | 
            +
            from shutil import copyfile
         | 
| 19 | 
            +
            from typing import Dict, List, Optional, Tuple
         | 
| 20 | 
            +
            import numpy as np
         | 
| 21 | 
            +
            import torch
         | 
| 22 | 
            +
            import sentencepiece as spm
         | 
| 23 | 
            +
            from transformers.tokenization_utils import PreTrainedTokenizer
         | 
| 24 | 
            +
            from transformers.tokenization_utils_base import (
         | 
| 25 | 
            +
                PaddingStrategy,
         | 
| 26 | 
            +
                TextInput,
         | 
| 27 | 
            +
            )
         | 
| 28 | 
            +
            from transformers.utils import logging
         | 
| 29 | 
            +
             | 
| 30 | 
            +
             | 
| 31 | 
            +
            logger = logging.get_logger(__name__)
         | 
| 32 | 
            +
             | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
            class Ernie4_5_VLTokenizer(PreTrainedTokenizer):
         | 
| 36 | 
            +
                """
         | 
| 37 | 
            +
                Ernie4_5_VLTokenizer
         | 
| 38 | 
            +
                """
         | 
| 39 | 
            +
             | 
| 40 | 
            +
                vocab_files_names = {
         | 
| 41 | 
            +
                    "vocab_file": "tokenizer.model",
         | 
| 42 | 
            +
                }
         | 
| 43 | 
            +
                # Model input names expected by the tokenizer
         | 
| 44 | 
            +
                model_input_names = ["input_ids", "position_ids", "attention_mask", "labels"]
         | 
| 45 | 
            +
                # Padding side (where to add padding tokens)
         | 
| 46 | 
            +
                padding_side = "right"
         | 
| 47 | 
            +
             | 
| 48 | 
            +
                def __init__(
         | 
| 49 | 
            +
                    self,
         | 
| 50 | 
            +
                    vocab_file,
         | 
| 51 | 
            +
                    bos_token="<s>",
         | 
| 52 | 
            +
                    cls_token="<cls>",
         | 
| 53 | 
            +
                    eos_token="</s>",
         | 
| 54 | 
            +
                    mask_token="<mask:0>",
         | 
| 55 | 
            +
                    pad_token="<pad>",
         | 
| 56 | 
            +
                    sep_token="<sep>",
         | 
| 57 | 
            +
                    unk_token="<unk>",
         | 
| 58 | 
            +
                    additional_special_tokens=None,
         | 
| 59 | 
            +
                    **kwargs,
         | 
| 60 | 
            +
                ):
         | 
| 61 | 
            +
                    """
         | 
| 62 | 
            +
                    Initialize the Ernie4_5_VLTokenizer
         | 
| 63 | 
            +
             | 
| 64 | 
            +
                    Args:
         | 
| 65 | 
            +
                        vocab_file (str): Path to the tokenizer vocabulary model.
         | 
| 66 | 
            +
                        bos_token (str, optional): The beginning of sequence token. Defaults to `"<s>"`.
         | 
| 67 | 
            +
                        cls_token (str, optional): The classifier token. Defaults to `"<cls>"`.
         | 
| 68 | 
            +
                        eos_token (str, optional): The end of sequence token. Defaults to `"</s>"`.
         | 
| 69 | 
            +
                        mask_token (str, optional): The masking token. Defaults to `"<mask:0>"`.
         | 
| 70 | 
            +
                        pad_token (str, optional): The padding token. Defaults to `"<pad>"`.
         | 
| 71 | 
            +
                        sep_token (str, optional): The separation token. Defaults to `"<sep>"`.
         | 
| 72 | 
            +
                        unk_token (str, optional): The unknown tokens symbol. Defaults to `"<unk>"`.
         | 
| 73 | 
            +
                        additional_special_tokens (List[str], optional): Additional special tokens to use.
         | 
| 74 | 
            +
                            Defaults to `["<mask:1>", "<mask:7>"]`.
         | 
| 75 | 
            +
                        **kwargs (dict): Additional keyword arguments passed along to the superclass.
         | 
| 76 | 
            +
                    """
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                    # Store vocabulary file path
         | 
| 79 | 
            +
                    self.vocab_file = vocab_file
         | 
| 80 | 
            +
                    # Initialize SentencePiece processor
         | 
| 81 | 
            +
                    self.sp_model = spm.SentencePieceProcessor()
         | 
| 82 | 
            +
                    # Load the vocabulary model
         | 
| 83 | 
            +
                    self.sp_model.Load(vocab_file)
         | 
| 84 | 
            +
             | 
| 85 | 
            +
                    # Set default additional special tokens if none provided
         | 
| 86 | 
            +
                    if additional_special_tokens is None:
         | 
| 87 | 
            +
                        additional_special_tokens = ["<mask:1>", "<mask:7>"]
         | 
| 88 | 
            +
                    super().__init__(
         | 
| 89 | 
            +
                        bos_token=bos_token,
         | 
| 90 | 
            +
                        cls_token=cls_token,
         | 
| 91 | 
            +
                        eos_token=eos_token,
         | 
| 92 | 
            +
                        mask_token=mask_token,
         | 
| 93 | 
            +
                        pad_token=pad_token,
         | 
| 94 | 
            +
                        sep_token=sep_token,
         | 
| 95 | 
            +
                        unk_token=unk_token,
         | 
| 96 | 
            +
                        additional_special_tokens=additional_special_tokens,
         | 
| 97 | 
            +
                        **kwargs,
         | 
| 98 | 
            +
                    )
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                @property
         | 
| 101 | 
            +
                def space_token(self):
         | 
| 102 | 
            +
                    """Return the space token"""
         | 
| 103 | 
            +
                    return "<mask:1>"
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                @property
         | 
| 106 | 
            +
                def space_token_id(self):
         | 
| 107 | 
            +
                    """Return the ID of the space token"""
         | 
| 108 | 
            +
                    return self.sp_model.piece_to_id("<mask:1>")
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                @property
         | 
| 111 | 
            +
                def gend_token(self):
         | 
| 112 | 
            +
                    """Return the gender token"""
         | 
| 113 | 
            +
                    return "<mask:7>"
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                @property
         | 
| 116 | 
            +
                def gend_token_id(self):
         | 
| 117 | 
            +
                    """Return the ID of the gender token"""
         | 
| 118 | 
            +
                    return self.sp_model.piece_to_id("<mask:7>")
         | 
| 119 | 
            +
             | 
| 120 | 
            +
                @property
         | 
| 121 | 
            +
                def im_start_id(self):
         | 
| 122 | 
            +
                    """Return the ID of the image start token"""
         | 
| 123 | 
            +
                    return self.sp_model.piece_to_id("<|im_start|>")
         | 
| 124 | 
            +
             | 
| 125 | 
            +
                @property
         | 
| 126 | 
            +
                def im_end_id(self):
         | 
| 127 | 
            +
                    """Return the ID of the image end token"""
         | 
| 128 | 
            +
                    return self.sp_model.piece_to_id("<|im_end|>")
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                @property
         | 
| 131 | 
            +
                def vocab_size(self):
         | 
| 132 | 
            +
                    """Return the size of the vocabulary"""
         | 
| 133 | 
            +
                    return self.sp_model.vocab_size()
         | 
| 134 | 
            +
             | 
| 135 | 
            +
                def get_vocab(self):
         | 
| 136 | 
            +
                    """Return the vocabulary as a dictionary mapping tokens to IDs"""
         | 
| 137 | 
            +
                    vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
         | 
| 138 | 
            +
                    vocab.update(self.added_tokens_encoder)
         | 
| 139 | 
            +
                    return vocab
         | 
| 140 | 
            +
             | 
| 141 | 
            +
                def _tokenize(self, text):
         | 
| 142 | 
            +
                    """Tokenize the input text into pieces"""
         | 
| 143 | 
            +
                    return self.sp_model.encode_as_pieces(text)
         | 
| 144 | 
            +
             | 
| 145 | 
            +
                def _convert_token_to_id(self, token):
         | 
| 146 | 
            +
                    """Convert a token to its corresponding ID"""
         | 
| 147 | 
            +
                    return self.sp_model.piece_to_id(token)
         | 
| 148 | 
            +
             | 
| 149 | 
            +
                def _convert_id_to_token(self, id):
         | 
| 150 | 
            +
                    """Convert an ID to its corresponding token"""
         | 
| 151 | 
            +
                    return self.sp_model.id_to_piece(id)
         | 
| 152 | 
            +
             | 
| 153 | 
            +
                def convert_tokens_to_string(self, tokens):
         | 
| 154 | 
            +
                    """Convert a sequence of tokens back to a string"""
         | 
| 155 | 
            +
                    current_sub_tokens = []
         | 
| 156 | 
            +
                    out_string = ""
         | 
| 157 | 
            +
             | 
| 158 | 
            +
                    for token in tokens:
         | 
| 159 | 
            +
                        # Handle special tokens differently
         | 
| 160 | 
            +
                        if token in self.all_special_tokens:
         | 
| 161 | 
            +
                            out_string += self.sp_model.decode(current_sub_tokens) + token
         | 
| 162 | 
            +
                            current_sub_tokens = []
         | 
| 163 | 
            +
                        else:
         | 
| 164 | 
            +
                            current_sub_tokens.append(token)
         | 
| 165 | 
            +
             | 
| 166 | 
            +
                    # Add any remaining sub-tokens
         | 
| 167 | 
            +
                    out_string += self.sp_model.decode(current_sub_tokens)
         | 
| 168 | 
            +
                    return out_string
         | 
| 169 | 
            +
             | 
| 170 | 
            +
                def prepare_for_model(self, *args, **kwargs):
         | 
| 171 | 
            +
                    """Prepare the tokenized inputs for the model"""
         | 
| 172 | 
            +
                    # Remove add_special_tokens if present (not supported)
         | 
| 173 | 
            +
                    if "add_special_tokens" in kwargs:
         | 
| 174 | 
            +
                        kwargs.pop("add_special_tokens")
         | 
| 175 | 
            +
                    return super().prepare_for_model(*args, **kwargs)
         | 
| 176 | 
            +
             | 
| 177 | 
            +
                def save_vocabulary(
         | 
| 178 | 
            +
                    self, save_directory, filename_prefix: Optional[str] = None
         | 
| 179 | 
            +
                ) -> Tuple[str]:
         | 
| 180 | 
            +
                    """
         | 
| 181 | 
            +
                    Save the vocabulary and special tokens file to a directory.
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                    Args:
         | 
| 184 | 
            +
                        save_directory (`str`): The directory to save the vocabulary to
         | 
| 185 | 
            +
                        filename_prefix (`str`, optional): Prefix to add to the filename
         | 
| 186 | 
            +
             | 
| 187 | 
            +
                    Returns:
         | 
| 188 | 
            +
                        `Tuple(str)`: Paths to the saved files
         | 
| 189 | 
            +
                    """
         | 
| 190 | 
            +
                    if not os.path.isdir(save_directory):
         | 
| 191 | 
            +
                        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
         | 
| 192 | 
            +
                        return
         | 
| 193 | 
            +
             | 
| 194 | 
            +
                    # Construct output vocabulary file path
         | 
| 195 | 
            +
                    out_vocab_file = os.path.join(
         | 
| 196 | 
            +
                        save_directory,
         | 
| 197 | 
            +
                        (filename_prefix + "-" if filename_prefix else "")
         | 
| 198 | 
            +
                        + self.vocab_files_names["vocab_file"],
         | 
| 199 | 
            +
                    )
         | 
| 200 | 
            +
             | 
| 201 | 
            +
                    # Copy or create vocabulary file
         | 
| 202 | 
            +
                    if os.path.abspath(self.vocab_file) != os.path.abspath(
         | 
| 203 | 
            +
                        out_vocab_file
         | 
| 204 | 
            +
                    ) and os.path.isfile(self.vocab_file):
         | 
| 205 | 
            +
                        copyfile(self.vocab_file, out_vocab_file)
         | 
| 206 | 
            +
                    elif not os.path.isfile(self.vocab_file):
         | 
| 207 | 
            +
                        with open(out_vocab_file, "wb") as fi:
         | 
| 208 | 
            +
                            content_spiece_model = self.sp_model.serialized_model_proto()
         | 
| 209 | 
            +
                            fi.write(content_spiece_model)
         | 
| 210 | 
            +
             | 
| 211 | 
            +
                    return (out_vocab_file,)
         | 
| 212 | 
            +
             | 
| 213 | 
            +
                def _decode(self, *args, **kwargs):
         | 
| 214 | 
            +
                    """Decode token_id back to text"""
         | 
| 215 | 
            +
                    # Remove some parameters that aren't used
         | 
| 216 | 
            +
                    kwargs.pop("clean_up_tokenization_spaces", None)
         | 
| 217 | 
            +
                    kwargs.pop("spaces_between_special_tokens", None)
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                    # Call parent decode method with specific parameters
         | 
| 220 | 
            +
                    return super()._decode(
         | 
| 221 | 
            +
                        *args,
         | 
| 222 | 
            +
                        **kwargs,
         | 
| 223 | 
            +
                        clean_up_tokenization_spaces=False,
         | 
| 224 | 
            +
                        spaces_between_special_tokens=False,
         | 
| 225 | 
            +
                    )
         | 
| 226 | 
            +
             | 
| 227 | 
            +
                def _pad(
         | 
| 228 | 
            +
                    self,
         | 
| 229 | 
            +
                    encoded_inputs: Dict,
         | 
| 230 | 
            +
                    max_length: Optional[int] = None,
         | 
| 231 | 
            +
                    padding_strategy=PaddingStrategy.DO_NOT_PAD,
         | 
| 232 | 
            +
                    pad_to_multiple_of: Optional[int] = None,
         | 
| 233 | 
            +
                    return_attention_mask: Optional[bool] = None,
         | 
| 234 | 
            +
                ) -> dict:
         | 
| 235 | 
            +
                    """Pad the encoded inputs to the specified length"""
         | 
| 236 | 
            +
                    if return_attention_mask is None:
         | 
| 237 | 
            +
                        return_attention_mask = "attention_mask" in self.model_input_names
         | 
| 238 | 
            +
                    if return_attention_mask:
         | 
| 239 | 
            +
                        required_input = encoded_inputs[self.model_input_names[0]]
         | 
| 240 | 
            +
                        if padding_strategy == PaddingStrategy.LONGEST:
         | 
| 241 | 
            +
                            max_length = len(required_input)
         | 
| 242 | 
            +
             | 
| 243 | 
            +
                        # Adjust max_length if needed for multiple of padding
         | 
| 244 | 
            +
                        if (
         | 
| 245 | 
            +
                            max_length is not None
         | 
| 246 | 
            +
                            and pad_to_multiple_of is not None
         | 
| 247 | 
            +
                            and (max_length % pad_to_multiple_of != 0)
         | 
| 248 | 
            +
                        ):
         | 
| 249 | 
            +
                            max_length = (
         | 
| 250 | 
            +
                                (max_length // pad_to_multiple_of) + 1
         | 
| 251 | 
            +
                            ) * pad_to_multiple_of
         | 
| 252 | 
            +
             | 
| 253 | 
            +
                        # Check if padding is needed
         | 
| 254 | 
            +
                        needs_to_be_padded = (
         | 
| 255 | 
            +
                            padding_strategy != PaddingStrategy.DO_NOT_PAD
         | 
| 256 | 
            +
                            and len(required_input) != max_length
         | 
| 257 | 
            +
                        )
         | 
| 258 | 
            +
             | 
| 259 | 
            +
                        # Handle attention mask if present
         | 
| 260 | 
            +
                        if (
         | 
| 261 | 
            +
                            "attention_mask" in encoded_inputs
         | 
| 262 | 
            +
                            and encoded_inputs["attention_mask"] is not None
         | 
| 263 | 
            +
                        ):
         | 
| 264 | 
            +
                            attention_mask = encoded_inputs.pop("attention_mask")
         | 
| 265 | 
            +
                            if isinstance(attention_mask, torch.Tensor):
         | 
| 266 | 
            +
                                attention_mask = attention_mask.numpy()
         | 
| 267 | 
            +
                            elif isinstance(attention_mask, list):
         | 
| 268 | 
            +
                                attention_mask = np.array(attention_mask)
         | 
| 269 | 
            +
                            elif not isinstance(attention_mask, np.ndarray):
         | 
| 270 | 
            +
                                raise ValueError(
         | 
| 271 | 
            +
                                    f"Unexpected type {type(attention_mask)} of attention_mask, "
         | 
| 272 | 
            +
                                )
         | 
| 273 | 
            +
                        else:
         | 
| 274 | 
            +
                            # Create default attention mask if none provided
         | 
| 275 | 
            +
                            attention_mask = np.tril(
         | 
| 276 | 
            +
                                np.ones((len(required_input), len(required_input)), dtype=np.int64)
         | 
| 277 | 
            +
                            )
         | 
| 278 | 
            +
                            attention_mask = np.expand_dims(attention_mask, axis=0)
         | 
| 279 | 
            +
             | 
| 280 | 
            +
                        # Perform padding if needed
         | 
| 281 | 
            +
                        if needs_to_be_padded:
         | 
| 282 | 
            +
                            difference = max_length - len(required_input)
         | 
| 283 | 
            +
                            if self.padding_side == "right":
         | 
| 284 | 
            +
                                if attention_mask.ndim == 1:
         | 
| 285 | 
            +
                                    pad_width = [(0, difference)]
         | 
| 286 | 
            +
                                else:
         | 
| 287 | 
            +
                                    pad_width = [(0, 0), (0, difference), (0, difference)]
         | 
| 288 | 
            +
                            elif self.padding_side == "left":
         | 
| 289 | 
            +
                                if attention_mask.ndim == 1:
         | 
| 290 | 
            +
                                    pad_width = [(difference, 0)]
         | 
| 291 | 
            +
                                else:
         | 
| 292 | 
            +
                                    pad_width = [(0, 0), (difference, 0), (difference, 0)]
         | 
| 293 | 
            +
                            else:
         | 
| 294 | 
            +
                                raise ValueError(
         | 
| 295 | 
            +
                                    "Invalid padding strategy:" + str(self.padding_side)
         | 
| 296 | 
            +
                                )
         | 
| 297 | 
            +
             | 
| 298 | 
            +
                            attention_mask = np.pad(
         | 
| 299 | 
            +
                                attention_mask,
         | 
| 300 | 
            +
                                pad_width=pad_width,
         | 
| 301 | 
            +
                                mode="constant",
         | 
| 302 | 
            +
                                constant_values=0,
         | 
| 303 | 
            +
                            )
         | 
| 304 | 
            +
             | 
| 305 | 
            +
                    # Call parent padding method
         | 
| 306 | 
            +
                    encoded_inputs = super()._pad(
         | 
| 307 | 
            +
                        encoded_inputs,
         | 
| 308 | 
            +
                        max_length,
         | 
| 309 | 
            +
                        padding_strategy=padding_strategy,
         | 
| 310 | 
            +
                        pad_to_multiple_of=pad_to_multiple_of,
         | 
| 311 | 
            +
                        return_attention_mask=False,
         | 
| 312 | 
            +
                    )
         | 
| 313 | 
            +
             | 
| 314 | 
            +
                    # Add attention mask back if needed
         | 
| 315 | 
            +
                    if return_attention_mask:
         | 
| 316 | 
            +
                        encoded_inputs["attention_mask"] = attention_mask.tolist()
         | 
| 317 | 
            +
             | 
| 318 | 
            +
                    return encoded_inputs
         | 
| 319 | 
            +
             | 
| 320 | 
            +
             | 
| 321 | 
            +
            __all__ = ["Ernie4_5_VLTokenizer"]
         | 
| 322 | 
            +
             | 
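
The new tokenization_ernie_45t_vl.py wraps a SentencePiece model in the standard PreTrainedTokenizer interface. A hedged usage sketch is below; the checkpoint path is hypothetical and must contain the tokenizer.model file named in vocab_files_names.

    from tokenization_ernie_45t_vl import Ernie4_5_VLTokenizer

    # Hypothetical local path holding tokenizer.model.
    tokenizer = Ernie4_5_VLTokenizer(vocab_file="./ernie_45t_vl_checkpoint/tokenizer.model")

    encoded = tokenizer("Hello from ERNIE 4.5 VL")
    print(encoded["input_ids"])                        # SentencePiece token IDs
    print(tokenizer.decode(encoded["input_ids"]))      # round-trip back to text
    print(tokenizer.im_start_id, tokenizer.im_end_id)  # image boundary token IDs
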
    	
        tokenizer_config.json
    CHANGED
    
    | @@ -14,7 +14,7 @@ | |
| 14 | 
             
                "tokenizer_class": "Ernie4_5_VLTokenizer",
         | 
| 15 | 
             
                "auto_map": {
         | 
| 16 | 
             
                    "AutoTokenizer": [
         | 
| 17 | 
            -
                        " | 
| 18 | 
             
                        null
         | 
| 19 | 
             
                    ]
         | 
| 20 | 
             
                },
         | 
|  | |
| 14 | 
             
                "tokenizer_class": "Ernie4_5_VLTokenizer",
         | 
| 15 | 
             
                "auto_map": {
         | 
| 16 | 
             
                    "AutoTokenizer": [
         | 
| 17 | 
            +
                        "tokenization_ernie_45t_vl.Ernie4_5_VLTokenizer",
         | 
| 18 | 
             
                        null
         | 
| 19 | 
             
                    ]
         | 
| 20 | 
             
                },
         | 
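
With auto_map now pointing at tokenization_ernie_45t_vl.Ernie4_5_VLTokenizer, the tokenizer can be resolved through AutoTokenizer when remote code is trusted. A minimal sketch, assuming a local copy of this repository at a hypothetical path:

    from transformers import AutoTokenizer

    # The path is a placeholder for a local clone of this repository (or its hub id).
    tokenizer = AutoTokenizer.from_pretrained(
        "./path-to-this-repo",
        trust_remote_code=True,  # required so tokenization_ernie_45t_vl.py is imported
    )
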

