etri-vilab
/

SafeLLaVA-13B

@@ -1,618 +0,0 @@
-from typing import List, Optional, Tuple, Union, Dict
-import torch
-import torch.nn as nn
-from transformers import AutoConfig, AutoModelForCausalLM
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from llava.model.language_model.llava_llama import (
-    LlavaConfig, LlavaLlamaModel, LlavaLlamaForCausalLM
-)
-from llava.constants import IMAGE_TOKEN_INDEX
-from dataclasses import dataclass
-import logging
-from llava.utils import setup_simple_logging
-setup_simple_logging()
-@dataclass
-class SafetyCausalLMOutputWithPast(CausalLMOutputWithPast):
-    """
-    Causal language model output extended with optional safety-classifier results.
-    Every field added here defaults to None. Within this file,
-    SafeLlavaLlamaForCausalLM.forward fills only the img_* pair and passes None
-    for the txt_* and total_* pairs.
-    """
-    # Image safety head output: raw logits and their softmax over the category axis.
-    img_safety_logits: Optional[torch.FloatTensor] = None
-    img_safety_probs: Optional[torch.FloatTensor] = None
-    # Text safety output; never populated by the code in this file.
-    txt_safety_logits: Optional[torch.FloatTensor] = None
-    txt_safety_probs: Optional[torch.FloatTensor] = None
-    # Combined safety output; never populated by the code in this file.
-    total_safety_logits: Optional[torch.FloatTensor] = None
-    total_safety_probs: Optional[torch.FloatTensor] = None
-class SafetyMLP(nn.Module):
-    """
-    Safety classification head implemented as Multi-layer Perceptron.
-    """
-    def __init__(self, input_size: int, hidden_size: int, output_size: int,
-                 safety_num_hidden_layers: int = 1):
-        super().__init__()
-        layers = []
-        layers.append(nn.Linear(input_size, hidden_size))
-        layers.append(nn.GELU())
-        for _ in range(safety_num_hidden_layers - 1):
-            layers.append(nn.Linear(hidden_size, hidden_size))
-            layers.append(nn.GELU())
-        layers.append(nn.Linear(hidden_size, output_size))
-        self.mlp = nn.Sequential(*layers)
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.mlp(x)
-class SafetyConfig(LlavaConfig):
-    """Safety-aware configuration for pooling version without meta tokens"""
-    model_type = "safe_llava_llama_pool"
-    def __init__(
-        self,
-        safety_categories=None,
-        safety_num_hidden_layers=1,
-        unfreeze_mm_vision_tower=True,
-        delay_load_vision_tower=False,
-        safety_head_hidden_scale=4.0,
-        pooling_method="mean",  # mean, max, or cls
-        attention_dropout=0.0,  # Add missing attribute for compatibility
-        **kwargs
-    ):
-        # Ensure attention_dropout is in kwargs if not provided
-        if 'attention_dropout' not in kwargs:
-            kwargs['attention_dropout'] = attention_dropout
-        super().__init__(**kwargs)
-        # Default safety categories if not provided (from original SafeLLaVA)
-        self.safety_categories = safety_categories or [
-            "safe",
-            "gender",
-            "race",
-            "religion",
-            "harassment",
-            "disability_discrimination",
-            "drug_crime",
-            "property_crime",
-            "facial_data",
-            "identity_data",
-            "physical_self_injury",
-            "suicide",
-            "animal_abuse",
-            "obscene_gestures",
-            "physical_altercation",
-            "terrorism",
-            "weapon_related_violence",
-            "sexual_content",
-            "financial_advice",
-            "medical_advice"
-        ]
-        self.safety_num_hidden_layers = safety_num_hidden_layers
-        self.unfreeze_mm_vision_tower = unfreeze_mm_vision_tower
-        self.delay_load_vision_tower = delay_load_vision_tower
-        self.safety_head_hidden_scale = safety_head_hidden_scale
-        self.pooling_method = pooling_method
-        # Pool version doesn't use meta tokens
-        self.use_img_safety_meta_token = False
-        self.use_txt_safety_meta_token = False
-        self.use_total_safety_meta_token = False
-class SafeLlavaLlamaForCausalLM(LlavaLlamaForCausalLM):
-    """
-    SafeLLaVA-Pool: A simplified version without meta tokens.
-    Pools visual tokens directly for safety classification.
-    """
-    config_class = SafetyConfig
-    def __init__(self, config: SafetyConfig):
-        super().__init__(config)
-        # Safety head for image classification (using pooled visual tokens)
-        self.img_safety_head = SafetyMLP(
-            input_size=config.hidden_size,
-            hidden_size=int(config.hidden_size * config.safety_head_hidden_scale),
-            output_size=len(config.safety_categories),
-            safety_num_hidden_layers=config.safety_num_hidden_layers
-        )
-        logging.info("Created img_safety_head for SafeLLaVA-Pool")
-        # Store pooling method
-        self.pooling_method = config.pooling_method
-        # Safety warning template
-        self.safety_warning_template = (
-            "I apologize, but I cannot provide a response as the content appears to be {category}. "
-            "I aim to maintain ethical and safe interactions. "
-            "Please feel free to ask about other topics that do not involve potentially harmful or inappropriate content."
-        )
-    def get_model(self):
-        return self.model
-    def get_safety_warning(self, unsafe_categories):
-        if len(unsafe_categories) == 1:
-            category_str = f"related to {unsafe_categories[0]}"
-        else:
-            category_str = "related to " + ", ".join(unsafe_categories[:-1]) + f" and {unsafe_categories[-1]}"
-        return self.safety_warning_template.format(category=category_str)
-    def pool_visual_tokens(self, hidden_states, input_ids, images):
-        """
-        Pool visual tokens from hidden states.
-        Args:
-            hidden_states: Last layer hidden states [batch_size, seq_len, hidden_size]
-            input_ids: Original input token IDs to locate image positions
-            images: Input images tensor
-        Returns:
-            Pooled visual features [batch_size, hidden_size]
-        """
-        batch_size = hidden_states.shape[0]
-        device = hidden_states.device
-        # If no images, return zeros
-        if images is None:
-            return torch.zeros(batch_size, hidden_states.shape[-1], device=device)
-        # Get the number of visual patches
-        vision_tower = self.get_vision_tower()
-        if vision_tower is not None and hasattr(vision_tower, 'config'):
-            # Calculate based on vision config
-            image_size = vision_tower.config.image_size
-            patch_size = vision_tower.config.patch_size
-            num_patches = (image_size // patch_size) ** 2
-        else:
-            num_patches = 576  # Default for CLIP ViT-L/14-336px
-        pooled_features = []
-        for batch_idx in range(batch_size):
-            try:
-                # Find where IMAGE_TOKEN_INDEX was in the original input
-                if input_ids is not None and batch_idx < input_ids.shape[0]:
-                    image_positions = torch.where(input_ids[batch_idx] == IMAGE_TOKEN_INDEX)[0]
-                    if len(image_positions) > 0:
-                        # Visual tokens replace the IMAGE_TOKEN_INDEX
-                        # The actual visual tokens start at this position
-                        start_pos = image_positions[0].item()
-                        end_pos = min(start_pos + num_patches, hidden_states.shape[1])
-                        if end_pos > start_pos and (end_pos - start_pos) > 0:
-                            visual_embeddings = hidden_states[batch_idx, start_pos:end_pos]
-                            # Apply pooling
-                            if visual_embeddings.shape[0] > 0:
-                                if self.pooling_method == "mean":
-                                    pooled = visual_embeddings.mean(dim=0)
-                                elif self.pooling_method == "max":
-                                    pooled = visual_embeddings.max(dim=0)[0]
-                                elif self.pooling_method == "cls":
-                                    # Use the first visual token
-                                    pooled = visual_embeddings[0]
-                                else:
-                                    pooled = visual_embeddings.mean(dim=0)  # Default to mean
-                                pooled_features.append(pooled)
-                            else:
-                                # Empty visual embeddings
-                                pooled_features.append(torch.zeros(hidden_states.shape[-1], device=device))
-                        else:
-                            # Invalid range
-                            pooled_features.append(torch.zeros(hidden_states.shape[-1], device=device))
-                    else:
-                        # No image token found, might be text-only sample
-                        pooled_features.append(torch.zeros(hidden_states.shape[-1], device=device))
-                else:
-                    # No input_ids available
-                    pooled_features.append(torch.zeros(hidden_states.shape[-1], device=device))
-            except Exception as e:
-                logging.warning(f"Error pooling visual tokens for batch {batch_idx}: {str(e)}")
-                # Return zero vector on error
-                pooled_features.append(torch.zeros(hidden_states.shape[-1], device=device))
-        # Stack all pooled features
-        pooled_features = torch.stack(pooled_features, dim=0)
-        return pooled_features
-    def compute_gradcam(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        images=None,
-        image_sizes=None,
-        target_class=None,
-        use_pre_pooling=False,
-        **kwargs,
-    ):
-        """
-        Compute Grad-CAM for the image safety classification.
-        Args:
-            input_ids: Input token IDs
-            attention_mask: Attention mask
-            images: Input images tensor [batch_size, 3, H, W]
-            image_sizes: Image sizes
-            target_class: Target class index for Grad-CAM. If None, uses the predicted class.
-            use_pre_pooling: If True, compute Grad-CAM before pooling for better spatial resolution
-        Returns:
-            dict with keys:
-                - 'heatmap': Grad-CAM heatmap [batch_size, H_feat, W_feat]
-                - 'predicted_class': Predicted class index
-                - 'predicted_prob': Probability of predicted class
-                - 'class_name': Name of the target class
-        """
-        if images is None:
-            raise ValueError("Images are required for Grad-CAM computation")
-        # Enable gradient computation for images
-        # Note: We need to enable train mode for vision tower to compute gradients
-        was_training = self.training
-        was_vision_training = self.get_vision_tower().training
-        # Set vision tower to train mode to enable gradients
-        vision_tower = self.get_vision_tower()
-        vision_tower.train()
-        # CRITICAL: Enable gradients for vision tower parameters
-        # This is necessary because merged LoRA models might have frozen parameters
-        for param in vision_tower.parameters():
-            param.requires_grad = True
-        # Note: We keep model in eval mode for other components (dropout, batchnorm)
-        # but vision tower is in train mode for gradient computation
-        # Ensure images require grad
-        if not images.requires_grad:
-            images = images.clone().detach().requires_grad_(True)
-        logging.info(f"Images requires_grad: {images.requires_grad}")
-        # Store activations and gradients for Grad-CAM
-        activations = []
-        gradients = []
-        def save_gradient(grad):
-            """Backward hook to capture gradients"""
-            logging.info(f"Gradient hook called! Grad shape: {grad.shape}")
-            gradients.append(grad.detach())
-        def forward_hook(module, input, output):
-            """Forward hook to save activations and register backward hook"""
-            if isinstance(output, tuple):
-                activation = output[0]
-            else:
-                activation = output
-            logging.info(f"Forward hook: activation shape={activation.shape}, requires_grad={activation.requires_grad}")
-            # Register backward hook on the activation tensor itself BEFORE saving
-            if activation.requires_grad:
-                activation.register_hook(save_gradient)
-                logging.info("Registered backward hook on activation")
-            else:
-                logging.warning("Activation does not require grad, cannot register backward hook!")
-            # Save activation (keep gradient connection for now, will detach later if needed)
-            activations.append(activation)
-        # Register hook on vision tower
-        vision_tower = self.get_vision_tower()
-        if vision_tower is None:
-            raise AttributeError("Vision tower not found")
-        hook_handle = vision_tower.register_forward_hook(forward_hook)
-        try:
-            # Forward pass - Do normal forward but intercept and modify vision features
-            # CRITICAL: Use autograd.enable_grad() to force gradient tracking
-            # Store original vision tower forward
-            vision_tower = self.get_vision_tower()
-            original_forward = vision_tower.forward
-            # Create a wrapper that forces requires_grad on output
-            def forward_with_grad(*args, **kwargs):
-                output = original_forward(*args, **kwargs)
-                if not output.requires_grad:
-                    output = output.clone().requires_grad_(True)
-                    # Register hook on this tensor
-                    output.register_hook(save_gradient)
-                    # Save to activations
-                    activations.append(output)
-                return output
-            # Temporarily replace forward
-            vision_tower.forward = forward_with_grad
-            try:
-                with torch.enable_grad():
-                    if use_pre_pooling:
-                        # For pre-pooling Grad-CAM, we need to capture the visual tokens from hidden_states
-                        # before they are pooled
-                        pre_pool_activations = []
-                        pre_pool_gradients = []
-                        def save_pre_pool_gradient(grad):
-                            pre_pool_gradients.append(grad)
-                        # Store original pool_visual_tokens method
-                        original_pool_method = self.pool_visual_tokens
-                        # Replace with a wrapper that captures pre-pooling features
-                        def pool_with_capture(hidden_states, input_ids, images):
-                            # Extract visual tokens before pooling
-                            # Visual tokens are typically in the positions where image tokens were
-                            batch_size = hidden_states.shape[0]
-                            # Find image token positions
-                            # The image token index is -200 by default in LLaVA
-                            IMAGE_TOKEN_INDEX = -200
-                            image_token_indices = []
-                            for batch_idx in range(batch_size):
-                                image_positions = (input_ids[batch_idx] == IMAGE_TOKEN_INDEX).nonzero(as_tuple=True)[0]
-                                if len(image_positions) > 0:
-                                    image_token_indices.append(image_positions)
-                            # Extract visual features before pooling
-                            if len(image_token_indices) > 0:
-                                visual_features = hidden_states[0, image_token_indices[0]]  # [num_patches, hidden_dim]
-                                visual_features = visual_features.clone().requires_grad_(True)
-                                pre_pool_activations.append(visual_features)
-                                visual_features.register_hook(save_pre_pool_gradient)
-                            # Call original pooling method
-                            return original_pool_method(hidden_states, input_ids, images)
-                        # Temporarily replace the pooling method
-                        self.pool_visual_tokens = pool_with_capture
-                    # Now do the full forward pass
-                    outputs = self.forward(
-                        input_ids=input_ids,
-                        attention_mask=attention_mask,
-                        images=images,
-                        image_sizes=image_sizes,
-                        do_safety=True,
-                        return_dict=True,
-                        **kwargs
-                    )
-                    img_safety_logits = outputs.img_safety_logits
-                    img_safety_probs = outputs.img_safety_probs
-                    if use_pre_pooling:
-                        # Restore original pooling method
-                        self.pool_visual_tokens = original_pool_method
-            finally:
-                # Restore original forward
-                vision_tower.forward = original_forward
-                # Get predicted class if not specified
-                if target_class is None:
-                    # Use the class with highest probability
-                    target_class = img_safety_probs.argmax(dim=-1)
-                else:
-                    # Ensure target_class is a tensor
-                    if isinstance(target_class, int):
-                        target_class = torch.tensor([target_class], device=img_safety_probs.device)
-                # Get the logit for the target class
-                batch_size = img_safety_probs.shape[0]
-                target_logits = img_safety_logits[torch.arange(batch_size), target_class]
-                # Backward pass to compute gradients
-                self.zero_grad()
-                target_logits.sum().backward()
-                # Choose which activations and gradients to use
-                if use_pre_pooling:
-                    # Use pre-pooling features for better spatial resolution
-                    if 'pre_pool_activations' not in locals() or len(pre_pool_activations) == 0:
-                        raise RuntimeError("Failed to capture pre-pooling activations")
-                    if 'pre_pool_gradients' not in locals() or len(pre_pool_gradients) == 0:
-                        raise RuntimeError("Failed to capture pre-pooling gradients")
-                    # Get the pre-pooling features
-                    # These have spatial structure: [num_patches, hidden_dim]
-                    activation = pre_pool_activations[0].detach()
-                    gradient = pre_pool_gradients[0]
-                    # Add batch dimension if needed for consistency
-                    if activation.dim() == 2:
-                        activation = activation.unsqueeze(0)  # [1, num_patches, hidden_dim]
-                        gradient = gradient.unsqueeze(0)
-                else:
-                    # Use post-pooling features (original behavior - from vision tower)
-                    if len(activations) == 0:
-                        raise RuntimeError("Failed to capture activations")
-                    if len(gradients) == 0:
-                        raise RuntimeError("Failed to capture gradients")
-                    activation = activations[0].detach()  # [batch_size, num_patches, hidden_dim]
-                    gradient = gradients[0]               # [batch_size, num_patches, hidden_dim]
-                # Compute Grad-CAM with correct formula
-                # For Vision Transformer: gradients and activations are [batch, num_patches, hidden_dim]
-                # Standard Grad-CAM: compute importance by averaging gradients across hidden dimension
-                # Then weight the activations
-                # Option 1: Standard Grad-CAM - use gradient magnitude as importance
-                # This captures which patches have the strongest gradient signal
-                cam = (gradient * activation).sum(dim=-1)  # [batch_size, num_patches]
-                # Alternative would be:
-                # weights = gradient.mean(dim=1, keepdim=True)  # Average across patches
-                # cam = (activation * weights).sum(dim=-1)
-                # Apply ReLU (only positive contributions)
-                cam = torch.nn.functional.relu(cam)
-                # Reshape to 2D spatial grid
-                # CLIP ViT-L/14-336px has 24x24 patches
-                num_patches_per_side = int(cam.shape[1] ** 0.5)
-                cam = cam.reshape(batch_size, num_patches_per_side, num_patches_per_side)
-                # Normalize to [0, 1]
-                for i in range(batch_size):
-                    cam_min = cam[i].min()
-                    cam_max = cam[i].max()
-                    if cam_max > cam_min:
-                        cam[i] = (cam[i] - cam_min) / (cam_max - cam_min)
-                # Get class names
-                if isinstance(target_class, torch.Tensor):
-                    target_class_idx = target_class[0].item()
-                else:
-                    target_class_idx = target_class
-                class_name = self.config.safety_categories[target_class_idx]
-                return {
-                    'heatmap': cam.detach().cpu().numpy(),
-                    'predicted_class': target_class.cpu().numpy() if isinstance(target_class, torch.Tensor) else target_class,
-                    'predicted_prob': img_safety_probs[torch.arange(batch_size), target_class].detach().cpu().numpy(),
-                    'class_name': class_name,
-                    'all_probs': img_safety_probs.detach().cpu().numpy()
-                }
-        finally:
-            # Remove hook
-            hook_handle.remove()
-            # Restore training state
-            if not was_vision_training:
-                self.get_vision_tower().eval()
-            if was_training:
-                self.train()
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        position_ids=None,
-        past_key_values=None,
-        inputs_embeds=None,
-        labels=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        images=None,
-        image_sizes=None,
-        return_dict=None,
-        do_safety=False,
-        **kwargs,
-    ) -> Union[Tuple, CausalLMOutputWithPast, SafetyCausalLMOutputWithPast]:
-        """
-        Forward method for SafeLLaVA-Pool.
-        When do_safety=True, extracts and pools visual tokens for safety classification.
-        """
-        # Store original input_ids for finding image token positions
-        original_input_ids = input_ids.clone() if input_ids is not None else None
-        # If do_safety is True, force output_hidden_states to True
-        if do_safety and (output_hidden_states is not True):
-            output_hidden_states = True
-            return_dict = True
-        # Prepare inputs for multimodal (handles image embedding)
-        if inputs_embeds is None:
-            (
-                input_ids,
-                position_ids,
-                attention_mask,
-                past_key_values,
-                inputs_embeds,
-                labels
-            ) = self.prepare_inputs_labels_for_multimodal(
-                input_ids,
-                position_ids,
-                attention_mask,
-                past_key_values,
-                labels,
-                images,
-                image_sizes
-            )
-        # Call parent's forward method
-        outputs = super(LlavaLlamaForCausalLM, self).forward(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            labels=labels,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=True,
-            **kwargs
-        )
-        # If do_safety=False, just return the outputs
-        if not do_safety:
-            if return_dict is False:
-                return (outputs.loss, outputs.logits, outputs.past_key_values,
-                       outputs.hidden_states, outputs.attentions)
-            return outputs
-        # Safety classification using pooled visual tokens
-        hidden_states = outputs.hidden_states[-1]  # Last layer hidden states
-        # Check if we have images to process
-        if images is None:
-            # No images, return outputs without safety
-            return outputs
-        # Pool visual tokens
-        pooled_visual_features = self.pool_visual_tokens(hidden_states, original_input_ids, images)
-        # Pass through safety head
-        img_safety_logits = self.img_safety_head(pooled_visual_features)
-        img_safety_probs = torch.softmax(img_safety_logits, dim=-1)
-        # Return results with safety outputs
-        if not return_dict:
-            return (outputs.loss, outputs.logits, outputs.past_key_values,
-                   outputs.hidden_states, outputs.attentions,
-                   img_safety_logits, img_safety_probs)
-        return SafetyCausalLMOutputWithPast(
-            loss=outputs.loss,
-            logits=outputs.logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            img_safety_logits=img_safety_logits,
-            img_safety_probs=img_safety_probs,
-            txt_safety_logits=None,  # Not used in Pool version
-            txt_safety_probs=None,
-            total_safety_logits=None,
-            total_safety_probs=None
-        )
-# Register the config under its model type and pair it with the model class in the
-# transformers Auto* registries. The string must stay equal to SafetyConfig.model_type.
-AutoConfig.register("safe_llava_llama_pool", SafetyConfig)
-AutoModelForCausalLM.register(SafetyConfig, SafeLlavaLlamaForCausalLM)