sagar007 committed
Commit 9fc75bd · verified · 1 Parent(s): cb26575

Upload folder using huggingface_hub

app.py ADDED
@@ -0,0 +1,349 @@
+ #!/usr/bin/env python3
+ """
+ Gradio UI for Multimodal Gemma Model - Hugging Face Space Version
+ """
+ import sys
+ import torch
+ import gradio as gr
+ from pathlib import Path
+ from PIL import Image
+ import io
+ import time
+ import logging
+ from huggingface_hub import hf_hub_download
+
+ # Model imports
+ from src.models import MultimodalGemmaLightning
+ from src.utils.config import load_config, merge_configs
+
+ # Global model variable
+ model = None
+ config = None
+
+ def download_and_load_model():
+     """Download and load the trained multimodal model from HF"""
+     global model, config
+
+     if model is not None:
+         return "✅ Model already loaded!"
+
+     try:
+         print("🔄 Downloading multimodal Gemma model from HF...")
+
+         # Download model checkpoint
+         checkpoint_path = hf_hub_download(
+             repo_id="sagar007/multimodal-gemma-270m-llava",
+             filename="final_model.ckpt",
+             cache_dir="./model_cache"
+         )
+
+         # Download config files
+         model_config_path = hf_hub_download(
+             repo_id="sagar007/multimodal-gemma-270m-llava",
+             filename="configs/model_config.yaml",
+             cache_dir="./model_cache"
+         )
+         training_config_path = hf_hub_download(
+             repo_id="sagar007/multimodal-gemma-270m-llava",
+             filename="configs/training_config.yaml",
+             cache_dir="./model_cache"
+         )
+         data_config_path = hf_hub_download(
+             repo_id="sagar007/multimodal-gemma-270m-llava",
+             filename="configs/data_config.yaml",
+             cache_dir="./model_cache"
+         )
+
+         # Load configs
+         model_config = load_config(model_config_path)
+         training_config = load_config(training_config_path)
+         data_config = load_config(data_config_path)
+         config = merge_configs([model_config, training_config, data_config])
+
+         print("📁 Loading model from checkpoint...")
+         model = MultimodalGemmaLightning.load_from_checkpoint(
+             checkpoint_path,
+             config=config,
+             strict=False,
+             map_location="cuda" if torch.cuda.is_available() else "cpu"
+         )
+         model.eval()
+
+         # Move to appropriate device
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         model = model.to(device)
+
+         print(f"✅ Model loaded successfully on {device}!")
+         return f"✅ Model loaded successfully on {device}!"
+
+     except Exception as e:
+         error_msg = f"❌ Error loading model: {str(e)}"
+         print(error_msg)
+         return error_msg
+
+ def predict_with_image(image, question, max_tokens=100, temperature=0.7):
+     """Generate response for image + text input"""
+     global model, config
+
+     if model is None:
+         return "❌ Please load the model first using the 'Load Model' button!"
+
+     if image is None:
+         return "❌ Please upload an image!"
+
+     if not question.strip():
+         question = "What do you see in this image?"
+
+     try:
+         # Get device
+         device = next(model.parameters()).device
+
+         # Process image
+         if isinstance(image, str):
+             image = Image.open(image).convert('RGB')
+         elif not isinstance(image, Image.Image):
+             image = Image.fromarray(image).convert('RGB')
+
+         # Prepare image for model
+         vision_inputs = model.model.vision_processor(
+             images=[image],
+             return_tensors="pt"
+         )
+         pixel_values = vision_inputs["pixel_values"].to(device)
+
+         # Prepare text prompt
+         prompt = f"<image>\nHuman: {question}\nAssistant:"
+
+         # Tokenize text
+         text_inputs = model.model.tokenizer(
+             prompt,
+             return_tensors="pt",
+             padding=True,
+             truncation=True,
+             max_length=256
+         )
+
+         input_ids = text_inputs["input_ids"].to(device)
+         attention_mask = text_inputs["attention_mask"].to(device)
+
+         # Generate response
+         with torch.no_grad():
+             # Use the full multimodal model with image inputs
+             outputs = model.model.generate(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 images=pixel_values,
+                 max_new_tokens=min(max_tokens, 150),
+                 temperature=min(max(temperature, 0.1), 2.0),
+                 do_sample=temperature > 0.1,
+                 repetition_penalty=1.1
+             )
+
+         # Decode response
+         input_length = input_ids.shape[1]
+         generated_tokens = outputs[0][input_length:]
+         response = model.model.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+         # Clean up response
+         response = response.strip()
+         if not response:
+             response = "I can see the image, but I'm having trouble generating a detailed response."
+
+         return response
+
+     except Exception as e:
+         error_msg = f"❌ Error during inference: {str(e)}"
+         print(error_msg)
+         return error_msg
+
+ def chat_with_image(image, question, history, max_tokens, temperature):
+     """Chat interface function"""
+     if model is None:
+         response = "❌ Please load the model first!"
+     else:
+         response = predict_with_image(image, question, max_tokens, temperature)
+
+     # Add to history - using messages format
+     history.append({"role": "user", "content": question})
+     history.append({"role": "assistant", "content": response})
+     return history, ""
+
+ def create_gradio_interface():
+     """Create the Gradio interface"""
+
+     # Custom CSS for better styling
+     css = """
+     .container {
+         max-width: 1200px;
+         margin: auto;
+         padding: 20px;
+     }
+     .header {
+         text-align: center;
+         margin-bottom: 30px;
+     }
+     .model-info {
+         background-color: #f0f8ff;
+         padding: 15px;
+         border-radius: 10px;
+         margin-bottom: 20px;
+     }
+     """
+
+     with gr.Blocks(css=css, title="Multimodal Gemma Chat") as demo:
+         gr.HTML("""
+         <div class="header">
+             <h1>🎉 Multimodal Gemma-270M Chat</h1>
+             <p>Upload an image and chat with your trained vision-language model!</p>
+             <p><a href="https://huggingface.co/sagar007/multimodal-gemma-270m-llava">🤗 Model</a></p>
+         </div>
+         """)
+
+         # Model status section
+         with gr.Row():
+             with gr.Column():
+                 gr.HTML("""
+                 <div class="model-info">
+                     <h3>📊 Model Info</h3>
+                     <ul>
+                         <li><strong>Base Model:</strong> Google Gemma-270M</li>
+                         <li><strong>Vision:</strong> CLIP ViT-Large</li>
+                         <li><strong>Training:</strong> LLaVA-150K + COCO Images</li>
+                         <li><strong>Parameters:</strong> 18.6M trainable / 539M total</li>
+                     </ul>
+                 </div>
+                 """)
+
+                 # Model loading
+                 load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
+                 model_status = gr.Textbox(
+                     label="Model Status",
+                     value="Click 'Load Model' to start",
+                     interactive=False
+                 )
+
+         gr.HTML("<hr>")
+
+         # Main interface
+         with gr.Row():
+             # Left column - Image and controls
+             with gr.Column(scale=1):
+                 image_input = gr.Image(
+                     label="📸 Upload Image",
+                     type="pil",
+                     height=300
+                 )
+
+                 # Example images
+                 gr.HTML("<p><strong>💡 Tip:</strong> Upload any image and ask questions about it</p>")
+
+                 # Generation settings
+                 with gr.Accordion("⚙️ Generation Settings", open=False):
+                     max_tokens = gr.Slider(
+                         minimum=10,
+                         maximum=200,
+                         value=100,
+                         step=10,
+                         label="Max Tokens"
+                     )
+                     temperature = gr.Slider(
+                         minimum=0.1,
+                         maximum=2.0,
+                         value=0.7,
+                         step=0.1,
+                         label="Temperature"
+                     )
+
+             # Right column - Chat interface
+             with gr.Column(scale=2):
+                 chatbot = gr.Chatbot(
+                     label="💬 Chat with Image",
+                     height=400,
+                     show_label=True,
+                     type="messages"
+                 )
+
+                 question_input = gr.Textbox(
+                     label="❓ Ask about the image",
+                     placeholder="What do you see in this image?",
+                     lines=2
+                 )
+
+                 with gr.Row():
+                     submit_btn = gr.Button("💬 Send", variant="primary")
+                     clear_btn = gr.Button("🗑️ Clear Chat")
+
+         # Example prompts
+         with gr.Row():
+             gr.HTML("<h3>💡 Example Questions:</h3>")
+
+         example_questions = [
+             "What do you see in this image?",
+             "Describe the main objects in the picture.",
+             "What colors are prominent in this image?",
+             "Are there any people in the image?",
+             "What's the setting or location?",
+             "What objects are in the foreground?"
+         ]
+
+         # Lay out the example-question buttons three to a row
+         for row_start in range(0, len(example_questions), 3):
+             with gr.Row():
+                 for example in example_questions[row_start:row_start + 3]:
+                     gr.Button(
+                         example,
+                         size="sm"
+                     ).click(
+                         lambda x=example: x,
+                         outputs=question_input
+                     )
+
+         # Footer
+         gr.HTML("""
+         <hr>
+         <div style="text-align: center; margin-top: 20px;">
+             <p><strong>🎯 Your Multimodal Gemma Model</strong></p>
+             <p>Text-only → Vision-Language Model using LLaVA Architecture</p>
+             <p>Model: <a href="https://huggingface.co/sagar007/multimodal-gemma-270m-llava">sagar007/multimodal-gemma-270m-llava</a></p>
+         </div>
+         """)
+
+         # Event handlers
+         load_btn.click(
+             fn=download_and_load_model,
+             outputs=model_status
+         )
+
+         submit_btn.click(
+             fn=chat_with_image,
+             inputs=[image_input, question_input, chatbot, max_tokens, temperature],
+             outputs=[chatbot, question_input]
+         )
+
+         question_input.submit(
+             fn=chat_with_image,
+             inputs=[image_input, question_input, chatbot, max_tokens, temperature],
+             outputs=[chatbot, question_input]
+         )
+
+         clear_btn.click(
+             fn=lambda: ([], ""),
+             outputs=[chatbot, question_input]
+         )
+
+     return demo
+
+ def main():
+     """Main function to launch the Gradio app"""
+     print("🚀 Starting Multimodal Gemma Gradio Space...")
+
+     # Create interface
+     demo = create_gradio_interface()
+
+     # Launch
+     print("🌐 Launching Gradio interface...")
+     demo.launch()
+
+ if __name__ == "__main__":
+     main()
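
For a quick smoke test outside the Gradio UI, the two functions above can be called directly. This is an illustrative sketch only: it assumes it is run from the repository root with the listed dependencies installed, and the image path is a placeholder.

    from PIL import Image
    import app  # the module defined above

    # Downloads the checkpoint and configs, then loads the model onto GPU/CPU
    print(app.download_and_load_model())

    # Ask a question about a local image (placeholder path)
    img = Image.open("example.jpg").convert("RGB")
    print(app.predict_with_image(img, "What do you see in this image?", max_tokens=80, temperature=0.7))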
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ torch>=2.0.0
+ torchvision
+ transformers>=4.36.0
+ accelerate
+ bitsandbytes
+ peft>=0.6.0
+ lightning>=2.0.0
+ gradio>=4.0.0
+ pillow
+ huggingface-hub
+ pyyaml
+ omegaconf
src/__init__.py ADDED
File without changes
src/models/__init__.py ADDED
@@ -0,0 +1,10 @@
+ from .multimodal_gemma import MultimodalGemma
+ from .lightning_module import MultimodalGemmaLightning
+ from .projectors import VisionProjector, AudioProjector
+
+ __all__ = [
+     "MultimodalGemma",
+     "MultimodalGemmaLightning",
+     "VisionProjector",
+     "AudioProjector"
+ ]
src/models/multimodal_gemma.py ADDED
@@ -0,0 +1,323 @@
+ """
+ Multimodal Gemma model implementation
+ """
+ import torch
+ import torch.nn as nn
+ from transformers import (
+     AutoTokenizer,
+     AutoModelForCausalLM,
+     CLIPVisionModel,
+     CLIPProcessor,
+     BitsAndBytesConfig
+ )
+ from peft import LoraConfig, get_peft_model, TaskType
+ from typing import Dict, Any, Optional, Tuple
+ import logging
+
+ from .projectors import VisionProjector
+
+ logger = logging.getLogger(__name__)
+
+
+ class MultimodalGemma(nn.Module):
+     """Multimodal Gemma model with vision capabilities"""
+
+     def __init__(self, config: Dict[str, Any]):
+         super().__init__()
+         self.config = config
+
+         # Initialize tokenizer first
+         self._setup_tokenizer()
+
+         # Initialize language model
+         self._setup_language_model()
+
+         # Initialize vision components
+         self._setup_vision_components()
+
+         # Initialize projectors
+         self._setup_projectors()
+
+         # Freeze encoders
+         self._freeze_encoders()
+
+         # Setup LoRA
+         self._setup_lora()
+
+         logger.info("MultimodalGemma model initialized successfully")
+
+         # Move projectors to the same device as the language model
+         self._move_to_device()
+
+     def _setup_tokenizer(self):
+         """Initialize and configure tokenizer"""
+         model_name = self.config["model"]["gemma_model_name"]
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             model_name,
+             trust_remote_code=True,
+             use_fast=True
+         )
+
+         # Set padding token
+         if self.tokenizer.pad_token is None:
+             self.tokenizer.pad_token = self.tokenizer.eos_token
+             self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+
+         # Add special tokens
+         special_tokens = self.config.get("special_tokens", {})
+         new_tokens = []
+
+         for token_name, token_value in special_tokens.items():
+             if token_value not in self.tokenizer.get_vocab():
+                 new_tokens.append(token_value)
+
+         if new_tokens:
+             self.tokenizer.add_special_tokens({"additional_special_tokens": new_tokens})
+             logger.info(f"Added special tokens: {new_tokens}")
+
+     def _setup_language_model(self):
+         """Initialize language model with quantization if specified"""
+         model_name = self.config["model"]["gemma_model_name"]
+
+         # Setup quantization config
+         quantization_config = None
+         if self.config["model"].get("use_4bit", False):
+             quantization_config = BitsAndBytesConfig(
+                 load_in_4bit=True,
+                 bnb_4bit_compute_dtype=getattr(torch, self.config["model"]["bnb_4bit_compute_dtype"]),
+                 bnb_4bit_quant_type=self.config["model"]["bnb_4bit_quant_type"],
+                 bnb_4bit_use_double_quant=self.config["model"]["use_nested_quant"]
+             )
+
+         self.language_model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             quantization_config=quantization_config,
+             torch_dtype=torch.bfloat16,
+             device_map=None,  # Lightning handles device placement
+             trust_remote_code=True,
+             attn_implementation="eager"  # Use eager attention (flash_attn not required)
+         )
+
+         # Resize embeddings if we added special tokens
+         if len(self.tokenizer) > self.language_model.config.vocab_size:
+             self.language_model.resize_token_embeddings(len(self.tokenizer))
+             logger.info(f"Resized embeddings to {len(self.tokenizer)}")
+
+         # Store image token ID for later use
+         self.image_token_id = self.tokenizer.convert_tokens_to_ids(
+             self.config.get("special_tokens", {}).get("image_token", "<image>")
+         )
+
+     def _setup_vision_components(self):
+         """Initialize vision encoder and processor"""
+         vision_model_name = self.config["model"]["vision_model_name"]
+
+         self.vision_encoder = CLIPVisionModel.from_pretrained(
+             vision_model_name,
+             torch_dtype=torch.bfloat16
+         )
+         self.vision_processor = CLIPProcessor.from_pretrained(vision_model_name)
+
+         logger.info(f"Loaded vision model: {vision_model_name}")
+
+     def _setup_projectors(self):
+         """Initialize projection layers"""
+         vision_dim = self.vision_encoder.config.hidden_size
+         language_dim = self.language_model.config.hidden_size
+
+         # Vision projector
+         self.vision_projector = VisionProjector(
+             vision_dim=vision_dim,
+             language_dim=language_dim,
+             hidden_dim=self.config["model"].get("projector_hidden_dim", language_dim)
+         ).to(torch.bfloat16)  # Match the model dtype
+
+         logger.info("Initialized vision projection layer")
+
+     def _freeze_encoders(self):
+         """Freeze vision encoder"""
+         # Freeze vision encoder
+         for param in self.vision_encoder.parameters():
+             param.requires_grad = False
+
+         logger.info("Froze vision encoder parameters")
+
+     def _setup_lora(self):
+         """Setup LoRA for the language model"""
+         lora_config = LoraConfig(
+             r=self.config["model"]["lora"]["r"],
+             lora_alpha=self.config["model"]["lora"]["alpha"],
+             target_modules=self.config["model"]["lora"]["target_modules"],
+             lora_dropout=self.config["model"]["lora"]["dropout"],
+             bias="none",
+             task_type=TaskType.CAUSAL_LM,
+         )
+
+         self.language_model = get_peft_model(self.language_model, lora_config)
+         self.language_model.print_trainable_parameters()
+
+         logger.info("Setup LoRA adapters")
+
+     def _move_to_device(self):
+         """Move all components to the same device as the language model"""
+         device = next(self.language_model.parameters()).device
+
+         # Move vision components
+         self.vision_encoder = self.vision_encoder.to(device)
+         self.vision_projector = self.vision_projector.to(device)
+
+         logger.info(f"Moved vision components to device: {device}")
+
+     def encode_images(self, images: torch.Tensor) -> torch.Tensor:
+         """
+         Encode images using CLIP and project to language space
+
+         Args:
+             images: [batch_size, 3, height, width]
+         Returns:
+             projected_features: [batch_size, language_dim]
+         """
+         with torch.no_grad():
+             vision_outputs = self.vision_encoder(pixel_values=images)
+             # Use the pooled output (CLS token equivalent)
+             image_features = vision_outputs.pooler_output
+
+         # Project to language model space (outside no_grad so the projector stays trainable)
+         projected_features = self.vision_projector(image_features)
+         return projected_features
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Dict[str, torch.Tensor]:
+         """
+         Forward pass with multimodal inputs
+
+         Args:
+             input_ids: [batch_size, seq_len]
+             attention_mask: [batch_size, seq_len]
+             images: [batch_size, 3, height, width] or None
+             labels: [batch_size, seq_len] or None
+
+         Returns:
+             Dictionary with loss and logits
+         """
+         if images is not None:
+             # Encode images and project to language space
+             image_features = self.encode_images(images)  # [batch_size, language_dim]
+
+             # Replace <image> tokens with actual image features
+             input_embeds, attention_mask, labels = self._merge_image_features(
+                 input_ids, image_features, attention_mask, labels
+             )
+
+             # Forward through language model with merged embeddings
+             outputs = self.language_model(
+                 inputs_embeds=input_embeds,
+                 attention_mask=attention_mask,
+                 labels=labels,
+             )
+         else:
+             # Standard text-only forward pass
+             outputs = self.language_model(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 labels=labels,
+             )
+
+         return {
+             "loss": outputs.loss,
+             "logits": outputs.logits,
+         }
+
+     def _merge_image_features(
+         self,
+         input_ids: torch.Tensor,
+         image_features: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         labels: Optional[torch.Tensor] = None,
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         """
+         Merge image features with text embeddings at <image> token positions
+
+         Args:
+             input_ids: [batch_size, seq_len]
+             image_features: [batch_size, language_dim]
+             attention_mask: [batch_size, seq_len]
+             labels: [batch_size, seq_len]
+
+         Returns:
+             input_embeds: [batch_size, seq_len, hidden_size]
+             attention_mask: [batch_size, seq_len]
+             labels: [batch_size, seq_len]
+         """
+         batch_size, seq_len = input_ids.shape
+
+         # Get text embeddings
+         text_embeds = self.language_model.get_input_embeddings()(input_ids)
+
+         # Find positions of <image> tokens
+         image_token_mask = (input_ids == self.image_token_id)
+
+         # Replace <image> token embeddings with projected image features
+         for batch_idx in range(batch_size):
+             image_positions = torch.where(image_token_mask[batch_idx])[0]
+
+             if len(image_positions) > 0:
+                 # Use the first <image> token position (assuming one image per sample)
+                 img_pos = image_positions[0]
+                 text_embeds[batch_idx, img_pos] = image_features[batch_idx]
+
+         return text_embeds, attention_mask, labels
+
+     def generate(
+         self,
+         input_ids: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         images: Optional[torch.Tensor] = None,
+         max_new_tokens: int = 150,
+         temperature: float = 0.7,
+         do_sample: bool = True,
+         **kwargs
+     ) -> torch.Tensor:
+         """Generate text with multimodal context"""
+
+         if images is not None:
+             # Encode images and merge with text embeddings
+             image_features = self.encode_images(images)
+             input_embeds, attention_mask, _ = self._merge_image_features(
+                 input_ids, image_features, attention_mask, None
+             )
+
+             # Generate using language model with merged embeddings
+             with torch.no_grad():
+                 outputs = self.language_model.generate(
+                     inputs_embeds=input_embeds,
+                     attention_mask=attention_mask,
+                     max_new_tokens=max_new_tokens,
+                     temperature=temperature,
+                     do_sample=do_sample,
+                     pad_token_id=self.tokenizer.pad_token_id,
+                     eos_token_id=self.tokenizer.eos_token_id,
+                     **kwargs
+                 )
+         else:
+             # Standard text-only generation
+             with torch.no_grad():
+                 outputs = self.language_model.generate(
+                     input_ids=input_ids,
+                     attention_mask=attention_mask,
+                     max_new_tokens=max_new_tokens,
+                     temperature=temperature,
+                     do_sample=do_sample,
+                     pad_token_id=self.tokenizer.pad_token_id,
+                     eos_token_id=self.tokenizer.eos_token_id,
+                     **kwargs
+                 )
+
+         return outputs
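
Note that src/models/projectors.py is imported above but is not part of this upload. For reference, a minimal sketch of a VisionProjector compatible with the constructor call in `_setup_projectors` (keyword arguments vision_dim, language_dim, hidden_dim; pooled CLIP features in, language-model-sized features out) could look like the following. The two-layer MLP with GELU is an assumption, not the shipped implementation.

    import torch.nn as nn

    class VisionProjector(nn.Module):
        # Hypothetical sketch only; the actual projectors.py is not included in this commit.
        def __init__(self, vision_dim: int, language_dim: int, hidden_dim: int):
            super().__init__()
            self.proj = nn.Sequential(
                nn.Linear(vision_dim, hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, language_dim),
            )

        def forward(self, image_features):
            # [batch_size, vision_dim] -> [batch_size, language_dim]
            return self.proj(image_features)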
src/utils/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .config import load_config, merge_configs
+ from .logging import setup_logging
+
+ __all__ = [
+     "load_config",
+     "merge_configs",
+     "setup_logging"
+ ]
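
src/utils/logging.py is referenced here but not included in this upload. A minimal stand-in that would satisfy the import could be as simple as the sketch below; the format string is an assumption.

    import logging

    def setup_logging(level: int = logging.INFO) -> None:
        # Hypothetical placeholder for src/utils/logging.py (not part of this commit)
        logging.basicConfig(
            level=level,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        )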
src/utils/config.py ADDED
@@ -0,0 +1,110 @@
+ """
+ Configuration utilities
+ """
+ import yaml
+ from pathlib import Path
+ from typing import Dict, Any, List, Union
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_config(config_path: Union[str, Path]) -> Dict[str, Any]:
+     """Load configuration from YAML file"""
+     config_path = Path(config_path)
+
+     if not config_path.exists():
+         raise FileNotFoundError(f"Configuration file not found: {config_path}")
+
+     try:
+         with open(config_path, 'r', encoding='utf-8') as file:
+             config = yaml.safe_load(file)
+
+         logger.info(f"Loaded configuration from: {config_path}")
+         return config
+
+     except Exception as e:
+         logger.error(f"Failed to load configuration from {config_path}: {e}")
+         raise
+
+
+ def merge_configs(configs: List[Dict[str, Any]]) -> Dict[str, Any]:
+     """Merge multiple configuration dictionaries"""
+     merged = {}
+
+     for config in configs:
+         merged.update(config)
+
+     logger.info(f"Merged {len(configs)} configuration files")
+     return merged
+
+
+ def save_config(config: Dict[str, Any], save_path: Union[str, Path]) -> None:
+     """Save configuration to YAML file"""
+     save_path = Path(save_path)
+     save_path.parent.mkdir(parents=True, exist_ok=True)
+
+     try:
+         with open(save_path, 'w', encoding='utf-8') as file:
+             yaml.dump(config, file, default_flow_style=False, indent=2)
+
+         logger.info(f"Saved configuration to: {save_path}")
+
+     except Exception as e:
+         logger.error(f"Failed to save configuration to {save_path}: {e}")
+         raise
+
+
+ def validate_config(config: Dict[str, Any]) -> bool:
+     """Validate configuration structure"""
+     required_sections = ["model", "training", "data"]
+
+     for section in required_sections:
+         if section not in config:
+             logger.error(f"Missing required configuration section: {section}")
+             return False
+
+     # Validate model config
+     model_config = config["model"]
+     required_model_keys = ["gemma_model_name", "vision_model_name", "lora"]
+     for key in required_model_keys:
+         if key not in model_config:
+             logger.error(f"Missing required model config key: {key}")
+             return False
+
+     # Validate training config
+     training_config = config["training"]
+     required_training_keys = ["max_epochs", "batch_size", "lora_lr", "projector_lr"]
+     for key in required_training_keys:
+         if key not in training_config:
+             logger.error(f"Missing required training config key: {key}")
+             return False
+
+     # Validate data config
+     data_config = config["data"]
+     required_data_keys = ["dataset_name", "max_length", "image_size"]
+     for key in required_data_keys:
+         if key not in data_config:
+             logger.error(f"Missing required data config key: {key}")
+             return False
+
+     logger.info("Configuration validation passed")
+     return True
+
+
+ def update_config(config: Dict[str, Any], updates: Dict[str, Any]) -> Dict[str, Any]:
+     """Update configuration with new values"""
+     def deep_update(base_dict, update_dict):
+         """Recursively update nested dictionaries"""
+         for key, value in update_dict.items():
+             if isinstance(value, dict) and key in base_dict and isinstance(base_dict[key], dict):
+                 deep_update(base_dict[key], value)
+             else:
+                 base_dict[key] = value
+
+     import copy
+     updated_config = copy.deepcopy(config)
+     deep_update(updated_config, updates)
+
+     logger.info("Configuration updated")
+     return updated_config
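
To make the shape that validate_config expects concrete, here is an illustrative merged configuration written as a Python dict (the form that load_config plus merge_configs would produce). Every value is an assumption chosen for demonstration; the real settings live in the repo's configs/*.yaml files.

    example_config = {
        "model": {
            "gemma_model_name": "google/gemma-3-270m",             # placeholder model id
            "vision_model_name": "openai/clip-vit-large-patch14",  # placeholder vision encoder
            "use_4bit": False,
            "projector_hidden_dim": 1024,
            "lora": {
                "r": 16,
                "alpha": 32,
                "dropout": 0.05,
                "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
            },
        },
        "training": {
            "max_epochs": 1,
            "batch_size": 8,
            "lora_lr": 2e-4,
            "projector_lr": 1e-3,
        },
        "data": {
            "dataset_name": "llava_instruct_150k",  # placeholder dataset name
            "max_length": 256,
            "image_size": 224,
        },
        "special_tokens": {"image_token": "<image>"},
    }

    # Should pass the structural checks defined above
    assert validate_config(example_config)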