Spaces:

Shak33l-UiRev
/

Ui-Rev-Doc-Model

Runtime error

App Files Files Community

Ui-Rev-Doc-Model / app.py

Shak33l-UiRev

updated device management

dea33ff verified 12 months ago

raw

history blame

18 kB

	import streamlit as st
	from PIL import Image
	import torch
	from transformers import (
	DonutProcessor,
	VisionEncoderDecoderModel,
	LayoutLMv3Processor,
	LayoutLMv3ForSequenceClassification,
	AutoProcessor,
	AutoModelForCausalLM
	)
	from ultralytics import YOLO
	import io
	import base64
	import json
	from datetime import datetime

	@st.cache_resource
	def _load_model_components(model_name):
	    """Build the model components for ``model_name``; successful results are cached.

	    Unlike ``load_model`` this helper lets exceptions propagate. That matters
	    because ``st.cache_resource`` stores whatever the function returns: if the
	    error were swallowed here and ``None`` returned, the failure itself would be
	    cached and every later call would get ``None`` without retrying.

	    Args:
	        model_name (str): "Donut", "LayoutLMv3", or "OmniParser".

	    Returns:
	        dict: Component dictionary (see ``load_model``).

	    Raises:
	        ValueError: If ``model_name`` is not one of the supported names.
	        Exception: Whatever the underlying ``from_pretrained`` / ``YOLO`` call raises.
	    """
	    if model_name == "Donut":
	        processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base")
	        model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base")
	        # Keep the model config in sync with the tokenizer that drives decoding.
	        model.config.decoder_start_token_id = processor.tokenizer.bos_token_id
	        model.config.pad_token_id = processor.tokenizer.pad_token_id
	        model.config.vocab_size = len(processor.tokenizer)
	        return {'model': model, 'processor': processor}

	    if model_name == "LayoutLMv3":
	        processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
	        model = LayoutLMv3ForSequenceClassification.from_pretrained("microsoft/layoutlmv3-base")
	        return {'model': model, 'processor': processor}

	    if model_name == "OmniParser":
	        # YOLO model for icon detection.
	        # NOTE(review): YOLO() is given a Hub repo id here rather than a weights
	        # file path; confirm this resolves to the detector weights as intended.
	        yolo_model = YOLO("microsoft/OmniParser")

	        # Florence-2 processor used to prepare crops for captioning.
	        processor = AutoProcessor.from_pretrained(
	            "microsoft/Florence-2-base",
	            trust_remote_code=True
	        )

	        # Captioning model.
	        # NOTE(review): loaded from the repo root; verify the captioning weights
	        # are actually resolvable from "microsoft/OmniParser" without a subfolder.
	        caption_model = AutoModelForCausalLM.from_pretrained(
	            "microsoft/OmniParser",
	            trust_remote_code=True
	        )

	        return {
	            'yolo': yolo_model,
	            'processor': processor,
	            'model': caption_model
	        }

	    raise ValueError(f"Unknown model name: {model_name}")


	def load_model(model_name):
	    """Load the selected model and processor.

	    Successful loads are cached across Streamlit reruns. Failures are reported
	    with ``st.error`` and are NOT cached, so calling again retries the load.

	    Args:
	        model_name (str): Name of the model to load ("Donut", "LayoutLMv3", or "OmniParser")

	    Returns:
	        dict | None: ``{'model', 'processor'}`` for Donut and LayoutLMv3,
	        ``{'yolo', 'processor', 'model'}`` for OmniParser, or ``None`` when
	        loading failed (including an unknown ``model_name``).
	    """
	    try:
	        return _load_model_components(model_name)
	    except Exception as e:
	        # UI boundary: surface the problem to the user instead of crashing the app.
	        st.error(f"Error loading model {model_name}: {str(e)}")
	        return None


	# Preserve the cache-control handle callers would have had on the decorated function.
	load_model.clear = _load_model_components.clear

	try:
	    # The decorator below needs `spaces`, which the file never imports; without
	    # this the module fails with NameError before any UI is drawn.
	    import spaces
	    _gpu_task = spaces.GPU
	except ImportError:
	    def _gpu_task(func):
	        """No-op stand-in used when the `spaces` package is not installed."""
	        return func


	def _analyze_with_omniparser(image, models_dict):
	    """Detect regions with the YOLO component and caption each cropped region.

	    Returns:
	        dict: ``{"detected_elements": int, "elements": [ {bbox, confidence, class, caption} ]}``
	    """
	    import os
	    import tempfile

	    # Configure detection parameters
	    box_threshold = 0.05  # Confidence threshold for detection
	    iou_threshold = 0.1  # IoU threshold for NMS

	    processor = models_dict['processor']
	    caption_model = models_dict['model']
	    results = []

	    # A private temporary directory replaces the fixed "temp_image.png" in the
	    # working directory: concurrent sessions no longer overwrite each other's
	    # file, and the file is removed even when detection or captioning raises.
	    with tempfile.TemporaryDirectory() as temp_dir:
	        temp_path = os.path.join(temp_dir, "temp_image.png")
	        image.save(temp_path)

	        # Run YOLO detection
	        yolo_results = models_dict['yolo'](
	            temp_path,
	            conf=box_threshold,
	            iou=iou_threshold
	        )

	        # Each detection row unpacks into box corners, confidence and class id.
	        for det in yolo_results[0].boxes.data:
	            x1, y1, x2, y2, conf, cls = det

	            # Region of interest for this detection
	            roi = image.crop((int(x1), int(y1), int(x2), int(y2)))

	            # NOTE(review): the processor is called without a text/task prompt;
	            # confirm the captioning processor accepts image-only input.
	            inputs = processor(
	                images=roi,
	                return_tensors="pt"
	            )

	            outputs = caption_model.generate(
	                **inputs,
	                max_length=50,
	                num_beams=4,
	                temperature=0.7
	            )

	            caption = processor.decode(outputs[0], skip_special_tokens=True)

	            results.append({
	                "bbox": [float(x) for x in [x1, y1, x2, y2]],
	                "confidence": float(conf),
	                "class": int(cls),
	                "caption": caption
	            })

	    return {
	        "detected_elements": len(results),
	        "elements": results
	    }


	def _analyze_with_donut(image, models_dict):
	    """Generate a sequence with Donut and return it parsed as JSON when possible.

	    Returns:
	        The ``json.loads`` result of the generated text, or
	        ``{"raw_text": <generated text>}`` when that text is not valid JSON.
	    """
	    processor = models_dict['processor']
	    tokenizer = processor.tokenizer

	    pixel_values = processor(image, return_tensors="pt").pixel_values

	    task_prompt = "<s_cord>analyze the document and extract information</s_cord>"
	    decoder_input_ids = tokenizer(
	        task_prompt,
	        add_special_tokens=False,
	        return_tensors="pt"
	    ).input_ids

	    outputs = models_dict['model'].generate(
	        pixel_values,
	        decoder_input_ids=decoder_input_ids,
	        max_length=512,
	        early_stopping=True,
	        pad_token_id=tokenizer.pad_token_id,
	        eos_token_id=tokenizer.eos_token_id,
	        use_cache=True,
	        num_beams=4,
	        # Forbid the unknown token from being generated.
	        bad_words_ids=[[tokenizer.unk_token_id]],
	        return_dict_in_generate=True
	    )

	    sequence = processor.batch_decode(outputs.sequences)[0]
	    # Strip the echoed prompt and closing task tag before attempting to parse.
	    sequence = sequence.replace(task_prompt, "").replace("</s_cord>", "").strip()

	    try:
	        return json.loads(sequence)
	    except json.JSONDecodeError:
	        return {"raw_text": sequence}


	def _analyze_with_layoutlmv3(image, models_dict):
	    """Run LayoutLMv3 and report predicted labels with their confidences.

	    Returns:
	        dict: ``{"predictions": [{"text", "label"}], "confidence_scores": [float]}``.
	        With a sequence-classification head there is exactly one prediction whose
	        "text" is the decoded document text; with per-token logits there is one
	        prediction per non-special token.
	    """
	    processor = models_dict['processor']
	    tokenizer = processor.tokenizer

	    # `return_offsets_mapping` is deliberately not requested: it would add an
	    # `offset_mapping` entry that gets forwarded by `**encoded_inputs` below,
	    # and the model's forward() does not accept that argument.
	    encoded_inputs = processor(
	        image,
	        return_tensors="pt",
	        add_special_tokens=True
	    )

	    outputs = models_dict['model'](**encoded_inputs)
	    logits = outputs.logits
	    best_scores = logits.softmax(-1).max(-1).values
	    best_labels = logits.argmax(-1)
	    token_ids = encoded_inputs.input_ids[0].tolist()

	    if logits.dim() == 3:
	        # Per-token logits: pair every token with its own label.
	        tokens = tokenizer.convert_ids_to_tokens(token_ids)
	        return {
	            "predictions": [
	                {"text": token, "label": label}
	                for token, label in zip(tokens, best_labels[0].tolist())
	                if token not in ["<s>", "</s>", "<pad>"]
	            ],
	            "confidence_scores": best_scores[0].tolist()
	        }

	    # Sequence-classification logits have one row per document, so there is a
	    # single label for the whole input (zipping it against tokens would fail).
	    # NOTE(review): the "-base" checkpoint presumably has no fine-tuned
	    # classification head, so the label ids may not be meaningful - confirm.
	    return {
	        "predictions": [
	            {
	                "text": tokenizer.decode(token_ids, skip_special_tokens=True),
	                "label": int(best_labels[0])
	            }
	        ],
	        "confidence_scores": [float(best_scores[0])]
	    }


	@_gpu_task
	@torch.inference_mode()
	def analyze_document(image, model_name, models_dict):
	    """Analyze document using selected model

	    Never raises: every failure is converted into an error dictionary.

	    Args:
	        image (PIL.Image): Input image to analyze
	        model_name (str): Name of the model to use ("Donut", "LayoutLMv3", or "OmniParser")
	        models_dict (dict): Dictionary containing loaded model components
	            (as returned by ``load_model``), or ``None`` if loading failed

	    Returns:
	        dict: Analysis results including detected elements, text, and/or coordinates.
	        On failure: ``{"error": str, "type": "model_error" | "processing_error"}``,
	        plus a "details" traceback string for processing errors.
	    """
	    try:
	        if models_dict is None:
	            return {"error": "Model failed to load", "type": "model_error"}

	        if model_name == "OmniParser":
	            analyzer = _analyze_with_omniparser
	        elif model_name == "Donut":
	            analyzer = _analyze_with_donut
	        elif model_name == "LayoutLMv3":
	            analyzer = _analyze_with_layoutlmv3
	        else:
	            return {"error": f"Unknown model: {model_name}", "type": "model_error"}

	        # Uploaded PNGs can be RGBA, palette or greyscale; hand every model a
	        # plain 3-channel image so processing does not depend on the file's mode.
	        if image.mode != "RGB":
	            image = image.convert("RGB")

	        return analyzer(image, models_dict)

	    except Exception as e:
	        import traceback
	        error_details = traceback.format_exc()
	        return {
	            "error": str(e),
	            "type": "processing_error",
	            "details": error_details
	        }

	# Page-level settings: wide layout with the sidebar opened on first load
	PAGE_SETTINGS = {
	    "page_title": "Document Analysis Comparison",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**PAGE_SETTINGS)

	# Stylesheet injected into the page (alert spacing, upload text, model info panel)
	CUSTOM_CSS = """
	<style>
	.stAlert {
	margin-top: 1rem;
	}
	.upload-text {
	font-size: 1.2rem;
	margin-bottom: 1rem;
	}
	.model-info {
	padding: 1rem;
	border-radius: 0.5rem;
	background-color: #f8f9fa;
	}
	</style>
	"""
	st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

	# Page heading followed by a short introduction
	st.title("Document Understanding Model Comparison")
	INTRO_TEXT = """
	Compare different models for document analysis and understanding.
	Upload an image and select a model to analyze it.
	"""
	st.markdown(INTRO_TEXT)

	# Create two columns for layout: upload/preview on the left, model choice on the right
	col1, col2 = st.columns([1, 1])

	with col1:
	    # Only raster formats are offered. The file is decoded with PIL's
	    # Image.open below, which cannot read PDF documents, so advertising PDF
	    # only led to a guaranteed "Error loading image" after upload.
	    uploaded_file = st.file_uploader(
	        "Choose a document image",
	        type=['png', 'jpg', 'jpeg'],
	        help="Supported formats: PNG, JPEG"
	    )

	    # Always bind `image` so later code can test it instead of hitting a
	    # NameError when nothing was uploaded or decoding failed.
	    image = None

	    if uploaded_file is not None:
	        try:
	            # Display uploaded image
	            image = Image.open(uploaded_file)
	            st.image(image, caption='Uploaded Document', use_column_width=True)
	        except Exception as e:
	            # A partially opened image is not usable for analysis.
	            image = None
	            st.error(f"Error loading image: {str(e)}")

	with col2:
	    # Catalogue of selectable models; the keys double as the selectbox options
	    model_info = {}
	    model_info["Donut"] = {
	        "description": "Best for structured OCR and document format understanding",
	        "memory": "6-8GB",
	        "strengths": ["Structured OCR", "Memory efficient", "Good with fixed formats"],
	        "best_for": ["Invoices", "Forms", "Structured documents", "Tables"],
	    }
	    model_info["LayoutLMv3"] = {
	        "description": "Strong layout understanding with reasoning capabilities",
	        "memory": "12-15GB",
	        "strengths": ["Layout understanding", "Reasoning", "Pre-trained knowledge"],
	        "best_for": ["Complex documents", "Mixed layouts", "Documents with tables", "Multi-column text"],
	    }
	    model_info["OmniParser"] = {
	        "description": "General screen parsing tool for UI understanding",
	        "memory": "8-10GB",
	        "strengths": ["UI element detection", "Interactive element recognition", "Function description"],
	        "best_for": ["Screenshots", "UI analysis", "Interactive elements", "Web interfaces"],
	    }

	    selected_model = st.selectbox("Select Model", list(model_info))

	    # Details panel for whichever model is currently selected
	    st.markdown("### Model Details")
	    with st.expander("Model Information", expanded=True):
	        chosen = model_info[selected_model]
	        st.markdown(f"Description: {chosen['description']}")
	        st.markdown(f"Memory Required: {chosen['memory']}")
	        # Each section is a heading line followed by one bullet per entry
	        for heading, field in (("Strengths:", "strengths"), ("Best For:", "best_for")):
	            st.markdown(heading)
	            for entry in chosen[field]:
	                st.markdown(f"- {entry}")

	# Analysis section: runs the selected model on the uploaded file when the
	# button is pressed, showing progress/results on the left and a timestamped
	# debug log on the right.
	if uploaded_file is not None and selected_model:
	    if st.button("Analyze Document", help="Click to start document analysis"):
	        # Create two columns for results and debug info
	        result_col, debug_col = st.columns([1, 1])

	        # The widgets and debug helpers are created before the try block so the
	        # except handler below can always rely on them being defined.
	        with result_col:
	            st.markdown("### Analysis Progress")
	            progress_bar = st.progress(0)

	        with debug_col:
	            st.markdown("### Debug Information")
	            debug_container = st.empty()

	        debug_messages = []

	        def update_debug(message, level="info"):
	            """Format one debug line as a timestamped, colour-coded HTML div."""
	            from html import escape

	            timestamp = datetime.now().strftime("%H:%M:%S.%f")[:-3]
	            color = {
	                "info": "blue",
	                "warning": "orange",
	                "error": "red",
	                "success": "green"
	            }.get(level, "black")

	            # The log is rendered with unsafe_allow_html, so the text must be
	            # escaped: otherwise values such as "<class 'dict'>" are treated as
	            # HTML tags and disappear, and exception text could inject markup.
	            return f"<div style='color: {color};'>[{timestamp}] {escape(str(message))}</div>"

	        def add_debug(message, level="info"):
	            """Append a line to the debug log and redraw the whole log."""
	            debug_messages.append(update_debug(message, level))
	            debug_container.markdown(
	                "\n".join(debug_messages),
	                unsafe_allow_html=True
	            )

	        with st.spinner('Processing...'):
	            try:
	                # Load model with progress update
	                with result_col:
	                    progress_bar.progress(25)
	                    st.info("Loading model...")

	                add_debug(f"Loading {selected_model} model and processor...")
	                # load_model returns a dict of components (or None on failure),
	                # not a (model, processor) pair.
	                models_dict = load_model(selected_model)

	                if models_dict is None:
	                    with result_col:
	                        st.error("Failed to load model. Please try again.")
	                    add_debug("Model loading failed!", "error")
	                else:
	                    add_debug("Model loaded successfully", "success")
	                    add_debug(f"Model device: {next(models_dict['model'].parameters()).device}")
	                    if torch.cuda.is_available():
	                        add_debug(f"Model memory usage: {torch.cuda.max_memory_allocated()/1024**2:.2f}MB")

	                    # Update progress
	                    with result_col:
	                        progress_bar.progress(50)
	                        st.info("Analyzing document...")

	                    # Decode the upload again from its bytes so this step does not
	                    # depend on the preview above having succeeded.
	                    document_image = Image.open(io.BytesIO(uploaded_file.getvalue()))

	                    # Log image details
	                    add_debug(f"Image size: {document_image.size}")
	                    add_debug(f"Image mode: {document_image.mode}")

	                    # Analyze document (signature: image, model_name, models_dict)
	                    add_debug("Starting document analysis...")
	                    results = analyze_document(document_image, selected_model, models_dict)
	                    analysis_failed = isinstance(results, dict) and "error" in results
	                    if not analysis_failed:
	                        add_debug("Analysis completed", "success")

	                    # Update progress
	                    with result_col:
	                        progress_bar.progress(75)
	                        st.markdown("### Analysis Results")

	                        if analysis_failed:
	                            st.error(f"Analysis Error: {results['error']}")
	                            add_debug(f"Analysis error: {results['error']}", "error")
	                        else:
	                            # Pretty print the results in results column
	                            st.json(results)

	                            # Show detailed results breakdown in debug column
	                            add_debug("Results breakdown:", "info")
	                            if isinstance(results, dict):
	                                for key, value in results.items():
	                                    add_debug(f"- {key}: {type(value)}")
	                            else:
	                                add_debug(f"Result type: {type(results)}")

	                            # Only a successful run is reported as completed.
	                            progress_bar.progress(100)
	                            st.success("Analysis completed!")

	                    if not analysis_failed:
	                        # Final debug info
	                        add_debug("Process completed successfully", "success")
	                        with debug_col:
	                            if torch.cuda.is_available():
	                                # The utilization query can raise (e.g. when the NVML
	                                # bindings it relies on are missing); a failed metric
	                                # must not turn a finished analysis into an error.
	                                try:
	                                    gpu_utilization = f"{torch.cuda.utilization()}%"
	                                except Exception:
	                                    gpu_utilization = "n/a"
	                                st.markdown("### Resource Usage")
	                                st.markdown(f"""
	- GPU Memory: {torch.cuda.max_memory_allocated()/1024**2:.2f}MB
	- GPU Utilization: {gpu_utilization}
	""")

	            except Exception as e:
	                import traceback

	                # Write the traceback to stderr so the message below is true.
	                traceback.print_exc()
	                with result_col:
	                    st.error(f"Error during analysis: {str(e)}")
	                add_debug(f"Error: {str(e)}", "error")
	                add_debug(f"Error type: {type(e)}", "error")
	                add_debug("Traceback available in logs", "warning")

	# Usage notes and limitations shown under the main columns
	st.markdown("""
	---
	### Usage Notes:
	- Different models excel at different types of documents
	- Processing time and memory requirements vary by model
	- Image quality significantly affects results
	- Some models may require specific document formats
	""")

	# Optional performance metrics table.
	# The column separators are plain pipes: the previous backslash-pipe form is an
	# invalid Python escape sequence and, in Markdown, marks the pipe as literal
	# text, so the table was never rendered as a table.
	if st.checkbox("Show Performance Metrics"):
	    st.markdown("""
	### Model Performance Metrics
	| Model | Avg. Processing Time | Memory Usage | Accuracy* |
	|-------|---------------------|--------------|-----------|
	| Donut | 2-3 seconds | 6-8GB | 85-90% |
	| LayoutLMv3 | 3-4 seconds | 12-15GB | 88-93% |
	| OmniParser | 2-3 seconds | 8-10GB | 85-90% |

	*Accuracy varies based on document type and quality
	""")

	# Footer with version and hosting information
	st.markdown("---")
	st.markdown("""
	v1.1 - Created with Streamlit
	\nPowered by Hugging Face Spaces 🤗
	""")

	# Optional guidance on which model to pick
	if st.checkbox("Show Model Selection Guide"):
	    st.markdown("""
	### How to Choose the Right Model
	1. Donut: Choose for structured documents with clear layouts
	2. LayoutLMv3: Best for documents with complex layouts and relationships
	3. OmniParser: Best for UI elements and screen parsing
	""")