Spaces:
Running
Running
| import gradio as gr | |
| import os | |
| import pycurl | |
| import io | |
| import json | |
| import shutil | |
| import tempfile | |
| import certifi | |
| from concurrent.futures import ThreadPoolExecutor | |
| from pathlib import Path | |
| from typing import Optional, List | |
| from zipfile import ZipFile | |
| IIIF_URL = "https://lbiiif.riksarkivet.se" | |
def perform_curl_request(url: str) -> bytes:
    """Fetch *url* with pycurl and return the raw response body.

    The transfer verifies TLS against certifi's CA bundle, follows at most
    five redirects, allows 5 seconds to connect and 10 seconds in total, and
    sends a fixed curl-like header set.

    Raises:
        Exception: with the message ``HTTP <code>`` when the final response
            status is not 200. Errors raised by pycurl itself propagate
            unchanged.
    """
    # Header set the original author found necessary for the Swedish
    # National Archives server.
    request_headers = [
        'User-Agent: curl/8.7.1',
        'Accept: */*',
        'Connection: close',  # noted by the author as preventing hanging connections
    ]
    body = io.BytesIO()
    handle = pycurl.Curl()
    try:
        # Applied in this order; the author flags these settings as the fix
        # for slow connections.
        options = (
            (pycurl.URL, url),
            (pycurl.WRITEDATA, body),
            (pycurl.CAINFO, certifi.where()),
            (pycurl.FOLLOWLOCATION, 1),
            (pycurl.MAXREDIRS, 5),
            (pycurl.CONNECTTIMEOUT, 5),
            (pycurl.TIMEOUT, 10),
            (pycurl.NOSIGNAL, 1),
            (pycurl.HTTPHEADER, request_headers),
        )
        for option, value in options:
            handle.setopt(option, value)

        handle.perform()

        status = handle.getinfo(pycurl.RESPONSE_CODE)
        if status == 200:
            return body.getvalue()
        raise Exception(f"HTTP {status}")
    finally:
        # Always release the curl handle, on success and on error alike.
        handle.close()
def get_image_ids(batch_id: str) -> List[str]:
    """Fetch image IDs from IIIF manifest using fast pycurl.

    The manifest is requested from ``{IIIF_URL}/arkis!{batch_id}/manifest``.
    For each entry of the manifest's ``items`` list, the text between the
    first and second ``!`` of its ``id`` is truncated to 14 characters and
    used as the image ID. Entries without a string ``id``, or whose ``id``
    contains no ``!``, are skipped.

    Args:
        batch_id: The batch/manifest identifier.

    Returns:
        The image IDs in manifest order.

    Raises:
        ValueError: if the manifest cannot be fetched or decoded, is not a
            JSON object, or yields no image IDs.
    """
    print(f"Fetching manifest for batch: {batch_id}")
    manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
    print(f"Manifest URL: {manifest_url}")

    try:
        response_data = perform_curl_request(manifest_url)
        manifest = json.loads(response_data.decode('utf-8'))
    except Exception as e:
        # Chain the cause so the transport/parse error stays in the traceback.
        raise ValueError(f"Failed to fetch manifest: {e}") from e

    if not isinstance(manifest, dict):
        # Valid JSON but not an object: report it through the documented
        # ValueError instead of an AttributeError on .get() below.
        raise ValueError("Failed to fetch manifest: response is not a JSON object")

    image_ids = []
    for item in manifest.get("items", []):
        item_id = item.get("id") if isinstance(item, dict) else None
        if not isinstance(item_id, str):
            continue  # malformed entry: skip it rather than abort the batch
        id_parts = item_id.split("!")
        if len(id_parts) > 1:
            image_ids.append(id_parts[1][:14])

    if not image_ids:
        raise ValueError(f"No images found in manifest for batch {batch_id}")

    print(f"Found {len(image_ids)} images in batch {batch_id}")
    return image_ids
def download_image_pycurl(url: str, dest: str) -> bool:
    """Fetch *url* and store the response body in the file at *dest*.

    Returns:
        True when the image was fetched and written. Any exception from the
        request or the file write is caught, reported on stdout, and turned
        into False.
    """
    filename = Path(dest).name
    try:
        payload = perform_curl_request(url)
        Path(dest).write_bytes(payload)
        print(f"✓ Downloaded: {filename}")
    except Exception as error:
        print(f"✗ Failed to download {filename}: {error}")
        return False
    return True
def iiif_download_batch(
    batch_id: str,
    start_index: int = 1,
    end_index: Optional[int] = None,
    max_workers: int = 10  # Increased from 5 for faster downloads
) -> Optional[str]:
    """
    Download images from an IIIF batch using fast pycurl and return as zip.

    Progress and errors are reported on stdout. Any ``Exception`` raised
    while fetching, downloading or zipping is caught and reported as a
    ``None`` result. Images that fail to download are left out of the zip;
    the call only fails when no image could be downloaded at all.

    Args:
        batch_id: The batch/manifest ID to download. Surrounding whitespace
            is ignored.
        start_index: Starting image number (1-based). None or 0 means 1.
        end_index: Ending image number (inclusive). None = download all
        max_workers: Number of concurrent downloads (default 10)

    Returns:
        Path to zip file for download or None if failed
    """
    print("\n=== Starting Fast IIIF Download ===")
    print(f"Batch ID: {batch_id}")
    print(f"Start index: {start_index}")
    print(f"End index: {end_index}")
    print(f"Workers: {max_workers}")

    # Remembered so that every failure path can remove what was created.
    temp_dir: Optional[Path] = None

    try:
        if isinstance(batch_id, str):
            batch_id = batch_id.strip()

        # Handle None or empty batch_id gracefully
        if not batch_id:
            print("Warning: No batch ID provided")
            return None

        all_image_ids = get_image_ids(batch_id)
        total_images = len(all_image_ids)

        # start_index may be None (e.g. a cleared input field) and either
        # bound may be a float; coerce so arithmetic and slicing cannot fail.
        first = int(start_index) if start_index else 1
        start_idx = max(0, first - 1)
        end_idx = int(end_index) if end_index else total_images
        end_idx = min(end_idx, total_images)

        image_ids = all_image_ids[start_idx:end_idx]

        if not image_ids:
            raise ValueError(f"No images in specified range {start_index}-{end_index}")

        print(f"Downloading {len(image_ids)} images (range: {first}-{end_idx} of {total_images} total)")

        # batch_id is caller-supplied: keep only characters that are safe in
        # a single path component so "../" or an absolute path cannot make
        # the working directory or the zip land outside temp_dir.
        safe_name = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in str(batch_id)
        )

        temp_dir = Path(tempfile.mkdtemp())
        batch_dir = temp_dir / safe_name
        batch_dir.mkdir(exist_ok=True)
        print(f"Temp directory: {temp_dir}")

        # Download images concurrently with pycurl
        def download_single(image_id: str):
            url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
            dest = batch_dir / f"{image_id}.jpg"
            success = download_image_pycurl(url, str(dest))
            return image_id, success

        downloaded = []
        failed = []

        print(f"Starting concurrent downloads ({max_workers} workers)...")
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            for image_id, success in executor.map(download_single, image_ids):
                (downloaded if success else failed).append(image_id)

        if not downloaded:
            print("No images were successfully downloaded")
            shutil.rmtree(temp_dir, ignore_errors=True)
            return None

        # Create zip file
        range_suffix = f"_{first}-{end_idx}" if end_index else "_all"
        zip_path = temp_dir / f"{safe_name}{range_suffix}.zip"

        print(f"Creating zip file: {zip_path.name}")
        with ZipFile(zip_path, 'w') as zipf:
            for image_id in downloaded:
                img_path = batch_dir / f"{image_id}.jpg"
                if img_path.exists():
                    zipf.write(img_path, arcname=f"{image_id}.jpg")

        # The individual JPEGs are now inside the archive; drop the loose
        # copies so each request leaves only the zip behind. The zip itself
        # must outlive this call because its path is the return value.
        shutil.rmtree(batch_dir, ignore_errors=True)

        print(f"✓ Success! Downloaded {len(downloaded)}/{len(image_ids)} images")
        if failed:
            print(f"⚠ Failed downloads: {len(failed)} images")
        print(f"Zip file created: {zip_path}")
        print("=== Download Complete ===\n")

        return str(zip_path)

    except Exception as e:
        print(f"ERROR: {e}")
        print("=== Download Failed ===\n")
        if temp_dir is not None:
            # Nothing is returned to the caller, so remove all partial output.
            shutil.rmtree(temp_dir, ignore_errors=True)
        return None
# Assemble the Gradio UI that exposes iiif_download_batch
print("Creating Gradio interface...")

# Input widgets, in the positional order of iiif_download_batch's first three
# parameters: batch_id, start_index, end_index.
_INPUT_COMPONENTS = [
    gr.Textbox(
        label="Batch ID",
        placeholder="Enter 8-digit batch ID (e.g., C0000263)",
        info="The IIIF manifest/batch identifier",
    ),
    gr.Number(
        label="Start Image",
        value=1,
        minimum=1,
        precision=0,
        info="First image to download (1 = first image)",
    ),
    gr.Number(
        label="End Image (Optional)",
        value=None,
        minimum=1,
        precision=0,
        info="Last image to download (leave empty for all)",
    ),
]

# Example rows: [batch ID, start image, end image]
_EXAMPLE_ROWS = [
    ["R0001210", 1, 10],   # Real batch - Download first 10
    ["R0001210", 11, 20],  # Real batch - Download images 11-20
    ["R0001210", 1, 50],   # Real batch - Download first 50
]

demo = gr.Interface(
    fn=iiif_download_batch,
    inputs=_INPUT_COMPONENTS,
    outputs=gr.File(label="Download Zip"),
    title="Fast IIIF Batch Downloader (PycURL)",
    description="High-speed downloader for Swedish National Archives IIIF manifests using optimized PycURL requests.",
    examples=_EXAMPLE_ROWS,
    cache_examples=False,
    api_name="iiif_download_batch",
)
if __name__ == "__main__":
    # Startup banner, including a reminder of the two third-party packages
    # the download helpers rely on.
    for banner_line in (
        "Launching Gradio app with MCP server enabled...",
        "Note: Make sure pycurl and certifi are installed:",
        " pip install pycurl certifi",
    ):
        print(banner_line)

    # Launch settings: MCP server on; public share link, debug mode and SSR
    # mode all off.
    launch_options = {
        "mcp_server": True,
        "share": False,
        "debug": False,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)