Spaces:

Riksarkivet
/

iiif_downloader

Running

App Files Community

iiif_downloader / app.py

Gabriel

Update app.py

4c2f5c0 verified about 2 months ago

raw

history blame contribute delete

7.74 kB

	import gradio as gr
	import os
	import pycurl
	import io
	import json
	import shutil
	import tempfile
	import certifi
	from concurrent.futures import ThreadPoolExecutor
	from pathlib import Path
	from typing import Optional, List
	from zipfile import ZipFile


	# Base URL of the IIIF server; the manifest and image URLs built in this file all start with it.
	IIIF_URL = "https://lbiiif.riksarkivet.se"

	def perform_curl_request(url: str) -> bytes:
	    """Fast curl request with proper headers to avoid timeouts.

	    Performs a GET on *url* with a fresh pycurl handle, following up to
	    five redirects and verifying TLS against the certifi CA bundle. The
	    handle is always closed, whether the transfer succeeds or not.

	    Args:
	        url: Absolute URL to fetch.

	    Returns:
	        The complete response body as bytes.

	    Raises:
	        Exception: If the final HTTP status is not 200; the message is
	            ``"HTTP <code>"``. Errors raised by pycurl during
	            ``perform()`` are not caught here and propagate unchanged.
	    """
	    buffer = io.BytesIO()
	    c = pycurl.Curl()

	    try:
	        # Critical settings that fix the slow connection issue
	        c.setopt(c.URL, url)
	        c.setopt(c.WRITEDATA, buffer)
	        c.setopt(c.CAINFO, certifi.where())
	        c.setopt(c.FOLLOWLOCATION, 1)
	        c.setopt(c.MAXREDIRS, 5)
	        c.setopt(c.CONNECTTIMEOUT, 5)  # seconds allowed for connecting
	        c.setopt(c.TIMEOUT, 10)  # seconds allowed for the whole transfer
	        # Disable libcurl signal handling; this function is invoked from
	        # worker threads by the batch downloader.
	        c.setopt(c.NOSIGNAL, 1)

	        # These headers are crucial for the Swedish National Archives server
	        c.setopt(c.HTTPHEADER, [
	            'User-Agent: curl/8.7.1',
	            # "*/*" is the wildcard media range; a bare "/" is not a
	            # valid Accept value.
	            'Accept: */*',
	            'Connection: close',  # This prevents hanging connections!
	        ])

	        c.perform()

	        http_code = c.getinfo(c.RESPONSE_CODE)
	        if http_code != 200:
	            raise Exception(f"HTTP {http_code}")

	        return buffer.getvalue()
	    finally:
	        c.close()

	def get_image_ids(batch_id: str) -> List[str]:
	    """Fetch image IDs from IIIF manifest using fast pycurl.

	    Downloads ``{IIIF_URL}/arkis!{batch_id}/manifest``, parses it as JSON
	    and, for every entry of the top-level ``items`` list, takes the first
	    14 characters of the text that follows the first ``!`` in the entry's
	    ``id``. Entries without a usable string ``id`` are skipped.

	    Args:
	        batch_id: The batch/manifest identifier inserted into the URL.

	    Returns:
	        The extracted image IDs, in manifest order.

	    Raises:
	        ValueError: If the manifest cannot be fetched or decoded, is not
	            a JSON object, or yields no image IDs.
	    """
	    print(f"Fetching manifest for batch: {batch_id}")

	    manifest_url = f"{IIIF_URL}/arkis!{batch_id}/manifest"
	    print(f"Manifest URL: {manifest_url}")

	    try:
	        response_data = perform_curl_request(manifest_url)
	        manifest = json.loads(response_data.decode('utf-8'))
	    except Exception as e:
	        # Keep the original error attached as __cause__ for debugging.
	        raise ValueError(f"Failed to fetch manifest: {e}") from e

	    if not isinstance(manifest, dict):
	        raise ValueError(f"Failed to fetch manifest: unexpected JSON structure for batch {batch_id}")

	    image_ids = []

	    # "items" may be absent or null; both mean "no entries".
	    for item in manifest.get("items") or []:
	        item_id = item.get("id") if isinstance(item, dict) else None
	        if not isinstance(item_id, str):
	            continue
	        id_parts = item_id.split("!")
	        if len(id_parts) > 1:
	            image_id = id_parts[1][:14]
	            if image_id:  # an empty fragment cannot form an image URL
	                image_ids.append(image_id)

	    if not image_ids:
	        raise ValueError(f"No images found in manifest for batch {batch_id}")

	    print(f"Found {len(image_ids)} images in batch {batch_id}")
	    return image_ids

	def download_image_pycurl(url: str, dest: str) -> bool:
	    """Fetch one image with pycurl and store it on disk.

	    Args:
	        url: Address of the image to retrieve.
	        dest: File path the received bytes are written to.

	    Returns:
	        True when the image was fetched and written; False when any
	        exception occurred (the failure is printed, never raised).
	    """
	    succeeded = False
	    try:
	        # The request runs before the file is opened, so a failed fetch
	        # never creates the destination file.
	        Path(dest).write_bytes(perform_curl_request(url))
	        print(f"✓ Downloaded: {Path(dest).name}")
	        succeeded = True
	    except Exception as err:
	        print(f"✗ Failed to download {Path(dest).name}: {err}")
	    return succeeded

	def iiif_download_batch(
	    batch_id: str,
	    start_index: int = 1,
	    end_index: Optional[int] = None,
	    max_workers: int = 10  # Increased from 5 for faster downloads
	) -> Optional[str]:
	    """
	    Download images from an IIIF batch using fast pycurl and return as zip.

	    The image IDs are read from the batch manifest, the requested 1-based
	    range is downloaded concurrently into a fresh temporary directory, and
	    the successfully downloaded files are packed into a zip archive in
	    that directory. Every failure is printed and converted into a ``None``
	    return value; this function does not raise. Temporary files are
	    removed on failure, and the loose image files are removed once the
	    zip has been written.

	    Args:
	        batch_id: The batch/manifest ID to download. Surrounding
	            whitespace is ignored.
	        start_index: Starting image number (1-based). ``None`` (for
	            example an emptied number field) and values below 1 mean
	            "start at the first image".
	        end_index: Ending image number (inclusive). None or 0 = download
	            through the last image.
	        max_workers: Number of concurrent downloads (default 10)

	    Returns:
	        Path to zip file for download or None if failed
	    """
	    print("\n=== Starting Fast IIIF Download ===")
	    print(f"Batch ID: {batch_id}")
	    print(f"Start index: {start_index}")
	    print(f"End index: {end_index}")
	    print(f"Workers: {max_workers}")

	    # Set once the temp directory exists so the error path can remove it.
	    temp_dir: Optional[Path] = None

	    try:
	        # Handle None, empty or whitespace-only batch_id gracefully
	        batch_id = str(batch_id).strip() if batch_id else ""
	        if not batch_id:
	            print("Warning: No batch ID provided")
	            return None

	        # Normalise the numeric inputs: they may arrive as None or floats.
	        start_index = 1 if start_index is None else int(start_index)
	        end_index = int(end_index) if end_index else None

	        all_image_ids = get_image_ids(batch_id)
	        total_images = len(all_image_ids)

	        start_idx = max(0, start_index - 1)
	        if end_index is None:
	            end_idx = total_images
	        else:
	            # Clamp to [0, total] so a negative value cannot turn into a
	            # "count from the end" slice.
	            end_idx = max(0, min(end_index, total_images))

	        image_ids = all_image_ids[start_idx:end_idx]

	        if not image_ids:
	            raise ValueError(f"No images in specified range {start_index}-{end_index}")

	        print(f"Downloading {len(image_ids)} images (range: {start_idx + 1}-{end_idx} of {total_images} total)")

	        # batch_id is user input: strip anything that could act as a path
	        # separator or parent reference before using it in file names.
	        safe_name = "".join(
	            ch if ch.isalnum() or ch in "-_" else "_" for ch in batch_id
	        )

	        temp_dir = Path(tempfile.mkdtemp())
	        batch_dir = temp_dir / safe_name
	        batch_dir.mkdir(exist_ok=True)
	        print(f"Temp directory: {temp_dir}")

	        # Download images concurrently with pycurl
	        def download_single(image_id: str):
	            url = f"{IIIF_URL}/arkis!{image_id}/full/max/0/default.jpg"
	            dest = batch_dir / f"{image_id}.jpg"
	            success = download_image_pycurl(url, str(dest))
	            return image_id, success

	        downloaded = []
	        failed = []

	        print(f"Starting concurrent downloads ({max_workers} workers)...")
	        with ThreadPoolExecutor(max_workers=max_workers) as executor:
	            # map() yields results in input order, so the zip keeps the
	            # manifest ordering.
	            for image_id, success in executor.map(download_single, image_ids):
	                (downloaded if success else failed).append(image_id)

	        if not downloaded:
	            print("No images were successfully downloaded")
	            shutil.rmtree(temp_dir, ignore_errors=True)
	            return None

	        # Create zip file; "_all" only when the whole batch was requested.
	        if end_index is None and start_idx == 0:
	            range_suffix = "_all"
	        else:
	            range_suffix = f"_{start_idx + 1}-{end_idx}"
	        zip_path = temp_dir / f"{safe_name}{range_suffix}.zip"

	        print(f"Creating zip file: {zip_path.name}")
	        with ZipFile(zip_path, 'w') as zipf:
	            for image_id in downloaded:
	                img_path = batch_dir / f"{image_id}.jpg"
	                if img_path.exists():
	                    zipf.write(img_path, arcname=f"{image_id}.jpg")

	        # The loose images are now inside the archive; keep only the zip.
	        shutil.rmtree(batch_dir, ignore_errors=True)

	        print(f"✓ Success! Downloaded {len(downloaded)}/{len(image_ids)} images")
	        if failed:
	            print(f"⚠ Failed downloads: {len(failed)} images")
	        print(f"Zip file created: {zip_path}")
	        print("=== Download Complete ===\n")

	        return str(zip_path)

	    except Exception as e:
	        # Top-level boundary for the UI: report, clean up, signal failure.
	        print(f"ERROR: {e}")
	        print("=== Download Failed ===\n")
	        if temp_dir is not None:
	            shutil.rmtree(temp_dir, ignore_errors=True)
	        return None


	# Assemble the Gradio UI: three input widgets feeding iiif_download_batch,
	# with the resulting zip offered as a file download.
	print("Creating Gradio interface...")

	_batch_id_box = gr.Textbox(
	    label="Batch ID",
	    placeholder="Enter 8-digit batch ID (e.g., C0000263)",
	    info="The IIIF manifest/batch identifier",
	)

	_first_image_field = gr.Number(
	    label="Start Image",
	    value=1,
	    minimum=1,
	    precision=0,
	    info="First image to download (1 = first image)",
	)

	_last_image_field = gr.Number(
	    label="End Image (Optional)",
	    value=None,
	    minimum=1,
	    precision=0,
	    info="Last image to download (leave empty for all)",
	)

	# Each row is (batch ID, start image, end image) for the same real batch.
	_example_rows = [
	    ["R0001210", 1, 10],   # Real batch - Download first 10
	    ["R0001210", 11, 20],  # Real batch - Download images 11-20
	    ["R0001210", 1, 50],   # Real batch - Download first 50
	]

	demo = gr.Interface(
	    fn=iiif_download_batch,
	    inputs=[_batch_id_box, _first_image_field, _last_image_field],
	    outputs=gr.File(label="Download Zip"),
	    title="Fast IIIF Batch Downloader (PycURL)",
	    description="High-speed downloader for Swedish National Archives IIIF manifests using optimized PycURL requests.",
	    examples=_example_rows,
	    cache_examples=False,
	    api_name="iiif_download_batch",
	)

	if __name__ == "__main__":
	    # Startup banner, one message per line.
	    for _banner_line in (
	        "Launching Gradio app with MCP server enabled...",
	        "Note: Make sure pycurl and certifi are installed:",
	        " pip install pycurl certifi",
	    ):
	        print(_banner_line)

	    # Launch settings gathered in one place, then passed as keyword arguments.
	    _launch_options = {
	        "mcp_server": True,
	        "share": False,
	        "debug": False,
	        "ssr_mode": False,
	    }
	    demo.launch(**_launch_options)