"""Generate one AI illustration per slide concept using OpenAI Images API.

Outputs: docs/slide_illustrations/slide-1.png ... slide-5.png
Requires: OPENAI_API_KEY in environment and internet access.
"""

import base64
import os
import sys
import time
from io import BytesIO
from pathlib import Path
from typing import Optional

# (slide index, slide title, image-generation prompt) for each illustration.
SLIDE_PROMPTS = [
    (
        1,
        "Consistent Output Control",
        (
            "Minimalist flat-design infographic showing a flow from a user message to an agent "
            "to a JSON schema validator producing a consistent structured JSON output. Use iconography only, no text. "
            "Elements: speech bubble -> robot head -> curly-brace JSON icon with checkmark badge. "
            "Style: clean, vector, blue/teal color palette, high contrast, centered composition."
        ),
    ),
    (
        2,
        "Input Control via Missing Fields",
        (
            "Modern UI concept art of a form with several fields, where required fields are highlighted and missing fields are flagged. "
            "An assistant bubble points to the missing fields to ask clarifying questions. No text labels. "
            "Style: product illustration, rounded cards, subtle shadows, blue/orange highlights, vector."
        ),
    ),
    (
        3,
        "Dynamic Input Schemas (KB & Internet)",
        (
            "Diagram of an agent deciding between two paths: a knowledge base database icon and an internet globe icon. "
            "Branched arrows from the agent to each tool, then back to a combined result. Iconography only. "
            "Style: sleek tech infographic, gradient accents, minimal lines, no words."
        ),
    ),
    (
        4,
        "Multistep Execution & Delegation",
        (
            "An orchestrator node delegating tasks to multiple sub-agents in sequence. "
            "Show numbered or visually ordered steps without using text: use small numbered badges or dots. "
            "Style: systems diagram, monochrome with one accent color, clean vectors, no text."
        ),
    ),
    (
        5,
        "API Calls (NCBI/Web Search)",
        (
            "Magnifying glass over a DNA helix icon next to a web globe, representing API calls to biomedical and web search. "
            "Arrows indicate request and response. Iconography only, no text. "
            "Style: scientific-tech aesthetic, cool tones, crisp vector illustration."
        ),
    ),
]


def ensure_api_key() -> str:
    """Return OPENAI_API_KEY from the environment, or exit(1) with guidance.

    NOTE(review): currently unused -- main() degrades to the Hugging Face
    fallback instead of exiting when the key is missing. Kept as a public
    helper for callers that require a hard failure.
    """
    key = os.getenv("OPENAI_API_KEY")
    if not key:
        print("ERROR: OPENAI_API_KEY not set in environment. Set it and rerun.")
        print("PowerShell example:")
        print(" $Env:OPENAI_API_KEY = \"sk-...\"")
        sys.exit(1)
    return key


def main() -> None:
    """Generate one PNG per entry in SLIDE_PROMPTS.

    Provider order per slide: OpenAI Responses API (image_generation tool),
    then DALL-E 3 via the Images API, then a Hugging Face text-to-image
    model. Slides with no successful provider are skipped. Images are
    written to <project_root>/docs/slide_illustrations/slide-<idx>.png.
    """
    # Initialize OpenAI client if available
    openai_enabled = False
    openai_client = None
    if os.getenv("OPENAI_API_KEY"):
        try:
            from openai import OpenAI  # type: ignore

            openai_client = OpenAI()
            openai_enabled = True
        except Exception as e:
            print("WARNING: OpenAI client unavailable; will try Hugging Face fallback.")
            print(f"Details: {e}\n")

    # Initialize Hugging Face client if available
    hf_enabled = False
    hf_client = None
    hf_model = os.getenv("HF_IMAGE_MODEL", "stabilityai/stable-diffusion-xl-base-1.0")
    if os.getenv("HUGGINGFACE_API_TOKEN"):
        try:
            from huggingface_hub import InferenceClient  # type: ignore

            hf_client = InferenceClient(token=os.getenv("HUGGINGFACE_API_TOKEN"))
            hf_enabled = True
        except Exception as e:
            print("WARNING: huggingface_hub not available. Install with: pip install huggingface_hub pillow")
            print(f"Details: {e}\n")

    # Script lives one level below the project root (e.g. scripts/).
    project_root = Path(__file__).resolve().parents[1]
    out_dir = project_root / "docs" / "slide_illustrations"
    out_dir.mkdir(parents=True, exist_ok=True)

    def enhance_prompt_with_gpt4(title: str, concept: str) -> str:
        """Use GPT-4o to expand the concept into a professional DALL-E 3 prompt.

        Falls back to the original concept text on any failure.
        """
        if not openai_enabled or openai_client is None:
            return concept  # fallback to original
        try:
            system_prompt = (
                "You are an expert at writing DALL-E 3 prompts for professional technical illustrations. "
                "Given a slide title and concept, expand it into a detailed, specific prompt that will produce "
                "a high-quality, structured infographic-style illustration. Focus on: clean composition, professional "
                "design, iconography without text labels, consistent color palette, and visual hierarchy. "
                "Return ONLY the enhanced prompt, no explanations."
            )
            user_prompt = f"Slide title: {title}\n\nConcept: {concept}\n\nEnhanced DALL-E 3 prompt:"
            resp = openai_client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.7,
                max_tokens=300,
            )
            enhanced = resp.choices[0].message.content or concept
            return enhanced.strip()
        except Exception as e:
            print(f" Prompt enhancement failed, using original: {e}")
            return concept

    def try_openai_responses_api(prompt: str) -> Optional[bytes]:
        """Generate via the Responses API's image_generation tool.

        Returns decoded PNG bytes, or None on failure or when no image
        output is present in the response.
        """
        if not openai_enabled or openai_client is None:
            return None
        try:
            response = openai_client.responses.create(
                model="gpt-4.1-mini",  # Use a model that supports image generation
                input=prompt,
                tools=[{"type": "image_generation"}],
            )
            # Extract base64 image payloads from the tool-call outputs.
            image_data = [
                output.result
                for output in response.output
                if output.type == "image_generation_call"
            ]
            if image_data:
                return base64.b64decode(image_data[0])
            return None
        except Exception as e:
            print(f" Responses API (gpt-image-1) failed: {e}")
            return None

    def try_openai(prompt: str) -> Optional[bytes]:
        """Fallback: DALL-E 3 via Images API in landscape mode."""
        if not openai_enabled or openai_client is None:
            return None
        try:
            resp = openai_client.images.generate(
                model="dall-e-3",
                prompt=prompt,
                size="1792x1024",  # Landscape for slides
                quality="hd",
                response_format="b64_json",
            )
            b64 = resp.data[0].b64_json
            if not b64:
                return None
            return base64.b64decode(b64)
        except Exception as e:
            print(f" DALL-E 3 generation failed: {e}")
            return None

    def try_hf(prompt: str) -> Optional[bytes]:
        """Last-resort fallback: Hugging Face Inference text-to-image.

        Returns PNG bytes, or None when HF is unavailable or generation fails.
        """
        if not hf_enabled or hf_client is None:
            return None
        try:
            # text_to_image returns a PIL.Image.Image
            img = hf_client.text_to_image(prompt=prompt, model=hf_model)
            bio = BytesIO()
            img = img.convert("RGB")  # ensure 3-channel
            img.save(bio, format="PNG")
            return bio.getvalue()
        except Exception as e:
            print(f" Hugging Face generation failed: {e}")
            return None

    for idx, title, prompt in SLIDE_PROMPTS:
        print(f"Generating slide {idx}: {title}")

        # Step 1: Enhance prompt with GPT-4o (mimics ChatGPT's internal process)
        print(" Enhancing prompt with GPT-4o...")
        enhanced_prompt = enhance_prompt_with_gpt4(title, prompt)
        print(f" Enhanced prompt: {enhanced_prompt[:100]}...")

        img_bytes: Optional[bytes] = None

        # Try newest model first (Responses API with gpt-image-1)
        if openai_enabled:
            img_bytes = try_openai_responses_api(enhanced_prompt)

        # Fallback to DALL-E 3 landscape if Responses API unavailable
        if img_bytes is None and openai_enabled:
            print(" Falling back to DALL-E 3 (landscape)...")
            img_bytes = try_openai(enhanced_prompt)

        # Fallback to Hugging Face
        if img_bytes is None and hf_enabled:
            img_bytes = try_hf(enhanced_prompt)

        if img_bytes is None:
            print(
                " Skipped: No image generated. Ensure either OPENAI_API_KEY (with access to gpt-image-1) "
                "or HUGGINGFACE_API_TOKEN is set."
            )
            continue

        out_path = out_dir / f"slide-{idx}.png"
        out_path.write_bytes(img_bytes)
        print(f"Saved {out_path}")
        time.sleep(0.75)  # gentle pacing between provider calls

    print(f"Done. Illustrations saved to: {out_dir}")
    if not openai_enabled:
        print("Note: OPENAI_API_KEY not set or OpenAI client unavailable.")
    if not hf_enabled:
        print("Note: HUGGINGFACE_API_TOKEN not set or huggingface_hub unavailable.")


if __name__ == "__main__":
    main()