# import gradio as gr # from transformers import ( # Blip2Processor, # Blip2ForConditionalGeneration, # AutoTokenizer, # AutoModelForCausalLM, # ) # from PIL import Image # import torch # # Set device # device = "cuda" if torch.cuda.is_available() else "cpu" # # Load image captioning model (BLIP-2) # processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") # blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").to(device) # # Load text generation model (LLM) # llm_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") # llm_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2").to(device) # # Step 1: Generate image caption # def extract_caption(image): # inputs = processor(images=image, return_tensors="pt").to(device) # outputs = blip_model.generate(**inputs, max_new_tokens=50) # caption = processor.tokenizer.decode(outputs[0], skip_special_tokens=True) # return caption # # Step 2: Build fairytale prompt # def build_prompt(caption): # return ( # f"Based on the image description: \"{caption}\", write a children's fairytale.\n" # "The story must:\n" # "- Start with 'Once upon a time'\n" # "- Be at least 10 full sentences long\n" # "- Include named characters, a clear setting, emotions, a challenge, and a resolution\n" # "- Avoid mentions of babies or unrelated royalty unless relevant\n" # "Here is the story:\nOnce upon a time" # ) # # Step 3: Generate story # def generate_fairytale(image): # caption = extract_caption(image) # prompt = build_prompt(caption) # inputs = llm_tokenizer(prompt, return_tensors="pt").to(device) # output = llm_model.generate( # **inputs, # max_new_tokens=500, # do_sample=True, # temperature=0.9, # top_p=0.95, # pad_token_id=llm_tokenizer.eos_token_id # ) # result = llm_tokenizer.decode(output[0], skip_special_tokens=True) # # Trim to only the story # if "Once upon a time" in result: # return "Once upon a time" + result.split("Once upon a time", 1)[-1].strip() # else: # return f"โš ๏ธ Failed to generate story.\n\n[Prompt]\n{prompt}\n\n[Output]\n{result}" # # Gradio interface # with gr.Blocks() as demo: # gr.Markdown("## ๐Ÿ“– AI Fairytale Generator\nUpload an image and get a magical story!") # with gr.Row(): # image_input = gr.Image(type="pil", label="Upload an image") # with gr.Row(): # generate_button = gr.Button("โœจ Generate Fairytale") # with gr.Row(): # output_text = gr.Textbox(label="Generated Story", lines=20) # generate_button.click(fn=generate_fairytale, inputs=[image_input], outputs=[output_text]) # if __name__ == "__main__": # demo.launch(share=True) import gradio as gr from transformers import ( Blip2Processor, Blip2ForConditionalGeneration, AutoTokenizer, AutoModelForCausalLM, ) from PIL import Image import torch # ๋””๋ฐ”์ด์Šค ์„ค์ • (Mistral์€ CPU๋กœ ๊ฐ•์ œ ์„ค์ •) device = "cpu" # 1๏ธโƒฃ ์ด๋ฏธ์ง€ ์„ค๋ช… ์ƒ์„ฑ ๋ชจ๋ธ ๋กœ๋“œ (BLIP-2) processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl") blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl").to(device) # 2๏ธโƒฃ ๋™ํ™” ์ƒ์„ฑ ๋ชจ๋ธ ๋กœ๋“œ (Mistral-7B, CPU) llm_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") llm_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device_map="cpu") # 3๏ธโƒฃ ์ด๋ฏธ์ง€ โ†’ ์„ค๋ช… def extract_caption(image): inputs = processor(images=image, return_tensors="pt").to(device) outputs = blip_model.generate(**inputs, max_new_tokens=50) caption = processor.tokenizer.decode(outputs[0], skip_special_tokens=True).strip() return caption # 4๏ธโƒฃ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ def build_prompt(caption): return ( f"Based on the image description: \"{caption}\", write a children's fairytale.\n" "The story must:\n" "- Start with 'Once upon a time'\n" "- Be at least 10 full sentences long\n" "- Include named characters, a clear setting, emotions, a challenge, and a resolution\n" "- Avoid unrelated royalty or babies unless relevant\n" "Here is the story:\nOnce upon a time" ) # 5๏ธโƒฃ ์ „์ฒด ๋™ํ™” ์ƒ์„ฑ def generate_fairytale(image): caption = extract_caption(image) prompt = build_prompt(caption) inputs = llm_tokenizer(prompt, return_tensors="pt").to(device) output = llm_model.generate( **inputs, max_new_tokens=500, do_sample=True, temperature=0.9, top_p=0.95, pad_token_id=llm_tokenizer.eos_token_id ) result = llm_tokenizer.decode(output[0], skip_special_tokens=True) # "Once upon a time" ์ดํ›„๋งŒ ์ถ”์ถœ if "Once upon a time" in result: return caption, "Once upon a time" + result.split("Once upon a time", 1)[-1].strip() else: return caption, f"โš ๏ธ Story generation failed.\n\n[Prompt]\n{prompt}\n\n[Output]\n{result}" # 6๏ธโƒฃ Gradio UI with gr.Blocks() as demo: gr.Markdown("## ๐Ÿงš AI Fairytale Generator (Mistral CPU ver.)") gr.Markdown("Upload an image and receive a magical children's fairytale based on it โœจ") image_input = gr.Image(type="pil", label="๐Ÿ–ผ๏ธ Upload an Image") generate_button = gr.Button("โœจ Generate Fairytale") caption_output = gr.Textbox(label="๐Ÿ“Œ Image Description", lines=2) story_output = gr.Textbox(label="๐Ÿ“– Generated Fairytale", lines=20) generate_button.click(fn=generate_fairytale, inputs=[image_input], outputs=[caption_output, story_output]) if __name__ == "__main__": demo.launch(share=True)