Code-Test-generator / pdf_utils.py
Jaiwincr7
FINAL FIX: Removed redundant 'bytes()' conversion in app.py to correct PDF data type mismatch.
112effb
raw
history blame
4.47 kB
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
from fpdf import FPDF
import re
# --- MODEL SETUP ---
model_id = "deepseek-ai/deepseek-coder-1.3b-instruct"

# Load the tokenizer and model once, when this module is first imported, so
# that every later call reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.float16,
    device_map="auto",
    offload_folder="./offload",
)

# Text-generation settings.
# NOTE(review): these used to be passed as HuggingFacePipeline(model_kwargs=...).
# With an already-built pipeline that field appears to be used only for model
# loading (from_model_id), not for generation, so the settings never took
# effect. Binding them to the Transformers pipeline itself makes them the
# defaults of every call regardless of the LangChain wrapper version.
generation_kwargs = {
    "max_new_tokens": 4096,
    "do_sample": True,
    "temperature": 0.2,
    "repetition_penalty": 1.05,
    "eos_token_id": tokenizer.eos_token_id,
}

# Build the Transformers pipeline and wrap it for use in LangChain chains.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_kwargs,
)
llm = HuggingFacePipeline(pipeline=pipe)
# Prompt used by test_case(): a system message that pins down the output
# format, followed by a user message that carries the code under test through
# the {code} placeholder.
_SYSTEM_RULES = """You are an expert QA engineer.
STRICTLY follow these rules for your output:
- Generate EXACTLY 10 numbered test cases (1–5 functional, 6–10 edge cases).
- Output ONLY the numbered list.
- DO NOT include explanations, headers, filler text, or markdown.
- Each test MUST be a single, concise sentence.
- Begin your response immediately with '1. '"""

_USER_REQUEST = "Generate test cases for the following code:\n{code}"

test_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _SYSTEM_RULES),
        ("user", _USER_REQUEST),
    ]
)
# --- TEST CASE GENERATION FUNCTION ---
def _latin1_safe(text):
    """Return *text* with every character outside Latin-1 replaced by '?'."""
    return text.encode("latin-1", "replace").decode("latin-1")


def _pdf_to_bytes(pdf):
    """Serialize *pdf* to ``bytes``.

    ``output(dest='S')`` yields a ``str`` under legacy PyFPDF and a
    ``bytearray`` under fpdf2; both are normalized to ``bytes`` here.
    """
    rendered = pdf.output(dest='S')
    if isinstance(rendered, str):
        # Legacy PyFPDF: the document is a Latin-1 string.
        return rendered.encode('latin-1', 'replace')
    return bytes(rendered)


def _error_pdf_bytes(message):
    """Build a one-page PDF containing *message*, using a core font only."""
    error_pdf = FPDF()
    error_pdf.add_page()
    error_pdf.set_font("Arial", size=12)
    # Core fonts cannot encode characters outside Latin-1.
    error_pdf.multi_cell(0, 10, txt=_latin1_safe(message))
    return _pdf_to_bytes(error_pdf)


def test_case(code):
    """Generate test cases for *code* with the LLM and return them as a PDF.

    The prompt/LLM chain is invoked with the given source code, fenced
    markdown blocks and stray fence markers are removed from the reply, and
    the remaining text is rendered into a single-page-flow PDF.

    Args:
        code: Source code (string) to generate test cases for.

    Returns:
        The PDF document as ``bytes``. If rendering fails or produces a
        suspiciously small document, a PDF carrying an error message is
        returned instead.
    """
    test_chain = test_prompt | llm | StrOutputParser()
    test_cases = test_chain.invoke({"code": code})
    print("\n[LOG] 1. LLM Raw Output Length:", len(test_cases))

    # Aggressive cleaning: drop fenced markdown blocks, then any stray fences.
    test_cases = re.sub(r"```.*?```", "", test_cases, flags=re.DOTALL)
    test_cases = re.sub(r"```", "", test_cases)
    test_cases = test_cases.strip()

    # Guardrail: if nothing is left, put a known message into the PDF.
    if not test_cases:
        test_cases = "Error: Test case generation failed or returned empty content."
    print("\n[LOG] 2. Cleaned Text (for PDF):", test_cases)

    try:
        pdf = FPDF()
        pdf.add_page()
        try:
            # Unicode-capable font; requires the 'fonts-dejavu' package
            # to be installed in the image.
            pdf.add_font("DejaVuSans", style="", fname="/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf")
            pdf.set_font("DejaVuSans", size=12)
        except Exception as e:
            print(f"[LOG] FPDF Font Error: {e}. Falling back to Arial.")
            pdf.set_font("Arial", size=12)
            # The core font only covers Latin-1; unsupported characters would
            # otherwise raise while the text is written.
            test_cases = _latin1_safe(test_cases)

        pdf.multi_cell(0, 10, txt="--- Generated Test Cases ---", align='C')
        # fpdf2 leaves the cursor at the right edge after multi_cell, which
        # leaves no width for the next full-width cell; return to the margin.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, txt=test_cases)
        pdf_bytes = _pdf_to_bytes(pdf)
    except Exception as e:
        print(f"[FATAL LOG] PDF output failed with error: {e}")
        return _error_pdf_bytes(f"FATAL ERROR: PDF generation crashed. Reason: {e}")

    print("\n[LOG] 3. PDF Bytes Length:", len(pdf_bytes))
    # Anything this small cannot be a valid document with content.
    if len(pdf_bytes) < 100:
        print("[CRITICAL LOG] FPDF generated very small file, likely failed.")
        return _error_pdf_bytes("ERROR: PDF generation failed to create content. Check logs.")
    return pdf_bytes