orgoflu committed on
Commit
0b37765
·
verified ·
1 Parent(s): ab4dfe6
Files changed (1) hide show
  1. app.py +70 -32
app.py CHANGED
@@ -54,67 +54,105 @@ def chunk_by_tokens(sentences, max_tokens=900):
54
  chunks.append(" ".join(cur))
55
  return chunks
56
 
 
 
 
 
 
 
 
57
  # ===== ์š”์•ฝ =====
58
def summarize_raw(text: str, min_len: int, max_len: int) -> str:
    """Summarize *text* with the module-level KoT5 model using beam search.

    The prompt is encoded with truncation to the encoder's 1024-token
    limit, generation runs with 4 beams inside a no-grad context, and
    the best sequence is decoded without special tokens.
    """
    encoded = tokenizer.encode(
        text, return_tensors="pt", truncation=True, max_length=1024
    )
    generation_options = dict(
        num_beams=4,
        min_length=min_len,
        max_length=max_len,
        early_stopping=True,
    )
    # Inference only — no gradients needed.
    with torch.no_grad():
        generated = model.generate(encoded, **generation_options)
    # generate() returns a batch; only the first (best) beam is decoded.
    return tokenizer.decode(generated[0], skip_special_tokens=True)
69
 
70
def apply_style_prompt(text: str, mode: str, final: bool = False) -> str:
    """Prefix *text* with a Korean instruction matching the summary *mode*.

    Any mode other than "concise"/"explanatory" falls back to the bullet
    instruction. When *final* is true (the merge pass of a chunked run),
    an extra clause asking to keep the original order is appended.
    """
    instructions = {
        "concise": "๋‹ค์Œ ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ๋ฅผ ํ•ต์‹ฌ๋งŒ ๊ฐ„๊ฒฐํ•˜๊ฒŒ ์š”์•ฝํ•˜์„ธ์š”.",
        "explanatory": "๋‹ค์Œ ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ๋ฅผ ๋งฅ๋ฝ์„ ๋ณด์กดํ•˜๋ฉฐ ์ดํ•ดํ•˜๊ธฐ ์‰ฝ๊ฒŒ ์š”์•ฝํ•˜์„ธ์š”.",
    }
    inst = instructions.get(mode, "๋‹ค์Œ ํ•œ๊ตญ์–ด ํ…์ŠคํŠธ๋ฅผ bullet ํ˜•ํƒœ๋กœ ํ•ต์‹ฌ๋งŒ ์š”์•ฝํ•˜์„ธ์š”.")
    if final:
        inst = inst + " ์›๋ž˜ ์ˆœ์„œ๋ฅผ ์œ ์ง€ํ•˜๋ฉฐ ๋ฌธ์žฅ ์—ฐ๊ฒฐ์„ ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ํ•˜์„ธ์š”."
    return f"{inst}\n\n{text}"
80
 
81
def postprocess(summary: str, mode: str) -> str:
    """Normalize whitespace; in "bullets" mode, reformat as a dash list.

    Bullet items are taken from existing bullet separators when two or
    more are present, otherwise from sentence boundaries.
    """
    s = re.sub(r"\s+", " ", summary.strip())
    if mode != "bullets":
        return s
    # Prefer splitting on bullet markers already present in the text.
    pieces = [b.strip() for b in re.split(r"\s*[-โ€ข]\s*", s) if b.strip()]
    if len(pieces) <= 1:
        # No usable markers — fall back to sentence-level splitting.
        pieces = [p.strip() for p in re.split(r"(?<=[\.!?])\s+", s) if p.strip()]
    return "\n".join(f"- {b}" for b in pieces)
94
 
95
def summarize_long(text: str, target_chars: int, mode: str):
    """Summarize *text*; inputs over ~1000 tokens are chunked and merged.

    Short inputs get one summarization pass; long inputs are split into
    ~900-token chunks, each summarized, then the concatenated partials
    are summarized once more with the order-preserving final prompt.
    """
    def length_bounds(chars, lo_frac, hi_frac, lo_floor, hi_floor):
        # chars → token heuristic: roughly 2 characters per token,
        # hence the division by 2; floors keep budgets sane.
        return (max(lo_floor, int(chars * lo_frac / 2)),
                max(hi_floor, int(chars * hi_frac / 2)))

    text = normalize_text(text)
    if not text:
        return "โš ๏ธ ์š”์•ฝํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”."

    # Single-pass path: the whole input fits the model.
    if token_length(text) <= 1000:
        lo, hi = length_bounds(target_chars, 0.4, 0.8, 60, 120)
        return postprocess(
            summarize_raw(apply_style_prompt(text, mode), lo, hi), mode
        )

    # Map step: summarize each chunk with a per-chunk character budget.
    chunks = chunk_by_tokens(split_into_sentences(text), max_tokens=900)
    per_chunk = max(250, int(target_chars * 1.5) // max(1, len(chunks)))
    lo, hi = length_bounds(per_chunk, 0.4, 0.9, 50, 100)
    partials = [
        summarize_raw(apply_style_prompt(c, mode), lo, hi) for c in chunks
    ]

    # Reduce step: summarize the merged partials with the final prompt.
    merged = normalize_text(" ".join(partials))
    lo, hi = length_bounds(target_chars, 0.45, 1.05, 80, 160)
    return postprocess(
        summarize_raw(apply_style_prompt(merged, mode, final=True), lo, hi),
        mode,
    )
 
 
118
 
119
  # ===== Gradio UI =====
120
  def ui_summarize(text, target_len, style):
@@ -122,7 +160,7 @@ def ui_summarize(text, target_len, style):
122
  return summarize_long(text, int(target_len), mode)
123
 
124
  with gr.Blocks() as demo:
125
- gr.Markdown("## ๐Ÿ“ KoT5 ํ•œ๊ตญ์–ด ์š”์•ฝ๊ธฐ (๊ธด ๋ฌธ์„œ ์ž๋™ ๋ถ„ํ•  + ์ˆœ์„œ ๋ณด์กด)")
126
  with gr.Row():
127
  with gr.Column():
128
  input_text = gr.Textbox(label="์›๋ฌธ ์ž…๋ ฅ", lines=16)
 
54
  chunks.append(" ".join(cur))
55
  return chunks
56
 
57
+ # ===== ๋ฐ˜๋ณต ์ œ๊ฑฐ =====
58
def derpeat(text: str) -> str:
    """Collapse degenerate repetitions in model output.

    Three passes:
      1. any single character repeated 3+ times is reduced to 2
      2. a whitespace-separated immediate word repeat is kept once
      3. runs of the punctuation . ! ? - ~ longer than 2 are reduced to 2

    Returns the cleaned, stripped string.
    """
    # 1) "sooooo" -> "soo" (also collapses long whitespace runs).
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    # 2) "foo foo foo" -> "foo". BUGFIX: the backreference needs a trailing
    #    \b, otherwise a word that merely prefixes the next one is eaten
    #    (the original turned "the theory" into "theory").
    text = re.sub(r'(\b\w+\b)(?:\s+\1\b)+', r'\1', text)
    # 3) "!!!" / "..." etc. -> at most two marks.
    text = re.sub(r'([\.!?\-~])\1{2,}', r'\1\1', text)
    return text.strip()
63
+
64
  # ===== ์š”์•ฝ =====
65
def approx_tokens_from_chars(n_chars: int) -> int:
    """Estimate a token count from a character count.

    Uses the rough Korean heuristic of ~2 characters per token and
    never returns less than 1, so generation budgets stay positive.
    """
    estimate = n_chars // 2
    return estimate if estimate > 0 else 1
67
+
68
def summarize_raw_t5(input_text: str, target_chars: int, input_tokens: int) -> str:
    """Generate a sampled summary of *input_text* with the global KoT5 model.

    The new-token budget is derived from *target_chars*, capped so the
    summary cannot be asked to exceed ~90% of the input length (floor 120
    chars) nor 300 tokens, then tightened further for short inputs.
    Top-p sampling plus n-gram blocking and a repetition penalty are
    used to suppress degenerate loops.

    NOTE(review): do_sample=True makes output non-deterministic between
    calls, and early_stopping only affects beam search (num_beams=1
    here) — confirm both are intended.
    """
    # Character budget: never much longer than the input itself.
    capped_chars = min(target_chars, max(120, int(len(input_text) * 0.9)))
    max_new = max(40, min(approx_tokens_from_chars(capped_chars), 300))
    # Shorter inputs get a proportionally smaller token budget.
    if input_tokens <= 200:
        max_new = min(max_new, max(40, int(input_tokens * 0.6)))
    if input_tokens <= 60:
        max_new = min(max_new, 60)

    encoded = tokenizer.encode(
        input_text, return_tensors="pt", truncation=True, max_length=1024
    )
    with torch.no_grad():
        outputs = model.generate(
            encoded,
            max_new_tokens=max_new,
            do_sample=True,
            top_p=0.92,
            temperature=0.7,
            num_beams=1,
            no_repeat_ngram_size=4,
            encoder_no_repeat_ngram_size=4,
            repetition_penalty=1.2,
            renormalize_logits=True,
            early_stopping=True,
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
94
 
95
def apply_style_prompt_t5(text: str, mode: str, final: bool = False) -> str:
    """Build the T5 prompt: a short Korean task tag, newline, then *text*.

    Any mode other than "concise"/"explanatory" falls back to the bullet
    tag. When *final* is true (the merge pass), a parenthesized guide
    asking to keep order and remove duplicates is appended to the tag.
    """
    tags = {
        "concise": "๊ฐ„๊ฒฐ ์š”์•ฝ:",
        "explanatory": "์„ค๋ช… ์š”์•ฝ:",
    }
    tag = tags.get(mode, "๋ถˆ๋ฆฟ ์š”์•ฝ:")
    guide = " (์›๋ž˜ ๋ฌธ์„œ์˜ ์ˆœ์„œ๋ฅผ ์œ ์ง€ํ•˜๊ณ  ์ค‘๋ณต์„ ์ œ๊ฑฐํ•˜์„ธ์š”.)" if final else ""
    return f"{tag}{guide}\n{text}"
106
 
107
def postprocess_strict(summary: str, mode: str) -> str:
    """Whitespace-normalize, de-repeat, and drop duplicate sentences.

    Sentences are split on ./!/? boundaries and deduplicated exactly,
    preserving first-occurrence order. In "bullets" mode the surviving
    sentences (at most 12) are rendered as a dash list; otherwise they
    are re-joined with single spaces.
    """
    collapsed = derpeat(re.sub(r"\s+", " ", summary.strip()))
    unique_sentences = []
    seen = set()
    for candidate in re.split(r"(?<=[\.!?])\s+", collapsed):
        candidate = candidate.strip()
        if candidate and candidate not in seen:
            seen.add(candidate)
            unique_sentences.append(candidate)
    if mode == "bullets":
        # Cap the list at 12 bullets to keep the UI readable.
        return "\n".join(f"- {line}" for line in unique_sentences[:12])
    return " ".join(unique_sentences)
122
 
123
def summarize_long(text: str, target_chars: int, mode: str):
    """Summarize *text*; long inputs are chunked, summarized, and merged.

    Inputs that fit the encoder (≤1000 tokens) get one pass — very short
    inputs (≤60 tokens) with the character budget capped at 300. Longer
    inputs are split into ~900-token chunks, summarized independently,
    and the cleaned concatenation is summarized once more with the
    order-preserving final prompt.
    """
    text = normalize_text(text)
    if not text:
        return "โš ๏ธ ์š”์•ฝํ•  ํ…์ŠคํŠธ๋ฅผ ์ž…๋ ฅํ•˜์„ธ์š”."

    n_tokens = token_length(text)

    # Single-pass path.
    if n_tokens <= 1000:
        budget = min(target_chars, 300) if n_tokens <= 60 else target_chars
        single = summarize_raw_t5(
            apply_style_prompt_t5(text, mode, final=False), budget, n_tokens
        )
        return postprocess_strict(single, mode)

    # Map step: summarize each ~900-token chunk independently.
    chunks = chunk_by_tokens(split_into_sentences(text), max_tokens=900)
    per_chunk_chars = max(180, int(target_chars * 1.2 / max(1, len(chunks))))
    partials = [
        summarize_raw_t5(
            apply_style_prompt_t5(chunk, mode, final=False),
            per_chunk_chars,
            token_length(chunk),
        )
        for chunk in chunks
    ]

    # Reduce step: clean the concatenation, then summarize once more
    # with the "keep order / drop duplicates" guide.
    merged = derpeat(normalize_text(" ".join(partials)))
    final_summary = summarize_raw_t5(
        apply_style_prompt_t5(merged, mode, final=True),
        target_chars,
        token_length(merged),
    )
    return postprocess_strict(final_summary, mode)
156
 
157
  # ===== Gradio UI =====
158
  def ui_summarize(text, target_len, style):
 
160
  return summarize_long(text, int(target_len), mode)
161
 
162
  with gr.Blocks() as demo:
163
+ gr.Markdown("## ๐Ÿ“ KoT5 ํ•œ๊ตญ์–ด ์š”์•ฝ๊ธฐ (๋ฐ˜๋ณต ์–ต์ œ + ์ˆœ์„œ ๋ณด์กด)")
164
  with gr.Row():
165
  with gr.Column():
166
  input_text = gr.Textbox(label="์›๋ฌธ ์ž…๋ ฅ", lines=16)