mcpOptimizer / src /engine.py
anouar-bm's picture
ai
498af49
raw
history blame
4.27 kB
import re
import spacy
import tiktoken
from lemminflect import getLemma
class AdvancedPromptOptimizer:
    """Shrink a prompt's token count with regex rules plus spaCy-based filtering.

    Spans that must survive verbatim (fenced code, inline code, URLs and a few
    comparator phrases) are swapped for ``<KIND<n>>`` placeholders before any
    rewriting happens and are restored at the very end.
    """

    # Matches the placeholders produced by ``_mask_spans``.  The capturing
    # group makes ``re.split`` keep the placeholders in its result.
    _PLACEHOLDER_RE = re.compile(r"(<(?:CODE|IC|URL|CMP)\d+>)")

    # Section labels that should keep their trailing colon.
    _LABELS = frozenset({"deliverables", "constraints", "metrics"})

    # (pattern, replacement, minimum aggressiveness) - applied in this order,
    # safest rules first.
    _RULES = (
        (r"\s{2,}", " ", 0.0),
        (r"\b(\w+)\s+\1\b", r"\1", 0.0),
        (r"\b(advantages and disadvantages)\b", "pros/cons", 0.5),
        (r"\b(in a detailed manner|in a detailed way)\b", "", 0.7),
        (r"\b(I want to|I need to|I would like to)\b", "", 0.7),
        # "i.e." means "that is", not "for example", so it is no longer
        # rewritten to "e.g."; rewriting "e.g." to itself was a no-op.
        (r"\b(for example|such as)\b", "e.g.", 0.8),
        (r"\b(please\s+)?(kindly\s+)?(carefully|very|extremely|really|quite)\b", "", 0.8),
        (r"\b(can you|could you|would you)\b", "", 0.9),
        (r"\b(output|provide|give|return)\s+in\s+(JSON|json)\s+format\b", "JSON:", 1.0),
    )

    def __init__(self):
        """Load the spaCy pipeline and the tiktoken encoding used for counting."""
        # For NER, consider using en_core_web_md for better accuracy
        self.nlp = spacy.load("en_core_web_sm")
        # Negations carry meaning, so they are removed from the stop-word set.
        # NOTE(review): nothing else in this class consults stop words.
        self.nlp.Defaults.stop_words -= {"not", "no", "never"}
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.negation_words = {"not", "no", "never", "without", "except"}

    def _mask_spans(self, s: str) -> tuple:
        """Replace protected spans in *s* with unique placeholders.

        Returns:
            ``(masked_text, masks)`` where ``masks`` maps each placeholder to
            the exact text it replaced, in insertion order.
        """
        masks = {}

        def stash(prefix):
            # Build a re.sub callback that records the match and returns the
            # placeholder (the previous lambda returned the matched text, so
            # nothing was ever masked).
            def _replace(match):
                key = f"<{prefix}{len(masks)}>"
                masks[key] = match.group(0)
                return key

            return _replace

        # triple backticks
        s = re.sub(r"```.*?```", stash("CODE"), s, flags=re.S)
        # inline code
        s = re.sub(r"`[^`]+`", stash("IC"), s)
        # urls
        s = re.sub(r"https?://\S+", stash("URL"), s)
        # comparators
        s = re.sub(r"\b(less than|at least|no more than)\b", stash("CMP"), s, flags=re.I)
        return s, masks

    def _unmask_spans(self, s: str, masks: dict) -> str:
        """Put the original text back in place of every placeholder in *s*."""
        # Restore newest-first: a later mask (e.g. a URL) may have swallowed an
        # earlier placeholder (e.g. inline code), which must be expanded after
        # the outer one reappears.
        for placeholder, original in reversed(list(masks.items())):
            s = s.replace(placeholder, original)
        return s

    def optimize(self, prompt: str, aggressiveness: float = 0.7) -> tuple:
        """Optimize *prompt* and report token counts.

        Args:
            prompt: Text to shorten.
            aggressiveness: Threshold in the rule tables; higher values enable
                more rewrites.

        Returns:
            ``(optimized_text, original_token_count, optimized_token_count)``.
        """
        masked_prompt, masks = self._mask_spans(prompt)
        optimized = self._apply_rules(masked_prompt, aggressiveness)
        optimized = self._linguistic_optimize(optimized, aggressiveness)
        # Collapse whitespace BEFORE unmasking so that newlines and spacing
        # inside protected spans (code blocks in particular) stay intact.
        optimized = re.sub(r"\s+", " ", optimized).strip()
        optimized = self._unmask_spans(optimized, masks)
        try:
            orig_tokens = len(self.tokenizer.encode(prompt))
            new_tokens = len(self.tokenizer.encode(optimized))
        except Exception:
            # Best-effort fallback: approximate tokens by whitespace words.
            orig_tokens = len(prompt.split())
            new_tokens = len(optimized.split())
        return optimized, orig_tokens, new_tokens

    def _apply_rules(self, text: str, aggressiveness: float) -> str:
        """Apply every regex rule whose threshold is <= *aggressiveness*."""
        for pattern, repl, priority in self._RULES:
            if aggressiveness >= priority:
                text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
        return text

    def _linguistic_optimize(self, text: str, aggressiveness: float) -> str:
        """Drop low-information tokens from *text* using the spaCy pipeline.

        Placeholders produced by ``_mask_spans`` are passed through untouched;
        only the text between them is analysed.  Text without placeholders is
        analysed as a single document.
        """
        if not text.strip():
            return text
        out = []
        for index, piece in enumerate(self._PLACEHOLDER_RE.split(text)):
            if index % 2:
                # Odd entries are the captured placeholders themselves.
                out.append(piece)
            elif piece.strip():
                out.extend(self._filter_tokens(self.nlp(piece), aggressiveness))
        return " ".join(out)

    def _filter_tokens(self, doc, aggressiveness: float) -> list:
        """Return the token texts of *doc* that survive filtering."""
        kept = []
        for idx, token in enumerate(doc):
            lowered = token.text.lower()
            # Guard important labels
            if token.is_sent_start:
                if lowered in ("deliverables:", "constraints:", "metrics:"):
                    kept.append(token.text)
                    continue
                # spaCy's tokenizer normally splits the trailing colon into its
                # own token; re-attach it so the label keeps its colon.
                if (
                    lowered in self._LABELS
                    and idx + 1 < len(doc)
                    and doc[idx + 1].text == ":"
                ):
                    kept.append(token.text + ":")
                    continue
            if token.pos_ in ("PUNCT", "SPACE"):
                continue
            if (
                token.like_num
                or token.ent_type_
                or token.dep_ == "neg"
                or lowered in self.negation_words
            ):
                kept.append(token.text)
                continue
            if token.pos_ in ("PROPN", "NUM", "NOUN", "ADJ"):
                kept.append(token.text)
                continue
            if token.pos_ == "VERB":
                if aggressiveness >= 0.8:
                    # Fall back to spaCy's lemma when lemminflect has none.
                    lemma = getLemma(token.text, upos="VERB") or [token.lemma_]
                    kept.append(lemma[0])
                else:
                    kept.append(token.text)
                continue
            if token.pos_ in ("ADV", "DET", "PRON"):
                if aggressiveness < 0.6:
                    kept.append(token.text)
                # else drop
                continue
            kept.append(token.text)
        return kept