import re

import spacy
import tiktoken
from lemminflect import getLemma
class AdvancedPromptOptimizer:
    def __init__(self):
        # For NER, consider using en_core_web_md for better accuracy
        self.nlp = spacy.load("en_core_web_sm")
        # Clear the stop-word flag on negations via the vocab; mutating
        # Defaults.stop_words after load does not update lexeme flags.
        for word in ("not", "no", "never"):
            self.nlp.vocab[word].is_stop = False
        self.tokenizer = tiktoken.get_encoding("cl100k_base")
        self.negation_words = {"not", "no", "never", "without", "except"}
    def _mask_spans(self, s):
        """Replace spans that must survive optimization with unique placeholders."""
        masks = {}

        def masker(prefix):
            # Placeholders are plain alphanumeric tokens (e.g. CODEMASK0) so
            # spaCy keeps them intact instead of splitting them on punctuation.
            def repl(m):
                key = f"{prefix}MASK{len(masks)}"
                masks[key] = m.group(0)
                return key
            return repl

        s = re.sub(r"```.*?```", masker("CODE"), s, flags=re.S)  # triple backticks
        s = re.sub(r"`[^`]+`", masker("IC"), s)                  # inline code
        s = re.sub(r"https?://\S+", masker("URL"), s)            # urls
        s = re.sub(r"\b(less than|at least|no more than)\b",
                   masker("CMP"), s, flags=re.I)                 # comparators
        return s, masks
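    # Example of the masking step (values illustrative; placeholder numbering
    # depends on match order):
    #   s, masks = opt._mask_spans("Keep `df.head()` and https://example.com intact")
    #   s     -> "Keep ICMASK0 and URLMASK1 intact"
    #   masks -> {"ICMASK0": "`df.head()`", "URLMASK1": "https://example.com"}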
    def _unmask_spans(self, s, masks):
        for key, original in masks.items():
            s = s.replace(key, original)
        return s
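    # _unmask_spans is the inverse of _mask_spans, e.g.:
    #   opt._unmask_spans("Keep ICMASK0", {"ICMASK0": "`df.head()`"})
    #   -> "Keep `df.head()`"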
    def optimize(self, prompt: str, aggressiveness: float = 0.7) -> tuple:
        """Optimize prompt and report before/after token counts."""
        masked_prompt, masks = self._mask_spans(prompt)
        optimized = self._apply_rules(masked_prompt, aggressiveness)
        optimized = self._linguistic_optimize(optimized, aggressiveness)
        # Collapse whitespace before unmasking, so newlines inside restored
        # code blocks are preserved.
        optimized = re.sub(r"\s+", " ", optimized).strip()
        optimized = self._unmask_spans(optimized, masks)
        try:
            orig_tokens = len(self.tokenizer.encode(prompt))
            new_tokens = len(self.tokenizer.encode(optimized))
        except Exception:
            # Fall back to a rough word count if encoding fails
            orig_tokens = len(prompt.split())
            new_tokens = len(optimized.split())
        return optimized, orig_tokens, new_tokens
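    # Example call (output is illustrative; exact text and counts depend on
    # the spaCy model and tokenizer versions installed):
    #   text, before, after = opt.optimize("Could you please summarize this text?")
    #   before > after whenever any rule or token filter fires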
    def _apply_rules(self, text: str, aggressiveness: float) -> str:
        # Rules are ordered safest-first; each fires only when aggressiveness
        # meets its threshold.
        rules = [
            (r"\s{2,}", " ", 0.0),
            (r"\b(\w+)\s+\1\b", r"\1", 0.0),  # collapse doubled words
            (r"\b(advantages and disadvantages)\b", "pros/cons", 0.5),
            (r"\b(in a detailed manner|in a detailed way)\b", "", 0.7),
            (r"\b(I want to|I need to|I would like to)\b", "", 0.7),
            # "i.e." is deliberately excluded: it means "that is", not "for example"
            (r"\b(for example|e\.g\.|such as)\b", "e.g.", 0.8),
            (r"\b(please\s+)?(kindly\s+)?(carefully|very|extremely|really|quite)\b", "", 0.8),
            (r"\b(can you|could you|would you)\b", "", 0.9),
            (r"\b(output|provide|give|return)\s+in\s+(JSON|json)\s+format\b", "JSON:", 1.0),
        ]
        for pattern, repl, threshold in rules:
            if aggressiveness >= threshold:
                text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
        return text
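    # Example: at aggressiveness 0.9 the politeness rule fires, so
    # "can you list the pros" -> "list the pros" (after whitespace cleanup);
    # at 0.5 the same input is left unchanged.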
    def _linguistic_optimize(self, text: str, aggressiveness: float) -> str:
        if not text.strip():
            return text
        doc = self.nlp(text)
        out = []
        for token in doc:
            # Never drop placeholders produced by _mask_spans
            if re.fullmatch(r"(CODE|IC|URL|CMP)MASK\d+", token.text):
                out.append(token.text)
                continue
            # Guard important labels; spaCy splits the trailing colon into its
            # own token, so match the bare word and re-attach the ":".
            if token.is_sent_start and token.text.lower() in ("deliverables", "constraints", "metrics"):
                out.append(token.text)
                continue
            if token.text == ":" and out and out[-1].lower() in ("deliverables", "constraints", "metrics"):
                out[-1] += ":"
                continue
            if token.pos_ in ("PUNCT", "SPACE"):
                continue
            # Keep numbers, named entities, and negations untouched
            if token.like_num or token.ent_type_ or token.dep_ == "neg" \
                    or token.text.lower() in self.negation_words:
                out.append(token.text)
                continue
            # Content words carry the meaning; always keep them
            if token.pos_ in ("PROPN", "NUM", "NOUN", "ADJ"):
                out.append(token.text)
                continue
            if token.pos_ == "VERB":
                if aggressiveness >= 0.8:
                    # Reduce inflected verbs to their lemma to save tokens
                    lemmas = getLemma(token.text, upos="VERB") or (token.lemma_,)
                    out.append(lemmas[0])
                else:
                    out.append(token.text)
                continue
            # Function words are droppable at higher aggressiveness
            if token.pos_ in ("ADV", "DET", "PRON"):
                if aggressiveness < 0.6:
                    out.append(token.text)
                continue
            out.append(token.text)
        return " ".join(out)