from typing import Dict, List, Tuple import hashlib class Pseudonymizer: def __init__(self): # Stores mapping of original entity -> pseudonym per entity type self.entity_map: Dict[str, Dict[str, str]] = {} def _get_hash(self, text: str) -> str: """ Generates a deterministic short hash based on input text. """ return hashlib.md5(text.encode()).hexdigest()[:6] def pseudonymize(self, text: str, entities: List[Dict]) -> Tuple[str, Dict[str, str]]: """ Replace detected entities in text with consistent pseudonyms. Returns pseudonymized text and the mapping used. """ offset_correction = 0 text_out = text local_mapping = {} for ent in sorted(entities, key=lambda x: x["start"]): label = ent["entity_group"] word = ent["word"] if label not in self.entity_map: self.entity_map[label] = {} if word not in self.entity_map[label]: pseudonym = f"[{label}_{self._get_hash(word)}]" self.entity_map[label][word] = pseudonym replacement = self.entity_map[label][word] local_mapping[word] = replacement start = ent["start"] + offset_correction end = ent["end"] + offset_correction text_out = text_out[:start] + replacement + text_out[end:] offset_correction += len(replacement) - (end - start) return text_out, local_mapping