Spaces:
Runtime error
Runtime error
| # make_test_from_all_sessions.py | |
| # Usage from CLI (still works): python make_test_from_all_sessions.py | |
| # Usage from Python: main("path/to/input.json", "path/to/output.jsonl") | |
| import json | |
| import re | |
| from pathlib import Path | |
| from datetime import datetime | |
| # Defaults | |
| DEFAULT_IN = Path("exported_sessions/all_sessions.json") | |
| DEFAULT_OUT = Path("data/orchestrated/pre_annotate.jsonl") | |
| ROLE_MAP = { | |
| "user": "Client", | |
| "assistant": "Therapist", | |
| } | |
| PREFIX_RE = re.compile(r'^\s*(?:User|Bot|Client|Therapist)\s*:\s*', re.IGNORECASE) | |
| def clean_text(text: str) -> str: | |
| if not isinstance(text, str): | |
| return "" | |
| return PREFIX_RE.sub("", text.strip()) | |
| def iso_to_dt(s): | |
| try: | |
| return datetime.fromisoformat(s.replace("Z","")) | |
| except Exception: | |
| return None | |
| def iter_messages(all_sessions): | |
| for sess in all_sessions: | |
| history = sess.get("chat_history", []) or [] | |
| def sort_key(m): | |
| ts = m.get("timestamp") or m.get("created_at") or "" | |
| dt = iso_to_dt(ts) or datetime.max | |
| return (dt, m.get("id", 10**12)) | |
| history = sorted(history, key=sort_key) | |
| for m in history: | |
| role = (m.get("role") or "").lower() | |
| if role not in ROLE_MAP: | |
| continue | |
| text = clean_text(m.get("content") or "") | |
| if not text: | |
| continue | |
| yield {"role": ROLE_MAP[role], "text": text} | |
| def main(in_path: Path = DEFAULT_IN, out_path: Path = DEFAULT_OUT): | |
| in_path = Path(in_path) | |
| out_path = Path(out_path) | |
| if not in_path.exists(): | |
| raise FileNotFoundError(f"Missing {in_path}") | |
| with in_path.open("r", encoding="utf-8") as f: | |
| all_sessions = json.load(f) | |
| rolling_history = [] | |
| n_written = 0 | |
| out_path.parent.mkdir(parents=True, exist_ok=True) | |
| with out_path.open("w", encoding="utf-8") as out: | |
| for msg in iter_messages(all_sessions): | |
| example = { | |
| "history": rolling_history.copy(), | |
| "utterance_role": msg["role"], | |
| "utterance_text": msg["text"], | |
| } | |
| out.write(json.dumps(example, ensure_ascii=False) + "\n") | |
| n_written += 1 | |
| rolling_history.append({"role": msg["role"], "text": msg["text"]}) | |
| print(f"Wrote {n_written} lines to {out_path}") | |
| if __name__ == "__main__": | |
| # Still works from CLI with defaults | |
| main() | |