Spaces:
Runtime error
Runtime error
File size: 2,507 Bytes
0b70f11 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
# make_test_from_all_sessions.py
# Usage from CLI (still works): python make_test_from_all_sessions.py
# Usage from Python: main("path/to/input.json", "path/to/output.jsonl")
import json
import re
from pathlib import Path
from datetime import datetime
# Defaults
# Input JSON file that main() reads when no in_path is supplied.
DEFAULT_IN = Path("exported_sessions/all_sessions.json")
# Output JSONL file that main() writes when no out_path is supplied.
DEFAULT_OUT = Path("data/orchestrated/pre_annotate.jsonl")
# Maps the (lower-cased) role found on each exported message to the speaker
# label written to the output. Messages whose role is not a key here are
# skipped by iter_messages().
ROLE_MAP = {
    "user": "Client",
    "assistant": "Therapist",
}
# Matches one leading speaker tag ("User:", "Bot:", "Client:", "Therapist:"),
# case-insensitively, together with the whitespace around it.
PREFIX_RE = re.compile(r'^\s*(?:User|Bot|Client|Therapist)\s*:\s*', re.IGNORECASE)


def clean_text(text: str) -> str:
    """Return *text* trimmed and without a leading speaker tag.

    Surrounding whitespace is stripped first, then at most one leading
    ``User:`` / ``Bot:`` / ``Client:`` / ``Therapist:`` tag is removed.
    Anything that is not a string yields an empty string.
    """
    if isinstance(text, str):
        trimmed = text.strip()
        return PREFIX_RE.sub("", trimmed)
    return ""
def iso_to_dt(s):
    """Parse an ISO-8601 timestamp string into a naive UTC ``datetime``.

    A trailing ``Z`` is treated as UTC. Timestamps that carry an explicit
    offset (e.g. ``+02:00``) are converted to UTC and returned without
    tzinfo, so every value this function returns can be compared with every
    other one (and with naive sentinels such as ``datetime.max``) without
    raising ``TypeError``.

    Args:
        s: The timestamp text. Non-string values are tolerated.

    Returns:
        A naive ``datetime`` expressed in UTC, or ``None`` when *s* is not a
        string or cannot be parsed.
    """
    if not isinstance(s, str):
        return None
    # Only a *trailing* "Z" is a UTC designator; dropping it leaves a naive
    # value that is already in UTC. (fromisoformat() rejects "Z" before 3.11.)
    text = s[:-1] if s.endswith("Z") else s
    try:
        parsed = datetime.fromisoformat(text)
        offset = parsed.utcoffset()
        if offset is not None:
            # Shift the wall-clock time to UTC and drop tzinfo so that aware
            # and naive inputs end up on the same comparable footing.
            parsed = parsed.replace(tzinfo=None) - offset
    except (ValueError, OverflowError):
        return None
    return parsed
def _message_sort_key(m):
    """Build a sort key (timestamp, then id) that never mixes incomparable types."""
    ts = m.get("timestamp") or m.get("created_at") or ""
    dt = iso_to_dt(ts)
    if dt is None:
        # Messages without a usable timestamp sort after all dated ones.
        dt = datetime.max
    elif dt.utcoffset() is not None:
        # Aware and naive datetimes cannot be ordered against each other;
        # reduce aware values to naive UTC before comparing.
        dt = dt.replace(tzinfo=None) - dt.utcoffset()

    raw_id = m.get("id")
    if raw_id is None:
        # Missing ids keep the original large-number placeholder.
        return (dt, 0, 10**12, "")
    if isinstance(raw_id, (int, float)):
        return (dt, 0, raw_id, "")
    # Non-numeric ids are ordered among themselves as text, after numeric
    # ones, so a tie on the timestamp never compares a number with a string.
    return (dt, 1, 0, str(raw_id))


def iter_messages(all_sessions):
    """Yield cleaned ``{"role", "text"}`` dicts for every usable message.

    Sessions are visited in the order given. Within a session the entries of
    ``chat_history`` are sorted by ``timestamp`` (falling back to
    ``created_at``) and then by ``id``. Messages whose role is not a key of
    ``ROLE_MAP``, or whose cleaned content is empty, are skipped.

    Args:
        all_sessions: Iterable of session dicts, each optionally holding a
            ``chat_history`` list of message dicts.

    Yields:
        Dicts with ``role`` (the mapped speaker label) and ``text`` (the
        content after ``clean_text``).
    """
    for sess in all_sessions:
        history = sess.get("chat_history", []) or []
        for m in sorted(history, key=_message_sort_key):
            raw_role = m.get("role")
            # A non-string role can never match ROLE_MAP; treat it as absent
            # instead of failing on .lower().
            role = raw_role.lower() if isinstance(raw_role, str) else ""
            if role not in ROLE_MAP:
                continue
            text = clean_text(m.get("content") or "")
            if not text:
                continue
            yield {"role": ROLE_MAP[role], "text": text}
def main(in_path: Path = DEFAULT_IN, out_path: Path = DEFAULT_OUT):
    """Convert the exported sessions JSON into a JSONL file of examples.

    One line is written per message produced by ``iter_messages``. Each line
    is a JSON object holding the utterance's role and text plus ``history``,
    the list of all messages emitted before it. The history is never reset,
    so it carries over from one session into the next.

    Args:
        in_path: Location of the input JSON file (str or Path).
        out_path: Location of the JSONL file to write; missing parent
            directories are created.

    Raises:
        FileNotFoundError: If *in_path* does not exist.
    """
    in_path = Path(in_path)
    out_path = Path(out_path)
    if not in_path.exists():
        raise FileNotFoundError(f"Missing {in_path}")

    with in_path.open("r", encoding="utf-8") as src:
        sessions = json.load(src)

    out_path.parent.mkdir(parents=True, exist_ok=True)

    seen_turns = []
    n_written = 0
    with out_path.open("w", encoding="utf-8") as sink:
        for turn in iter_messages(sessions):
            role, text = turn["role"], turn["text"]
            # Snapshot the turns seen so far *before* adding the current one.
            record = {
                "history": list(seen_turns),
                "utterance_role": role,
                "utterance_text": text,
            }
            line = json.dumps(record, ensure_ascii=False)
            sink.write(line + "\n")
            seen_turns.append({"role": role, "text": text})
            n_written += 1

    print(f"Wrote {n_written} lines to {out_path}")
if __name__ == "__main__":
    # Running this file as a script calls main() with its default paths
    # (DEFAULT_IN and DEFAULT_OUT); no command-line arguments are read.
    main()
|