kallam-demo-docker / scripts /in_data_preprocessor.py
Koalar's picture
Upload 19 files
0b70f11 verified
raw
history blame
2.51 kB
# in_data_preprocessor.py (this header previously named the script make_test_from_all_sessions.py)
# Usage from CLI (uses the default paths): python in_data_preprocessor.py
# Usage from Python: main("path/to/input.json", "path/to/output.jsonl")
import json
import re
from pathlib import Path
from datetime import datetime
# Default input/output locations, used when main() is called without arguments.
DEFAULT_IN = Path("exported_sessions/all_sessions.json")
DEFAULT_OUT = Path("data/orchestrated/pre_annotate.jsonl")

# Maps the lower-cased "role" of an exported message to the speaker label
# that is written to the output.
ROLE_MAP = dict(user="Client", assistant="Therapist")

# Matches one leading speaker label such as "User:" or "therapist :" (any
# letter case), together with the whitespace around it.
PREFIX_RE = re.compile(
    r'^\s*(?:User|Bot|Client|Therapist)\s*:\s*',
    flags=re.IGNORECASE,
)
def clean_text(text: str) -> str:
    """Return *text* trimmed of surrounding whitespace and of one leading speaker label.

    The label is whatever PREFIX_RE matches at the start of the trimmed
    string (for example ``"User:"`` or ``"Therapist :"``). Any value that is
    not a ``str`` yields an empty string instead of raising.
    """
    if isinstance(text, str):
        trimmed = text.strip()
        # PREFIX_RE is anchored at the start, so at most one label is removed.
        return PREFIX_RE.sub("", trimmed)
    return ""
def iso_to_dt(s):
    """Parse an ISO-8601 timestamp string into a naive datetime expressed in UTC.

    A trailing ``"Z"`` (UTC designator) is accepted. Timestamps carrying an
    explicit UTC offset are converted to UTC and returned without tzinfo, so
    every value this function returns can be compared with every other one
    and with naive sentinels such as ``datetime.max``. (Comparing a naive
    datetime with an aware one raises ``TypeError``.)

    Args:
        s: the timestamp text; anything that is not a ``str`` is rejected.

    Returns:
        A naive ``datetime``, or ``None`` when *s* is not a string or cannot
        be parsed.
    """
    if not isinstance(s, str):
        return None

    text = s.strip()
    # Only the trailing "Z" is a UTC marker; drop just that one character so
    # the string parses on Python versions whose fromisoformat() rejects it.
    if text.endswith("Z"):
        text = text[:-1]

    try:
        parsed = datetime.fromisoformat(text)
    except ValueError:
        return None

    offset = parsed.utcoffset()
    if offset is None:
        # No offset given: already naive ("Z" inputs land here as UTC wall time).
        return parsed
    try:
        # Shift to UTC, then discard tzinfo so the result is naive.
        return parsed.replace(tzinfo=None) - offset
    except OverflowError:
        # The shift would leave the representable datetime range.
        return None
def iter_messages(all_sessions):
    """Yield cleaned ``{"role", "text"}`` dicts for every usable message.

    Sessions are visited in input order. Within a session, the entries of
    ``chat_history`` are sorted by their ``timestamp`` (falling back to
    ``created_at``); entries whose timestamp is missing or unparseable sort
    after all dated ones, and ties are broken by ``id``.

    Skipped without error: sessions or messages that are not dicts, a
    ``chat_history`` that is not a list/tuple, messages whose role is not a
    string found in ROLE_MAP (after lower-casing), and messages whose
    cleaned content is empty.

    Args:
        all_sessions: iterable of session dicts.

    Yields:
        dict with ``"role"`` (the ROLE_MAP label) and ``"text"`` (the
        content after clean_text()).
    """

    def id_rank(value):
        # The id is only a tie-break, but tuple keys compare it directly, and
        # comparing int with str or None raises TypeError. Bucket by type so
        # only like-typed values are ever compared: numbers, strings, rest.
        if isinstance(value, (int, float)):
            return (0, value)
        if isinstance(value, str):
            return (1, value)
        return (2, 0)

    def sort_key(message):
        stamp = message.get("timestamp") or message.get("created_at") or ""
        # Missing or unparseable timestamps sort after every real one.
        when = iso_to_dt(stamp) or datetime.max
        # A message without an id keeps the large numeric default.
        return (when, id_rank(message.get("id", 10**12)))

    for sess in all_sessions:
        if not isinstance(sess, dict):
            continue
        history = sess.get("chat_history") or []
        if not isinstance(history, (list, tuple)):
            continue

        # Malformed (non-dict) entries cannot be keyed or read; drop them.
        messages = [m for m in history if isinstance(m, dict)]

        for m in sorted(messages, key=sort_key):
            role = m.get("role")
            if not isinstance(role, str):
                continue
            role = role.lower()
            if role not in ROLE_MAP:
                continue

            text = clean_text(m.get("content") or "")
            if not text:
                continue

            yield {"role": ROLE_MAP[role], "text": text}
def main(
    in_path: Path = DEFAULT_IN,
    out_path: Path = DEFAULT_OUT,
    *,
    reset_history_per_session: bool = False,
):
    """Convert an exported sessions JSON file into a JSONL file of rolling-history examples.

    One line is written per message yielded by iter_messages(). Each line is
    a JSON object with ``"history"`` (the messages that came before it, as
    ``{"role", "text"}`` dicts), ``"utterance_role"`` and
    ``"utterance_text"``.

    Args:
        in_path: JSON file holding the list of sessions (str or Path).
        out_path: JSONL file to write (str or Path); missing parent
            directories are created and an existing file is overwritten.
        reset_history_per_session: when False (the default, matching the
            historical output) the history keeps growing across session
            boundaries, so the first utterance of a session carries every
            message of the sessions before it. When True the history starts
            empty at the beginning of each session.

    Returns:
        The number of lines written.

    Raises:
        FileNotFoundError: if *in_path* does not exist.
    """
    in_path = Path(in_path)
    out_path = Path(out_path)

    if not in_path.exists():
        raise FileNotFoundError(f"Missing {in_path}")

    with in_path.open("r", encoding="utf-8") as f:
        all_sessions = json.load(f)

    # Each group shares one rolling history: either every session on its own,
    # or all sessions together (the original behaviour).
    if reset_history_per_session:
        groups = ([sess] for sess in all_sessions)
    else:
        groups = [all_sessions]

    n_written = 0
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", encoding="utf-8") as out:
        for group in groups:
            rolling_history = []
            for msg in iter_messages(group):
                example = {
                    # Snapshot: the list keeps growing after this line is written.
                    "history": rolling_history.copy(),
                    "utterance_role": msg["role"],
                    "utterance_text": msg["text"],
                }
                out.write(json.dumps(example, ensure_ascii=False) + "\n")
                n_written += 1
                rolling_history.append({"role": msg["role"], "text": msg["text"]})

    print(f"Wrote {n_written} lines to {out_path}")
    return n_written
if __name__ == "__main__":
    # Script entry point: run the conversion with DEFAULT_IN and DEFAULT_OUT.
    # Command-line arguments are not read; call main() from Python to use
    # other paths.
    main()