File size: 2,507 Bytes
0b70f11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# make_test_from_all_sessions.py
# Usage from CLI (still works): python make_test_from_all_sessions.py
# Usage from Python: main("path/to/input.json", "path/to/output.jsonl")

import json
import re
from pathlib import Path
from datetime import datetime

# Default paths used by main() when it is called without arguments.
DEFAULT_IN = Path("exported_sessions/all_sessions.json")  # JSON file read by main()
DEFAULT_OUT = Path("data/orchestrated/pre_annotate.jsonl")  # JSONL file written by main()

# Maps the lower-cased "role" value of a chat message to the speaker label
# emitted in the output; messages whose role is not a key here are skipped
# by iter_messages().
ROLE_MAP = {
    "user": "Client",
    "assistant": "Therapist",
}

# Matches one leading speaker label ("User:", "Bot:", "Client:", "Therapist:"),
# in any letter case, together with the whitespace around it.
PREFIX_RE = re.compile(r'^\s*(?:User|Bot|Client|Therapist)\s*:\s*', re.IGNORECASE)


def clean_text(text: str) -> str:
    """Return *text* trimmed and without a leading speaker label.

    Anything that is not a ``str`` yields the empty string. Otherwise the
    surrounding whitespace is removed and, if the result begins with one of
    the labels recognised by ``PREFIX_RE``, that single label is dropped.
    """
    if isinstance(text, str):
        trimmed = text.strip()
        # The pattern is anchored at the start, so at most one label can match.
        label = PREFIX_RE.match(trimmed)
        if label is None:
            return trimmed
        return trimmed[label.end():]
    return ""

def iso_to_dt(s):
    """Parse an ISO-8601 timestamp string into a naive ``datetime``.

    A trailing "Z" (UTC designator) is dropped before parsing, and a
    timestamp carrying an explicit UTC offset is shifted to UTC and has its
    tzinfo removed. Every successful result is therefore naive, so results
    can be compared with each other and with ``datetime.max`` without
    raising ``TypeError``. Timestamps without any offset are returned as
    parsed.

    Args:
        s: The timestamp text. Non-string values are rejected.

    Returns:
        A naive ``datetime``, or ``None`` when *s* is not a string or cannot
        be parsed.
    """
    if not isinstance(s, str):
        return None
    text = s.strip()
    # "Z" means UTC; dropping it already leaves the naive-UTC form we want
    # and keeps parsing working on Python versions whose fromisoformat()
    # does not understand the suffix.
    if text[-1:] in ("Z", "z"):
        text = text[:-1]
    try:
        parsed = datetime.fromisoformat(text)
        offset = parsed.utcoffset()
        if offset is not None:
            # Aware value: convert to UTC wall-clock time and drop tzinfo.
            parsed = parsed.replace(tzinfo=None) - offset
    except (ValueError, OverflowError):
        return None
    return parsed

def _message_sort_key(m):
    """Build a sort key ordering a message by timestamp, then by id.

    The key never mixes incomparable values: datetimes are forced to naive
    UTC, and the id is wrapped in a small tuple whose first element ranks
    its type, so numeric, string and missing ids can coexist in one history.
    """
    ts = m.get("timestamp") or m.get("created_at") or ""
    dt = iso_to_dt(ts)
    if dt is None:
        # Missing or unparseable timestamps sort after every real one.
        dt = datetime.max
    offset = dt.utcoffset()
    if offset is not None:
        # Aware and naive datetimes cannot be compared; normalise to naive UTC.
        dt = dt.replace(tzinfo=None) - offset

    msg_id = m.get("id")
    if isinstance(msg_id, (int, float)):
        id_key = (0, msg_id, "")
    elif isinstance(msg_id, str):
        id_key = (1, 0, msg_id)
    else:
        # Missing / null / unsupported id: same large sentinel as before.
        id_key = (0, 10**12, "")
    return (dt, id_key)


def iter_messages(all_sessions):
    """Yield cleaned messages from every session in chronological order.

    For each session, the entries of its "chat_history" list are sorted by
    timestamp ("timestamp", falling back to "created_at") and then by "id".
    Messages whose role is not a key of ``ROLE_MAP`` or whose cleaned
    content is empty are skipped. Sessions are processed in the order given
    and no marker is emitted between them.

    Args:
        all_sessions: Iterable of session dicts.

    Yields:
        ``{"role": <mapped label>, "text": <cleaned content>}`` dicts.
    """
    for sess in all_sessions:
        history = sorted(sess.get("chat_history") or [], key=_message_sort_key)

        for m in history:
            # str() keeps an unexpected non-string role from crashing on
            # .lower(); such a role simply will not be found in ROLE_MAP.
            role = str(m.get("role") or "").lower()
            if role not in ROLE_MAP:
                continue
            text = clean_text(m.get("content") or "")
            if not text:
                continue
            yield {"role": ROLE_MAP[role], "text": text}

def main(in_path: Path = DEFAULT_IN, out_path: Path = DEFAULT_OUT):
    """Convert an exported-sessions JSON file into a JSONL file of examples.

    Each message produced by ``iter_messages`` becomes one output line with
    the keys "history" (all previously written messages), "utterance_role"
    and "utterance_text". The parent directory of *out_path* is created if
    needed, and a summary line is printed at the end.

    NOTE(review): the history list is created once and never reset, so it
    keeps growing across session boundaries — confirm this is intended.

    Args:
        in_path: Path (or path string) of the JSON input.
        out_path: Path (or path string) of the JSONL output.

    Raises:
        FileNotFoundError: If *in_path* does not exist.
    """
    in_path, out_path = Path(in_path), Path(out_path)

    if not in_path.exists():
        raise FileNotFoundError(f"Missing {in_path}")
    with in_path.open("r", encoding="utf-8") as src:
        sessions = json.load(src)

    out_path.parent.mkdir(parents=True, exist_ok=True)

    prior_turns = []
    n_written = 0  # stays 0 when there are no messages at all
    with out_path.open("w", encoding="utf-8") as sink:
        for n_written, turn in enumerate(iter_messages(sessions), start=1):
            speaker, utterance = turn["role"], turn["text"]
            record = {
                "history": list(prior_turns),
                "utterance_role": speaker,
                "utterance_text": utterance,
            }
            sink.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Only after writing: the current turn is context for later ones.
            prior_turns.append({"role": speaker, "text": utterance})

    print(f"Wrote {n_written} lines to {out_path}")

if __name__ == "__main__":
    # Script entry point: run the conversion with DEFAULT_IN / DEFAULT_OUT
    # (no command-line arguments are parsed).
    main()