Update app.py
app.py CHANGED

@@ -1,57 +1,45 @@
-# app.py – encoder-only demo
-#
-# launch: python app.py
-
-import json, re, sys
 from pathlib import Path, PurePosixPath

-import
-import
 from huggingface_hub import snapshot_download
-
 from bert_handler import create_handler_from_checkpoint

 # ------------------------------------------------------------------
-# 0.
- …
-)
- …
-cfg = json.loads(cfg_path.read_text())
-auto_map = cfg.get("auto_map", {})
-changed = False
-for k, v in auto_map.items():
-    if "--" in v:  # strip “repo--”
-        auto_map[k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
-        changed = True
-if changed:
-    cfg_path.write_text(json.dumps(cfg, indent=2))
-    print("🛠️ Patched config.json → auto_map points to local modules")

 # ------------------------------------------------------------------
-# 1.
-
-handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CKPT)
 full_model = full_model.eval().cuda()

- …

 # ------------------------------------------------------------------
-# 2.
-# ------------------------------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
@@ -60,108 +48,96 @@ SYMBOLIC_ROLES = [
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
- …
-if
-    sys.exit(f"❌ Tokenizer missing {
-
-MASK_ID = tokenizer.mask_token_id
-MASK_TOK = tokenizer.mask_token

 # ------------------------------------------------------------------
-#
- …
-    return encoder(x, attention_mask=ext).squeeze(0)  # (S,H)
-
-def pool_accuracy(ids, mask, pool_positions):
-    """mask positions in pool, predict, calc accuracy"""
-    masked = ids.clone()
-    masked[0, pool_positions] = MASK_ID
-    with torch.no_grad():
-        logits = full_model(masked, attention_mask=mask).logits[0]
-    preds = logits.argmax(-1)
-    gold = ids.squeeze(0)
-    correct = (preds[pool_positions] == gold[pool_positions]).sum().item()
-    return correct / len(pool_positions) if pool_positions else 0.0

-#
-

-#
- …
-#
- …
-if
- …
-        accepted = ["(none hit 50 %)"]
-
-    return ", ".join(accepted), f"{len(present)} roles analysed", f"{text[:80]}…"

 # ------------------------------------------------------------------
-# 4. UI
-def
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
             "## 🧠 Symbolic Encoder Inspector \n"
-            "
         )
         with gr.Row():
             with gr.Column():
-                txt = gr.Textbox(
                 roles = gr.CheckboxGroup(
-                    SYMBOLIC_ROLES,
-                    value=SYMBOLIC_ROLES,
-                    label="Roles to
                 )
-
             with gr.Column():
- …

-
     return demo

 if __name__ == "__main__":
-
+# app.py – encoder-only + masking accuracy demo for bert-beatrix-2048
+# -----------------------------------------------------------------
+# launch: python app.py   (UI at http://localhost:7860)
+
+import json, re, sys
 from pathlib import Path, PurePosixPath

+import gradio as gr
+import spaces
+import torch
 from huggingface_hub import snapshot_download
 from bert_handler import create_handler_from_checkpoint

 # ------------------------------------------------------------------
+# 0. download repo + patch auto_map --------------------------------
+REPO_ID = "AbstractPhil/bert-beatrix-2048"
+LOCAL_CK = "bert-beatrix-2048"
+snapshot_download(repo_id=REPO_ID, local_dir=LOCAL_CK, local_dir_use_symlinks=False)
+
+cfg_p = Path(LOCAL_CK) / "config.json"
+with cfg_p.open() as f:
+    cfg = json.load(f)
+for k, v in cfg.get("auto_map", {}).items():
+    if "--" in v:
+        cfg["auto_map"][k] = PurePosixPath(v.split("--", 1)[1]).as_posix()
+with cfg_p.open("w") as f:
+    json.dump(cfg, f, indent=2)

 # ------------------------------------------------------------------
+# 1. load model / tokenizer ---------------------------------------
+handler, full_model, tokenizer = create_handler_from_checkpoint(LOCAL_CK)
 full_model = full_model.eval().cuda()

+encoder = full_model.bert.encoder
+embeddings = full_model.bert.embeddings
+emb_ln = full_model.bert.emb_ln
+emb_drop = full_model.bert.emb_drop
+
+MASK = tokenizer.mask_token or "[MASK]"

 # ------------------------------------------------------------------
+# 2. symbolic role list -------------------------------------------
 SYMBOLIC_ROLES = [
     "<subject>", "<subject1>", "<subject2>", "<pose>", "<emotion>",
     "<surface>", "<lighting>", "<material>", "<accessory>", "<footwear>",
     "<object_left>", "<object_right>", "<relation>", "<intent>", "<style>",
     "<fabric>", "<jewelry>",
 ]
+miss = [t for t in SYMBOLIC_ROLES
+        if tokenizer.convert_tokens_to_ids(t) == tokenizer.unk_token_id]
+if miss:
+    sys.exit(f"❌ Tokenizer missing {miss}")

 # ------------------------------------------------------------------
+# 3. inference util ----------------------------------------------
+@spaces.GPU
+def encode_and_trace(text: str, selected_roles: list[str]):
+    # ----- 3-A. build masked version & encode original --------------
+    sel_ids = {tokenizer.convert_tokens_to_ids(t) for t in selected_roles}

+    # tokenised “plain” text
+    plain = tokenizer(text, return_tensors="pt").to("cuda")
+    ids_plain = plain.input_ids

+    # make masked string (regex to avoid partial hits)
+    masked_txt = text
+    for tok in selected_roles:
+        masked_txt = re.sub(re.escape(tok), MASK, masked_txt)
+
+    masked = tokenizer(masked_txt, return_tensors="pt").to("cuda")
+    ids_masked = masked.input_ids
+
+    # ----- 3-B. run model on masked text ----------------------------
+    with torch.no_grad():
+        logits = full_model(**masked).logits[0]   # (S, V)
+    preds = logits.argmax(-1)                     # (S,)
+
+    # ----- 3-C. gather stats per masked role ------------------------
+    found_tokens, correct = [], 0
+    role_flags = []
+    for i, (orig_id, pred_id) in enumerate(zip(ids_plain[0], preds)):
+        if orig_id.item() in sel_ids and ids_masked[0, i].item() == tokenizer.mask_token_id:
+            found_tokens.append(tokenizer.convert_ids_to_tokens([orig_id])[0])
+            correct += int(orig_id.item() == pred_id.item())
+            role_flags.append(i)
+
+    total = len(role_flags)
+    acc = correct / total if total else 0.0
+
+    # ----- 3-D. encoder rep pooling for *all* selected roles --------
+    with torch.no_grad():
+        # embeddings -> normed reps
+        x = emb_drop(emb_ln(embeddings(ids_plain)))
+        attn = full_model.bert.get_extended_attention_mask(
+            plain.attention_mask, x.shape[:-1]
+        )
+        enc = encoder(x, attention_mask=attn)     # (1,S,H)
+    mask_vec = torch.tensor(
+        [tid in sel_ids for tid in ids_plain[0].tolist()], device=enc.device
+    )
+    if mask_vec.any():
+        pooled = enc[0][mask_vec].mean(0)
+        norm = f"{pooled.norm().item():.4f}"
+    else:
+        norm = "0.0000"
+
+    tokens_str = ", ".join(found_tokens) or "(none)"
+    return tokens_str, norm, f"{acc*100:.1f}%"

 # ------------------------------------------------------------------
+# 4. gradio UI ----------------------------------------------------
+def app():
     with gr.Blocks(title="🧠 Symbolic Encoder Inspector") as demo:
         gr.Markdown(
             "## 🧠 Symbolic Encoder Inspector \n"
+            "1. Model side: we *mask* every chosen role token, run the LM, and report how often it recovers the original. \n"
+            "2. Encoder side: we also pool hidden-state vectors for those roles and give their mean L2-norm."
         )
         with gr.Row():
             with gr.Column():
+                txt = gr.Textbox(
+                    label="Input with Symbolic Tokens",
+                    lines=3,
+                    placeholder="Example: A <subject> wearing <upper_body_clothing> …",
+                )
                 roles = gr.CheckboxGroup(
+                    choices=SYMBOLIC_ROLES,
+                    value=SYMBOLIC_ROLES,   # <- all pre-selected
+                    label="Roles to mask & trace",
                 )
+                run = gr.Button("Run")
             with gr.Column():
+                o_tok = gr.Textbox(label="Masked-role tokens found")
+                o_norm = gr.Textbox(label="Mean hidden-state L2-norm")
+                o_acc = gr.Textbox(label="Recovery accuracy")

+        run.click(encode_and_trace, [txt, roles], [o_tok, o_norm, o_acc])
     return demo

 if __name__ == "__main__":
+    app().launch()
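
For readers who want to try the mask-and-recover idea outside the Space, here is a minimal sketch of the core steps of encode_and_trace: replace the chosen role strings with the mask token, run the masked-LM head, and count how often the original token is predicted back. It assumes a stock Hugging Face fill-mask model ("bert-base-uncased" is only a stand-in, not the bert-beatrix-2048 checkpoint loaded via bert_handler), and recovery_accuracy is a hypothetical helper, not part of the app.

import re
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Stand-in model; the Space loads its own checkpoint through bert_handler.
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
mlm = AutoModelForMaskedLM.from_pretrained("bert-base-uncased").eval()

def recovery_accuracy(text: str, roles: list[str]) -> float:
    """Mask every occurrence of the given role strings and report how
    often the MLM predicts the original token back."""
    masked_txt = text
    for r in roles:
        masked_txt = re.sub(re.escape(r), tok.mask_token, masked_txt)

    plain = tok(text, return_tensors="pt")
    masked = tok(masked_txt, return_tensors="pt")
    # Only meaningful when each role maps to a single token, so the two
    # tokenisations stay aligned position by position.
    if plain.input_ids.shape != masked.input_ids.shape:
        raise ValueError("plain and masked tokenisations are misaligned")

    with torch.no_grad():
        preds = mlm(**masked).logits[0].argmax(-1)   # (S,)

    gold = plain.input_ids[0]
    mask_pos = (masked.input_ids[0] == tok.mask_token_id).nonzero(as_tuple=True)[0]
    if len(mask_pos) == 0:
        return 0.0
    return (preds[mask_pos] == gold[mask_pos]).float().mean().item()

print(recovery_accuracy("a cat sat on the mat", ["cat", "mat"]))

In the Space, the same loop additionally pools encoder hidden states over the selected role positions and reports their mean L2-norm alongside the recovery accuracy.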