attempt checkpointing
modular_graph_and_candidates.py  +52 -22
@@ -95,7 +95,6 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
                 out[(m1, m2)] = s
     return out
 
-
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
@@ -113,11 +112,10 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
-        # Skip models that cause GPU task aborts
         if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
             print(f"Skipping {name} (causes GPU abort)")
             continue
-
+
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
             try:
@@ -130,29 +128,54 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print(f"Encoding embeddings for {len(names)} models...")
-    batch_size = 4 #
-
-    for i in range(0, len(names), batch_size):
+    batch_size = 4 # keep your default
+
+    # ── checkpoint / resume ─────────────────────────────────────────────────
+    ckpt_path = models_root / "__emb_ckpt.npz"
+    start_idx = 0
+    emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
+
+    if ckpt_path.exists():
+        try:
+            ckpt = np.load(ckpt_path, allow_pickle=True)
+            ckpt_names = list(ckpt["names"])
+            if names[:len(ckpt_names)] == ckpt_names:
+                loaded = ckpt["embeddings"].astype(np.float32)
+                all_embeddings.append(loaded)
+                start_idx = len(ckpt_names)
+                print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
+        except Exception as e:
+            print(f"⚠️ Failed to load checkpoint: {type(e).__name__}: {e}")
+    # ────────────────────────────────────────────────────────────────────────
+
+    for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
         batch_names = names[i:i+batch_size]
         batch_texts = [texts[name] for name in batch_names]
-
+
         try:
             print(f"Processing batch: {batch_names}")
             emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
-            all_embeddings.append(emb)
-            print(f"✅ Completed batch of {len(batch_names)} models")
-
-            # Clear GPU cache every 3 batches to prevent memory accumulation
-            if i % (3 * batch_size) == 0 and torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()  # Force GPU sync
-                print(f"🧹 Cleared GPU cache after batch {i//batch_size + 1}")
-
         except Exception as e:
             print(f"⚠️ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
-
-
-
+            emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
+
+        all_embeddings.append(emb)
+
+        # save checkpoint after each batch
+        try:
+            cur = np.vstack(all_embeddings).astype(np.float32)
+            np.savez(
+                ckpt_path,
+                embeddings=cur,
+                names=np.array(names[:i+len(batch_names)], dtype=object),
+            )
+        except Exception as e:
+            print(f"⚠️ Failed to write checkpoint: {type(e).__name__}: {e}")
+
+        if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            print(f"🧹 Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")
 
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
@@ -162,19 +185,26 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     sims_mat = embeddings @ embeddings.T
 
     out = {}
-    matrix_size = embeddings.shape[0]
-    processed_names = names[:matrix_size]
-
+    matrix_size = embeddings.shape[0]
+    processed_names = names[:matrix_size]
     for i in range(matrix_size):
         for j in range(i + 1, matrix_size):
             s = float(sims_mat[i, j])
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
+
+    # best-effort cleanup
+    try:
+        ckpt_path.unlink()
+    except Exception:
+        pass
+
     return out
 
 
 
+
 # ────────────────────────────────────────────────────────────────────────────────
 # 2) Scan *modular_*.py* files to build an import-dependency graph
 # → only **modeling_*** imports are considered (skip configuration / processing)
# β only **modeling_*** imports are considered (skip configuration / processing)
|
|
|
|
| 95 |
out[(m1, m2)] = s
|
| 96 |
return out
|
| 97 |
|
|
|
|
| 98 |
@spaces.GPU
|
| 99 |
def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
|
| 100 |
model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
|
|
|
|
| 112 |
|
| 113 |
texts = {}
|
| 114 |
for name in tqdm(missing, desc="Reading modeling files"):
|
|
|
|
| 115 |
if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
|
| 116 |
print(f"Skipping {name} (causes GPU abort)")
|
| 117 |
continue
|
| 118 |
+
|
| 119 |
code = ""
|
| 120 |
for py in (models_root / name).rglob("modeling_*.py"):
|
| 121 |
try:
|
|
|
|
| 128 |
all_embeddings = []
|
| 129 |
|
| 130 |
print(f"Encoding embeddings for {len(names)} models...")
|
| 131 |
+
batch_size = 4 # keep your default
|
| 132 |
+
|
| 133 |
+
# ββ checkpoint / resume ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
+
ckpt_path = models_root / "__emb_ckpt.npz"
|
| 135 |
+
start_idx = 0
|
| 136 |
+
emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
|
| 137 |
+
|
| 138 |
+
if ckpt_path.exists():
|
| 139 |
+
try:
|
| 140 |
+
ckpt = np.load(ckpt_path, allow_pickle=True)
|
| 141 |
+
ckpt_names = list(ckpt["names"])
|
| 142 |
+
if names[:len(ckpt_names)] == ckpt_names:
|
| 143 |
+
loaded = ckpt["embeddings"].astype(np.float32)
|
| 144 |
+
all_embeddings.append(loaded)
|
| 145 |
+
start_idx = len(ckpt_names)
|
| 146 |
+
print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"β οΈ Failed to load checkpoint: {type(e).__name__}: {e}")
|
| 149 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 150 |
+
|
| 151 |
+
for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
|
| 152 |
batch_names = names[i:i+batch_size]
|
| 153 |
batch_texts = [texts[name] for name in batch_names]
|
| 154 |
+
|
| 155 |
try:
|
| 156 |
print(f"Processing batch: {batch_names}")
|
| 157 |
emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
except Exception as e:
|
| 159 |
print(f"β οΈ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
|
| 160 |
+
emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
|
| 161 |
+
|
| 162 |
+
all_embeddings.append(emb)
|
| 163 |
+
|
| 164 |
+
# save checkpoint after each batch
|
| 165 |
+
try:
|
| 166 |
+
cur = np.vstack(all_embeddings).astype(np.float32)
|
| 167 |
+
np.savez(
|
| 168 |
+
ckpt_path,
|
| 169 |
+
embeddings=cur,
|
| 170 |
+
names=np.array(names[:i+len(batch_names)], dtype=object),
|
| 171 |
+
)
|
| 172 |
+
except Exception as e:
|
| 173 |
+
print(f"β οΈ Failed to write checkpoint: {type(e).__name__}: {e}")
|
| 174 |
+
|
| 175 |
+
if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
|
| 176 |
+
torch.cuda.empty_cache()
|
| 177 |
+
torch.cuda.synchronize()
|
| 178 |
+
print(f"π§Ή Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")
|
| 179 |
|
| 180 |
embeddings = np.vstack(all_embeddings).astype(np.float32)
|
| 181 |
norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
|
|
|
|
| 185 |
sims_mat = embeddings @ embeddings.T
|
| 186 |
|
| 187 |
out = {}
|
| 188 |
+
matrix_size = embeddings.shape[0]
|
| 189 |
+
processed_names = names[:matrix_size]
|
|
|
|
| 190 |
for i in range(matrix_size):
|
| 191 |
for j in range(i + 1, matrix_size):
|
| 192 |
s = float(sims_mat[i, j])
|
| 193 |
if s >= thr:
|
| 194 |
out[(processed_names[i], processed_names[j])] = s
|
| 195 |
+
|
| 196 |
+
# best-effort cleanup
|
| 197 |
+
try:
|
| 198 |
+
ckpt_path.unlink()
|
| 199 |
+
except Exception:
|
| 200 |
+
pass
|
| 201 |
+
|
| 202 |
return out
|
| 203 |
|
| 204 |
|
| 205 |
|
| 206 |
|
| 207 |
+
|
| 208 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
# 2) Scan *modular_*.py* files to build an importβdependency graph
|
| 210 |
# β only **modeling_*** imports are considered (skip configuration / processing)
|