attempt checkpointing
modular_graph_and_candidates.py  +52 -22
@@ -95,7 +95,6 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
                 out[(m1, m2)] = s
     return out
 
-
 @spaces.GPU
 def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
     model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
@@ -113,11 +112,10 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
 
     texts = {}
     for name in tqdm(missing, desc="Reading modeling files"):
-        # Skip models that cause GPU task aborts
         if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
             print(f"Skipping {name} (causes GPU abort)")
             continue
-
+
         code = ""
         for py in (models_root / name).rglob("modeling_*.py"):
             try:
@@ -130,29 +128,54 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     all_embeddings = []
 
     print(f"Encoding embeddings for {len(names)} models...")
-    batch_size = 4 #
-
-    for i in range(0, len(names), batch_size):
+    batch_size = 4 # keep your default
+
+    # ── checkpoint / resume ─────────────────────────────────────────────────
+    ckpt_path = models_root / "__emb_ckpt.npz"
+    start_idx = 0
+    emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
+
+    if ckpt_path.exists():
+        try:
+            ckpt = np.load(ckpt_path, allow_pickle=True)
+            ckpt_names = list(ckpt["names"])
+            if names[:len(ckpt_names)] == ckpt_names:
+                loaded = ckpt["embeddings"].astype(np.float32)
+                all_embeddings.append(loaded)
+                start_idx = len(ckpt_names)
+                print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
+        except Exception as e:
+            print(f"⚠️ Failed to load checkpoint: {type(e).__name__}: {e}")
+    # ────────────────────────────────────────────────────────────────────────
+
+    for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
         batch_names = names[i:i+batch_size]
         batch_texts = [texts[name] for name in batch_names]
-
+
         try:
             print(f"Processing batch: {batch_names}")
             emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
-            all_embeddings.append(emb)
-            print(f"✅ Completed batch of {len(batch_names)} models")
-
-            # Clear GPU cache every 3 batches to prevent memory accumulation
-            if i % (3 * batch_size) == 0 and torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()  # Force GPU sync
-                print(f"🧹 Cleared GPU cache after batch {i//batch_size + 1}")
-
         except Exception as e:
             print(f"⚠️ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
-
-
-
+            emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
+
+        all_embeddings.append(emb)
+
+        # save checkpoint after each batch
+        try:
+            cur = np.vstack(all_embeddings).astype(np.float32)
+            np.savez(
+                ckpt_path,
+                embeddings=cur,
+                names=np.array(names[:i+len(batch_names)], dtype=object),
+            )
+        except Exception as e:
+            print(f"⚠️ Failed to write checkpoint: {type(e).__name__}: {e}")
+
+        if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
+            print(f"🧹 Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")
 
     embeddings = np.vstack(all_embeddings).astype(np.float32)
     norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
@@ -162,19 +185,26 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
     sims_mat = embeddings @ embeddings.T
 
     out = {}
-    matrix_size = embeddings.shape[0]
-    processed_names = names[:matrix_size]
-
+    matrix_size = embeddings.shape[0]
+    processed_names = names[:matrix_size]
     for i in range(matrix_size):
         for j in range(i + 1, matrix_size):
             s = float(sims_mat[i, j])
             if s >= thr:
                 out[(processed_names[i], processed_names[j])] = s
+
+    # best-effort cleanup
+    try:
+        ckpt_path.unlink()
+    except Exception:
+        pass
+
     return out
 
 
 
+
 # ────────────────────────────────────────────────────────────────────────────────
 # 2) Scan *modular_*.py* files to build an import-dependency graph
 # → only **modeling_*** imports are considered (skip configuration / processing)
# β only **modeling_*** imports are considered (skip configuration / processing)
|
|
|
|
| 95 |
out[(m1, m2)] = s
|
| 96 |
return out
|
| 97 |
|
|
|
|
| 98 |
@spaces.GPU
|
| 99 |
def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
|
| 100 |
model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
|
|
|
|
| 112 |
|
| 113 |
texts = {}
|
| 114 |
for name in tqdm(missing, desc="Reading modeling files"):
|
|
|
|
| 115 |
if any(skip in name.lower() for skip in ["mobilebert", "lxmert"]):
|
| 116 |
print(f"Skipping {name} (causes GPU abort)")
|
| 117 |
continue
|
| 118 |
+
|
| 119 |
code = ""
|
| 120 |
for py in (models_root / name).rglob("modeling_*.py"):
|
| 121 |
try:
|
|
|
|
| 128 |
all_embeddings = []
|
| 129 |
|
| 130 |
print(f"Encoding embeddings for {len(names)} models...")
|
| 131 |
+
batch_size = 4 # keep your default
|
| 132 |
+
|
| 133 |
+
# ββ checkpoint / resume ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 134 |
+
ckpt_path = models_root / "__emb_ckpt.npz"
|
| 135 |
+
start_idx = 0
|
| 136 |
+
emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
|
| 137 |
+
|
| 138 |
+
if ckpt_path.exists():
|
| 139 |
+
try:
|
| 140 |
+
ckpt = np.load(ckpt_path, allow_pickle=True)
|
| 141 |
+
ckpt_names = list(ckpt["names"])
|
| 142 |
+
if names[:len(ckpt_names)] == ckpt_names:
|
| 143 |
+
loaded = ckpt["embeddings"].astype(np.float32)
|
| 144 |
+
all_embeddings.append(loaded)
|
| 145 |
+
start_idx = len(ckpt_names)
|
| 146 |
+
print(f"Resuming from checkpoint at {start_idx}/{len(names)}")
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"β οΈ Failed to load checkpoint: {type(e).__name__}: {e}")
|
| 149 |
+
# βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 150 |
+
|
| 151 |
+
for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
|
| 152 |
batch_names = names[i:i+batch_size]
|
| 153 |
batch_texts = [texts[name] for name in batch_names]
|
| 154 |
+
|
| 155 |
try:
|
| 156 |
print(f"Processing batch: {batch_names}")
|
| 157 |
emb = model.encode(batch_texts, convert_to_numpy=True, show_progress_bar=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
except Exception as e:
|
| 159 |
print(f"β οΈ GPU worker error for batch {batch_names}: {type(e).__name__}: {e}")
|
| 160 |
+
emb = np.zeros((len(batch_names), emb_dim), dtype=np.float32)
|
| 161 |
+
|
| 162 |
+
all_embeddings.append(emb)
|
| 163 |
+
|
| 164 |
+
# save checkpoint after each batch
|
| 165 |
+
try:
|
| 166 |
+
cur = np.vstack(all_embeddings).astype(np.float32)
|
| 167 |
+
np.savez(
|
| 168 |
+
ckpt_path,
|
| 169 |
+
embeddings=cur,
|
| 170 |
+
names=np.array(names[:i+len(batch_names)], dtype=object),
|
| 171 |
+
)
|
| 172 |
+
except Exception as e:
|
| 173 |
+
print(f"β οΈ Failed to write checkpoint: {type(e).__name__}: {e}")
|
| 174 |
+
|
| 175 |
+
if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
|
| 176 |
+
torch.cuda.empty_cache()
|
| 177 |
+
torch.cuda.synchronize()
|
| 178 |
+
print(f"π§Ή Cleared GPU cache after batch {(i - start_idx)//batch_size + 1}")
|
| 179 |
|
| 180 |
embeddings = np.vstack(all_embeddings).astype(np.float32)
|
| 181 |
norms = np.linalg.norm(embeddings, axis=1, keepdims=True) + 1e-12
|
|
|
|
| 185 |
sims_mat = embeddings @ embeddings.T
|
| 186 |
|
| 187 |
out = {}
|
| 188 |
+
matrix_size = embeddings.shape[0]
|
| 189 |
+
processed_names = names[:matrix_size]
|
|
|
|
| 190 |
for i in range(matrix_size):
|
| 191 |
for j in range(i + 1, matrix_size):
|
| 192 |
s = float(sims_mat[i, j])
|
| 193 |
if s >= thr:
|
| 194 |
out[(processed_names[i], processed_names[j])] = s
|
| 195 |
+
|
| 196 |
+
# best-effort cleanup
|
| 197 |
+
try:
|
| 198 |
+
ckpt_path.unlink()
|
| 199 |
+
except Exception:
|
| 200 |
+
pass
|
| 201 |
+
|
| 202 |
return out
|
| 203 |
|
| 204 |
|
| 205 |
|
| 206 |
|
| 207 |
+
|
| 208 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 209 |
# 2) Scan *modular_*.py* files to build an importβdependency graph
|
| 210 |
# β only **modeling_*** imports are considered (skip configuration / processing)
|