update
Browse files- modular_graph_and_candidates.py +69 -16
modular_graph_and_candidates.py
CHANGED
|
@@ -130,22 +130,35 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
| 130 |
print(f"Encoding embeddings for {len(names)} models...")
|
| 131 |
batch_size = 4 # keep your default
|
| 132 |
|
| 133 |
-
# ──
|
| 134 |
-
|
|
|
|
| 135 |
start_idx = 0
|
| 136 |
emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
|
| 137 |
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
try:
|
| 140 |
-
cached = np.load(
|
| 141 |
cached_names = list(cached["names"])
|
| 142 |
if names[:len(cached_names)] == cached_names:
|
| 143 |
loaded = cached["embeddings"].astype(np.float32)
|
| 144 |
all_embeddings.append(loaded)
|
| 145 |
start_idx = len(cached_names)
|
| 146 |
-
print(f"
|
| 147 |
except Exception as e:
|
| 148 |
-
print(f"⚠️ Failed to load
|
| 149 |
# ───────────────────────────────────────────────────────────────────────────
|
| 150 |
|
| 151 |
for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
|
|
@@ -161,16 +174,16 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
| 161 |
|
| 162 |
all_embeddings.append(emb)
|
| 163 |
|
| 164 |
-
# save to
|
| 165 |
try:
|
| 166 |
cur = np.vstack(all_embeddings).astype(np.float32)
|
| 167 |
np.savez(
|
| 168 |
-
|
| 169 |
embeddings=cur,
|
| 170 |
names=np.array(names[:i+len(batch_names)], dtype=object),
|
| 171 |
)
|
| 172 |
except Exception as e:
|
| 173 |
-
print(f"⚠️ Failed to write
|
| 174 |
|
| 175 |
if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
|
| 176 |
torch.cuda.empty_cache()
|
|
@@ -193,7 +206,17 @@ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: fl
|
|
| 193 |
if s >= thr:
|
| 194 |
out[(processed_names[i], processed_names[j])] = s
|
| 195 |
|
| 196 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
return out
|
| 198 |
|
| 199 |
def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
|
|
@@ -326,20 +349,50 @@ def build_graph_json(
|
|
| 326 |
print(f"π Got {len(cached_sims)} cached similarities")
|
| 327 |
|
| 328 |
if cached_sims:
|
| 329 |
-
# Create
|
| 330 |
cached_data = np.load(embeddings_cache, allow_pickle=True)
|
| 331 |
missing = list(cached_data["names"])
|
| 332 |
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
| 336 |
|
|
|
|
|
|
|
| 337 |
links = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 338 |
for (a, b), s in cached_sims.items():
|
| 339 |
links.append({"source": a, "target": b, "label": f"{s*100:.1f}%", "cand": True})
|
| 340 |
|
| 341 |
-
|
| 342 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 343 |
except Exception as e:
|
| 344 |
print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
|
| 345 |
|
|
|
|
| 130 |
print(f"Encoding embeddings for {len(names)} models...")
|
| 131 |
batch_size = 4 # keep your default
|
| 132 |
|
| 133 |
+
# ── two-stage caching: temp (for resume) + permanent (for reuse) ─────────────
|
| 134 |
+
temp_cache_path = Path("temp_embeddings.npz") # For resuming computation
|
| 135 |
+
final_cache_path = Path("embeddings_cache.npz") # For permanent storage
|
| 136 |
start_idx = 0
|
| 137 |
emb_dim = getattr(model, "get_sentence_embedding_dimension", lambda: 768)()
|
| 138 |
|
| 139 |
+
# Try to load from permanent cache first
|
| 140 |
+
if final_cache_path.exists():
|
| 141 |
+
try:
|
| 142 |
+
cached = np.load(final_cache_path, allow_pickle=True)
|
| 143 |
+
cached_names = list(cached["names"])
|
| 144 |
+
if names == cached_names: # Exact match - use final cache
|
| 145 |
+
print(f"✅ Using final embeddings cache ({len(cached_names)} models)")
|
| 146 |
+
return compute_similarities_from_cache(thr)
|
| 147 |
+
except Exception as e:
|
| 148 |
+
print(f"⚠️ Failed to load final cache: {e}")
|
| 149 |
+
|
| 150 |
+
# Try to resume from temp cache
|
| 151 |
+
if temp_cache_path.exists():
|
| 152 |
try:
|
| 153 |
+
cached = np.load(temp_cache_path, allow_pickle=True)
|
| 154 |
cached_names = list(cached["names"])
|
| 155 |
if names[:len(cached_names)] == cached_names:
|
| 156 |
loaded = cached["embeddings"].astype(np.float32)
|
| 157 |
all_embeddings.append(loaded)
|
| 158 |
start_idx = len(cached_names)
|
| 159 |
+
print(f"π Resuming from temp cache: {start_idx}/{len(names)} models")
|
| 160 |
except Exception as e:
|
| 161 |
+
print(f"⚠️ Failed to load temp cache: {e}")
|
| 162 |
# ───────────────────────────────────────────────────────────────────────────
|
| 163 |
|
| 164 |
for i in tqdm(range(start_idx, len(names), batch_size), desc="Batches", leave=False):
|
|
|
|
| 174 |
|
| 175 |
all_embeddings.append(emb)
|
| 176 |
|
| 177 |
+
# save to temp cache after each batch (for resume)
|
| 178 |
try:
|
| 179 |
cur = np.vstack(all_embeddings).astype(np.float32)
|
| 180 |
np.savez(
|
| 181 |
+
temp_cache_path,
|
| 182 |
embeddings=cur,
|
| 183 |
names=np.array(names[:i+len(batch_names)], dtype=object),
|
| 184 |
)
|
| 185 |
except Exception as e:
|
| 186 |
+
print(f"⚠️ Failed to write temp cache: {e}")
|
| 187 |
|
| 188 |
if (i - start_idx) % (3 * batch_size) == 0 and torch.cuda.is_available():
|
| 189 |
torch.cuda.empty_cache()
|
|
|
|
| 206 |
if s >= thr:
|
| 207 |
out[(processed_names[i], processed_names[j])] = s
|
| 208 |
|
| 209 |
+
# Save to final cache when complete
|
| 210 |
+
try:
|
| 211 |
+
np.savez(final_cache_path, embeddings=embeddings, names=np.array(names, dtype=object))
|
| 212 |
+
print(f"💾 Final embeddings saved to {final_cache_path}")
|
| 213 |
+
# Clean up temp cache
|
| 214 |
+
if temp_cache_path.exists():
|
| 215 |
+
temp_cache_path.unlink()
|
| 216 |
+
print(f"🧹 Cleaned up temp cache")
|
| 217 |
+
except Exception as e:
|
| 218 |
+
print(f"⚠️ Failed to save final cache: {e}")
|
| 219 |
+
|
| 220 |
return out
|
| 221 |
|
| 222 |
def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], float]:
|
|
|
|
| 349 |
print(f"π Got {len(cached_sims)} cached similarities")
|
| 350 |
|
| 351 |
if cached_sims:
|
| 352 |
+
# Create graph with cached similarities + modular dependencies
|
| 353 |
cached_data = np.load(embeddings_cache, allow_pickle=True)
|
| 354 |
missing = list(cached_data["names"])
|
| 355 |
|
| 356 |
+
# Still need to get modular dependencies from repo
|
| 357 |
+
models_root = transformers_dir / "src/transformers/models"
|
| 358 |
+
mod_files = modular_files(models_root)
|
| 359 |
+
deps = dependency_graph(mod_files, models_root)
|
| 360 |
|
| 361 |
+
# Build full graph structure
|
| 362 |
+
nodes = set(missing) # Start with cached models
|
| 363 |
links = []
|
| 364 |
+
|
| 365 |
+
# Add dependency links
|
| 366 |
+
for drv, lst in deps.items():
|
| 367 |
+
for d in lst:
|
| 368 |
+
links.append({
|
| 369 |
+
"source": d["source"],
|
| 370 |
+
"target": drv,
|
| 371 |
+
"label": f"{sum(1 for x in lst if x['source'] == d['source'])} imports",
|
| 372 |
+
"cand": False
|
| 373 |
+
})
|
| 374 |
+
nodes.update({d["source"], drv})
|
| 375 |
+
|
| 376 |
+
# Add similarity links
|
| 377 |
for (a, b), s in cached_sims.items():
|
| 378 |
links.append({"source": a, "target": b, "label": f"{s*100:.1f}%", "cand": True})
|
| 379 |
|
| 380 |
+
# Create node list with proper classification
|
| 381 |
+
targets = {lk["target"] for lk in links if not lk["cand"]}
|
| 382 |
+
sources = {lk["source"] for lk in links if not lk["cand"]}
|
| 383 |
+
|
| 384 |
+
nodelist = []
|
| 385 |
+
for n in sorted(nodes):
|
| 386 |
+
if n in missing and n not in sources and n not in targets:
|
| 387 |
+
cls = "cand"
|
| 388 |
+
elif n in sources and n not in targets:
|
| 389 |
+
cls = "base"
|
| 390 |
+
else:
|
| 391 |
+
cls = "derived"
|
| 392 |
+
nodelist.append({"id": n, "cls": cls, "sz": 1})
|
| 393 |
+
|
| 394 |
+
print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
|
| 395 |
+
return {"nodes": nodelist, "links": links}
|
| 396 |
except Exception as e:
|
| 397 |
print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
|
| 398 |
|