setup cache
Browse files- app.py +44 -11
- build_cache.py +107 -0
- modular_graph_and_candidates.py +2 -33
app.py
CHANGED
|
@@ -9,12 +9,47 @@ import tempfile
|
|
| 9 |
from datetime import datetime, timedelta
|
| 10 |
from functools import lru_cache
|
| 11 |
from pathlib import Path
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
import gradio as gr
|
| 14 |
|
| 15 |
# —— refactored helpers ——
|
| 16 |
from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
|
| 19 |
|
| 20 |
# ───────────────────────────── cache repo once per 24 h ───────────────────────────
|
|
@@ -40,19 +75,13 @@ def clone_or_cache(repo_url: str) -> Path:
|
|
| 40 |
|
| 41 |
# ───────────────────────────── main callback ─────────────────────────────────────
|
| 42 |
|
| 43 |
-
def _escape_srcdoc(text: str) -> str:
|
| 44 |
-
"""Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
|
| 45 |
-
return (
|
| 46 |
-
text.replace("&", "&")
|
| 47 |
-
.replace("\"", """)
|
| 48 |
-
.replace("'", "'")
|
| 49 |
-
.replace("<", "<")
|
| 50 |
-
.replace(">", ">")
|
| 51 |
-
)
|
| 52 |
-
|
| 53 |
|
| 54 |
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
|
| 55 |
"""Generate the dependency graph visualization."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
repo_path = clone_or_cache(repo_url)
|
| 57 |
|
| 58 |
graph = build_graph_json(
|
|
@@ -75,6 +104,10 @@ def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str
|
|
| 75 |
|
| 76 |
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
|
| 77 |
"""Generate the chronological timeline visualization."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
repo_path = clone_or_cache(repo_url)
|
| 79 |
|
| 80 |
timeline = build_timeline_json(
|
|
|
|
| 9 |
from datetime import datetime, timedelta
|
| 10 |
from functools import lru_cache
|
| 11 |
from pathlib import Path
|
| 12 |
+
import os, json, tempfile
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from huggingface_hub import hf_hub_download
|
| 15 |
import gradio as gr
|
| 16 |
|
| 17 |
# —— refactored helpers ——
|
| 18 |
from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
|
| 19 |
|
| 20 |
+
def _escape_srcdoc(text: str) -> str:
|
| 21 |
+
"""Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
|
| 22 |
+
return (
|
| 23 |
+
text.replace("&", "&")
|
| 24 |
+
.replace("\"", """)
|
| 25 |
+
.replace("'", "'")
|
| 26 |
+
.replace("<", "<")
|
| 27 |
+
.replace(">", ">")
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimodal: bool):
    """Fetch a pre-rendered visualization from the Molbap/hf_cached_embeds_log repo.

    Reads ``latest.json`` from the cache repo to obtain the cached commit
    SHA, then downloads ``{kind}/{sha}/{sim_method}-{threshold:.2f}-m{0|1}``
    with both the ``.html`` and ``.json`` extensions.

    Args:
        kind: Top-level folder in the cache repo; callers in this file pass
            ``"graph"`` or ``"timeline"``.
        sim_method: Similarity method name, part of the cache key.
        threshold: Similarity threshold; formatted with two decimals in the key.
        multimodal: Multimodal flag; encoded as ``m0`` / ``m1`` in the key.

    Returns:
        ``(iframe_html, json_path)`` on a cache hit, where ``iframe_html`` is
        an ``<iframe>`` embedding the cached page through ``srcdoc`` and
        ``json_path`` is the path of a temporary copy of the cached JSON.
        ``None`` when anything goes wrong (missing entry, network error, ...),
        so the caller can fall back to computing the result.
    """
    repo_id = "Molbap/hf_cached_embeds_log"
    try:
        latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json")
        info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
        sha = info.get("sha")
        if not sha:
            # Without a SHA the key would be "None/..."; skip the two
            # downloads that could only fail.
            return None
        key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"

        html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html")
        json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json")

        raw_html = Path(html_fp).read_text(encoding="utf-8")
        json_text = Path(json_fp).read_text(encoding="utf-8")

        iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'

        suffix = "_timeline.json" if kind == "timeline" else ".json"
        # mkstemp returns an already-open OS-level descriptor; write through
        # it and close it instead of discarding it, which leaked one fd per
        # cache hit.
        fd, tmp_path = tempfile.mkstemp(suffix=suffix)
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.write(json_text)
        return iframe_html, tmp_path
    except Exception:
        # Deliberate best-effort: any failure is treated as a cache miss.
        return None
|
| 52 |
+
|
| 53 |
HF_MAIN_REPO = "https://github.com/huggingface/transformers"
|
| 54 |
|
| 55 |
# ───────────────────────────── cache repo once per 24 h ───────────────────────────
|
|
|
|
| 75 |
|
| 76 |
# ───────────────────────────── main callback ─────────────────────────────────────
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
def run_graph(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
|
| 80 |
"""Generate the dependency graph visualization."""
|
| 81 |
+
hit = _fetch_from_cache_repo("graph", sim_method, threshold, multimodal)
|
| 82 |
+
if hit:
|
| 83 |
+
return hit
|
| 84 |
+
|
| 85 |
repo_path = clone_or_cache(repo_url)
|
| 86 |
|
| 87 |
graph = build_graph_json(
|
|
|
|
| 104 |
|
| 105 |
def run_timeline(repo_url: str, threshold: float, multimodal: bool, sim_method: str):
|
| 106 |
"""Generate the chronological timeline visualization."""
|
| 107 |
+
hit = _fetch_from_cache_repo("timeline", sim_method, threshold, multimodal)
|
| 108 |
+
if hit:
|
| 109 |
+
return hit
|
| 110 |
+
|
| 111 |
repo_path = clone_or_cache(repo_url)
|
| 112 |
|
| 113 |
timeline = build_timeline_json(
|
build_cache.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
import subprocess
|
| 4 |
+
import tempfile
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from datetime import datetime, timezone
|
| 7 |
+
from huggingface_hub import HfApi
|
| 8 |
+
|
| 9 |
+
from modular_graph_and_candidates import (
|
| 10 |
+
build_graph_json,
|
| 11 |
+
generate_html,
|
| 12 |
+
build_timeline_json,
|
| 13 |
+
generate_timeline_html
|
| 14 |
+
)
|
| 15 |
+
|
| 16 |
+
# Build configuration, overridable through environment variables.
# Repository to clone and analyse.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hub repo that receives the generated artifacts.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Similarity threshold passed to the graph/timeline builders.
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
# Truthy spellings are matched case-insensitively so "TRUE" or "Yes" are
# not silently read as false.
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}
# Similarity method name passed to the builders.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
|
| 21 |
+
|
| 22 |
+
def main():
    """Build the graph and timeline artifacts for REPO_URL and upload them to CACHE_REPO.

    Steps:
      1. Shallow-clone REPO_URL into a temporary directory and read its HEAD SHA.
      2. Build the graph and timeline JSON and render both to HTML.
      3. Upload the four artifacts under ``{graph,timeline}/{sha}/{method}-{threshold}-m{0|1}``
         followed by ``latest.json``, which records the SHA and artifact paths.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` or ``git rev-parse`` fails.
    """
    print(f"Building cache for {REPO_URL}")
    print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")

    # TemporaryDirectory removes the clone on exit, including on failure;
    # a bare mkdtemp() left one checkout behind per run.
    with tempfile.TemporaryDirectory() as workdir:
        tmp = Path(workdir)
        repo_path = tmp / "repo"
        print(f"Working in {tmp}")

        print("Cloning repository...")
        subprocess.check_call([
            "git", "clone", "--depth", "1", REPO_URL, str(repo_path)
        ])

        sha = subprocess.check_output([
            "git", "rev-parse", "HEAD"
        ], cwd=repo_path, text=True).strip()

        print(f"Repository SHA: {sha}")

        print("Building graph...")
        graph = build_graph_json(
            transformers_dir=repo_path,
            threshold=THRESH,
            multimodal=MULTIMODAL,
            sim_method=SIM_METHOD,
        )

        print("Building timeline...")
        timeline = build_timeline_json(
            transformers_dir=repo_path,
            threshold=THRESH,
            multimodal=MULTIMODAL,
            sim_method=SIM_METHOD,
        )

        print("Generating HTML...")
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    print(f"Uploading to {CACHE_REPO}...")

    api = HfApi()

    # Same key layout the app uses when it looks entries up.
    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"

    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {
            "sim_method": SIM_METHOD,
            "threshold": THRESH,
            "multimodal": MULTIMODAL,
        },
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
        },
    }

    # latest.json is listed last so it is uploaded only after the artifacts
    # it points to.
    files_to_upload = [
        (f"graph/{key}.json", json.dumps(graph, separators=(',', ':'))),
        (f"graph/{key}.html", graph_html),
        (f"timeline/{key}.json", json.dumps(timeline, separators=(',', ':'))),
        (f"timeline/{key}.html", timeline_html),
        ("latest.json", json.dumps(latest, separators=(',', ':'))),
    ]

    for path_in_repo, content in files_to_upload:
        api.upload_file(
            # upload_file accepts raw bytes, so no scratch file is needed.
            path_or_fileobj=content.encode("utf-8"),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            commit_message=f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}",
        )
        print(f"Uploaded {path_in_repo}")

    print(f"Successfully uploaded cache for {key}")


if __name__ == "__main__":
    main()
|
modular_graph_and_candidates.py
CHANGED
|
@@ -685,7 +685,6 @@ function updateVisibility() {
|
|
| 685 |
}
|
| 686 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
| 687 |
|
| 688 |
-
const HF_LOGO_URI = "./static/hf-logo.png";
|
| 689 |
const graph = __GRAPH_DATA__;
|
| 690 |
const W = innerWidth, H = innerHeight;
|
| 691 |
const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
|
|
@@ -709,23 +708,7 @@ const node = g.selectAll('g.node')
|
|
| 709 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
| 710 |
|
| 711 |
const baseSel = node.filter(d => d.cls === 'base');
|
| 712 |
-
|
| 713 |
-
baseSel.append('image')
|
| 714 |
-
.attr('href', HF_LOGO_URI)
|
| 715 |
-
.attr('width', 40)
|
| 716 |
-
.attr('height', 40)
|
| 717 |
-
.attr('x', -20)
|
| 718 |
-
.attr('y', -20)
|
| 719 |
-
.on('error', function() {
|
| 720 |
-
console.log('Image failed to load:', HF_LOGO_URI);
|
| 721 |
-
// Fallback to circle
|
| 722 |
-
d3.select(this.parentNode).append('circle')
|
| 723 |
-
.attr('r', 22).attr('fill', '#ffbe0b');
|
| 724 |
-
});
|
| 725 |
-
console.log('Loading logo from:', HF_LOGO_URI);
|
| 726 |
-
}else{
|
| 727 |
-
baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
|
| 728 |
-
}
|
| 729 |
node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
|
| 730 |
|
| 731 |
node.append('text')
|
|
@@ -922,7 +905,6 @@ function updateVisibility() {
|
|
| 922 |
}
|
| 923 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
| 924 |
|
| 925 |
-
const HF_LOGO_URI = "./static/hf-logo.png";
|
| 926 |
const timeline = __TIMELINE_DATA__;
|
| 927 |
const W = innerWidth, H = innerHeight;
|
| 928 |
|
|
@@ -1003,20 +985,7 @@ const node = g.selectAll('g.node')
|
|
| 1003 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
| 1004 |
|
| 1005 |
const baseSel = node.filter(d => d.cls === 'base');
|
| 1006 |
-
|
| 1007 |
-
baseSel.append('image')
|
| 1008 |
-
.attr('href', HF_LOGO_URI)
|
| 1009 |
-
.attr('width', 35)
|
| 1010 |
-
.attr('height', 35)
|
| 1011 |
-
.attr('x', -17.5)
|
| 1012 |
-
.attr('y', -17.5)
|
| 1013 |
-
.on('error', function() {
|
| 1014 |
-
d3.select(this.parentNode).append('circle')
|
| 1015 |
-
.attr('r', 20).attr('fill', '#ffbe0b');
|
| 1016 |
-
});
|
| 1017 |
-
} else {
|
| 1018 |
-
baseSel.append('circle').attr('r', 20).attr('fill', '#ffbe0b');
|
| 1019 |
-
}
|
| 1020 |
node.filter(d => d.cls !== 'base').append('circle').attr('r', 18);
|
| 1021 |
|
| 1022 |
node.append('text')
|
|
|
|
| 685 |
}
|
| 686 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
| 687 |
|
|
|
|
| 688 |
const graph = __GRAPH_DATA__;
|
| 689 |
const W = innerWidth, H = innerHeight;
|
| 690 |
const svg = d3.select('#dependency').call(d3.zoom().on('zoom', e => g.attr('transform', e.transform)));
|
|
|
|
| 708 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
| 709 |
|
| 710 |
const baseSel = node.filter(d => d.cls === 'base');
|
| 711 |
+
baseSel.append('circle').attr('r', d => 22*d.sz).attr('fill', '#ffbe0b');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 712 |
node.filter(d => d.cls !== 'base').append('circle').attr('r', d => 20*d.sz);
|
| 713 |
|
| 714 |
node.append('text')
|
|
|
|
| 905 |
}
|
| 906 |
document.getElementById('toggleRed').addEventListener('change', updateVisibility);
|
| 907 |
|
|
|
|
| 908 |
const timeline = __TIMELINE_DATA__;
|
| 909 |
const W = innerWidth, H = innerHeight;
|
| 910 |
|
|
|
|
| 985 |
.call(d3.drag().on('start', dragStart).on('drag', dragged).on('end', dragEnd));
|
| 986 |
|
| 987 |
const baseSel = node.filter(d => d.cls === 'base');
|
| 988 |
+
baseSel.append('circle').attr('r', 20).attr('fill', '#ffbe0b');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 989 |
node.filter(d => d.cls !== 'base').append('circle').attr('r', 18);
|
| 990 |
|
| 991 |
node.append('text')
|