Commit 0108eb5 by Ubuntu · 1 Parent(s): 29807ce

Commit initial quote caster ui

Files changed (5)
  1. Dockerfile +24 -0
  2. README.md +4 -13
  3. app.py +66 -0
  4. requirements.txt +5 -0
  5. space.yaml +1 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
+ FROM python:3.10-slim
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy project files
+ COPY . .
+
+ # Install Python packages
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Download Hugging Face model weights at build time (optional, to avoid slow startup)
+ RUN python -c "from transformers import AutoTokenizer, AutoModel; \
+     AutoTokenizer.from_pretrained('aNameNobodyChose/quote-caster-encoder'); \
+     AutoModel.from_pretrained('aNameNobodyChose/quote-caster-encoder')"
+
+ # Gradio default port
+ EXPOSE 7860
+
+ # Run the Gradio app
+ CMD ["python", "app.py"]
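The pre-download RUN step above bakes the encoder weights into the image so the container does not fetch them on first request. A minimal sketch of the same idea using huggingface_hub's snapshot_download (huggingface_hub ships as a transformers dependency; the repo id is taken from the Dockerfile above):

    from huggingface_hub import snapshot_download

    # Cache the whole model repo in the image's HF cache at build time
    # (same effect as the AutoTokenizer/AutoModel calls in the Dockerfile)
    snapshot_download(repo_id="aNameNobodyChose/quote-caster-encoder")

Either approach trades a larger image for faster cold starts.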
README.md CHANGED
@@ -1,14 +1,5 @@
- ---
- title: Quote Caster
- emoji: 👁
- colorFrom: pink
- colorTo: purple
- sdk: gradio
- sdk_version: 5.25.2
- app_file: app.py
- pinned: false
- license: mit
- short_description: Assigns dilogues to speakers in a story
- ---
+ # Build quote caster gradio image
+ docker build -t quote-caster-gradio .
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Run Quote caster
+ docker run --rm -it -p 7860:7860 quote-caster-gradio
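For reference, app.py below expects the pasted input to be a JSON list of objects with "context" and "quote" keys; a hypothetical example payload (the text values are made up for illustration):

    [
      {"context": "Anna turned to Ben.", "quote": "We should leave now."},
      {"context": "Ben shook his head.", "quote": "Not before the rain stops."}
    ]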
app.py ADDED
@@ -0,0 +1,66 @@
+ import gradio as gr
+ from transformers import AutoModel, AutoTokenizer
+ from sklearn.cluster import KMeans
+ from kneed import KneeLocator
+ import torch
+ import json
+
+ def encode_quote(context: str, dialogue: str, tokenizer, model) -> torch.Tensor:
+     """
+     Encode a single quote using the [CLS] token from BERT.
+     """
+     text = f"{context} [SEP] {dialogue}"
+     inputs = tokenizer(
+         text,
+         return_tensors="pt",
+         truncation=True,
+         padding=True,
+         max_length=512
+     )
+     outputs = model(**inputs)
+     cls_embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token
+     return cls_embedding.squeeze(0)
+
+ def load_encoder():
+     tokenizer = AutoTokenizer.from_pretrained("aNameNobodyChose/quote-caster-encoder")
+     model = AutoModel.from_pretrained("aNameNobodyChose/quote-caster-encoder")
+     model.eval()
+     return tokenizer, model
+
+ def embed_quotes(data, tokenizer, model):
+     embeddings = []
+     for ex in data:
+         emb = encode_quote(ex["context"], ex["quote"], tokenizer, model)
+         embeddings.append(emb)
+     return torch.stack(embeddings)
+
+ def auto_k_via_elbow(embeddings, max_k=10):
+     X = embeddings.detach().numpy()
+     inertias = []
+     for k in range(1, max_k + 1):
+         kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
+         kmeans.fit(X)
+         inertias.append(kmeans.inertia_)
+     knee = KneeLocator(range(1, max_k + 1), inertias, curve="convex", direction="decreasing")
+     return knee.knee or 2
+
+ def predict(input_text):
+     try:
+         data = json.loads(input_text)
+         tokenizer, model = load_encoder()
+         embeddings = embed_quotes(data, tokenizer, model)
+         k = auto_k_via_elbow(embeddings)
+         labels = KMeans(n_clusters=k).fit_predict(embeddings.detach().numpy())
+         for quote, cluster_id in zip(data, labels):
+             quote["predicted_speaker"] = f"SPEAKER_{cluster_id}"
+         return json.dumps(data, indent=2, ensure_ascii=False)
+     except Exception as e:
+         return f"❌ Error: {e}"
+
+ gr.Interface(
+     fn=predict,
+     inputs=gr.Textbox(lines=20, label="Paste quote-context JSON"),
+     outputs="textbox",
+     title="🗣️ QuoteCaster - Speaker Attribution from Quotes",
+     description="Paste a list of quotes with their context and get clustered speaker predictions using a transformer-based model."
+ ).launch(server_name="0.0.0.0", server_port=7860)
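The auto_k_via_elbow helper above picks the cluster count by locating the knee of the k-vs-inertia curve. A self-contained sketch of that heuristic on synthetic data (the blob locations and sizes are invented for illustration):

    import numpy as np
    from sklearn.cluster import KMeans
    from kneed import KneeLocator

    rng = np.random.default_rng(0)
    # Three well-separated 2-D blobs, so the knee should land at k = 3
    X = np.vstack([rng.normal(loc=c, scale=0.1, size=(20, 2)) for c in (0.0, 5.0, 10.0)])
    inertias = [
        KMeans(n_clusters=k, random_state=42, n_init="auto").fit(X).inertia_
        for k in range(1, 11)
    ]
    knee = KneeLocator(range(1, 11), inertias, curve="convex", direction="decreasing")
    print(knee.knee)  # expected: 3

The `or 2` fallback in auto_k_via_elbow then covers the case where KneeLocator finds no knee and returns None.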
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ torch
+ transformers==4.39.3
+ scikit-learn==1.3.2
+ kneed
space.yaml ADDED
@@ -0,0 +1 @@
+ sdk: docker