Spaces:

wilwork
/

X-encoder

Runtime error

App Files Files Community

wilwork committed on Jan 25

Commit

6700bfc

verified ·

1 Parent(s): 0f56dc9

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -42

app.py CHANGED Viewed

@@ -9,71 +9,89 @@ model = AutoModelForSequenceClassification.from_pretrained(model_name)
 model.eval()
 # Score one query/paragraph pair with the model and bold the paragraph tokens whose averaged last-layer attention exceeds a user-set threshold
-def get_relevance_score_and_excerpt(query, paragraph, threshold_weight):
-    if not query.strip() or not paragraph.strip():
-        return "Please provide both a query and a document paragraph.", ""
-    # Tokenize the query and paragraph together as one paired input
-    inputs = tokenizer(query, paragraph, return_tensors="pt", truncation=True, padding=True)
-    with torch.no_grad():
-        output = model(**inputs, output_attentions=True)
-    # Use the raw logit as the relevance score (no sigmoid applied)
-    logit = output.logits.squeeze().item()
-    base_relevance_score = logit  # Relevance score in logits
-    # Clamp the user-supplied threshold to at least 0.02 (the relevance score has no influence on it)
-    dynamic_threshold = max(0.02, threshold_weight)
-    # Take the last layer's attention and average over dim 1, then dim 0 (presumably heads, then batch; TODO confirm)
-    attention = output.attentions[-1]
-    attention_scores = attention.mean(dim=1).mean(dim=0)
-    query_tokens = tokenizer.tokenize(query)
-    paragraph_tokens = tokenizer.tokenize(paragraph)
-    query_len = len(query_tokens) + 2  # +2 for special tokens [CLS] and first [SEP] (assumes a BERT-style pair layout; TODO confirm)
-    para_start_idx = query_len
-    para_end_idx = len(inputs["input_ids"][0]) - 1  # stop before the final token (presumably the closing [SEP]; TODO confirm)
-    if para_end_idx <= para_start_idx:
-        return round(base_relevance_score, 4), "No relevant tokens extracted."
-    para_attention_scores = attention_scores[para_start_idx:para_end_idx, para_start_idx:para_end_idx].mean(dim=0)  # one averaged score per paragraph position
-    if para_attention_scores.numel() == 0:
-        return round(base_relevance_score, 4), "No relevant tokens extracted."
-    # Collect paragraph-relative positions whose averaged attention exceeds the threshold
-    relevant_indices = (para_attention_scores > dynamic_threshold).nonzero(as_tuple=True)[0].tolist()
-    # Rebuild the paragraph token by token, wrapping the selected tokens in <b> tags
-    highlighted_text = ""
-    for idx, token in enumerate(paragraph_tokens):
-        if idx in relevant_indices:
-            highlighted_text += f"<b>{token}</b> "
-        else:
-            highlighted_text += f"{token} "
-    highlighted_text = tokenizer.convert_tokens_to_string(highlighted_text.split())
-    return round(base_relevance_score, 4), highlighted_text
-# Define the Gradio interface: a query, one paragraph and a threshold slider as inputs; the score and highlighted HTML as outputs
 interface = gr.Interface(
     fn=get_relevance_score_and_excerpt,
     inputs=[
         gr.Textbox(label="Query", placeholder="Enter your search query..."),
-        gr.Textbox(label="Document Paragraph", placeholder="Enter a paragraph to match..."),
         gr.Slider(minimum=0.02, maximum=0.5, value=0.1, step=0.01, label="Attention Threshold")  # lower bound matches the 0.02 floor applied inside the function
     ],
     outputs=[
-        gr.Textbox(label="Relevance Score (Logits)"),
-        gr.HTML(label="Highlighted Document Paragraph")
     ],
-    title="Cross-Encoder Attention Highlighting",
-    description="Adjust the attention threshold to control token highlighting sensitivity.",
     allow_flagging="never",
     live=True
 )

 model.eval()
 # Function to compute relevance score (in logits) and dynamically adjust threshold
def get_relevance_score_and_excerpt(query, *paragraphs, threshold_weight=None):
    """Rank paragraphs against a query and highlight their high-attention tokens.

    The Gradio interface passes every input value positionally, so the slider
    value arrives as the last positional argument. With ``threshold_weight``
    as a required keyword-only parameter that call raised ``TypeError``; a
    trailing numeric positional is now accepted as the threshold, and passing
    ``threshold_weight=`` by keyword keeps working.

    Args:
        query: Search query text.
        *paragraphs: Candidate paragraph texts. Blank, ``None`` and non-string
            entries are ignored. When ``threshold_weight`` is not given by
            keyword, a trailing ``int``/``float`` is taken as the threshold.
        threshold_weight: Attention threshold for highlighting, clamped to a
            minimum of 0.02. Defaults to 0.1 when not supplied either way.

    Returns:
        A ``(scores, html)`` tuple of strings. ``scores`` has one
        ``"Relevance Score: <logit>"`` line per ranked paragraph (highest
        logit first); ``html`` has the matching paragraphs, separated by blank
        lines, with tokens above the threshold wrapped in ``<b>`` tags. If the
        query is blank or no paragraph has text, a prompt message and an
        empty string are returned instead.
    """
    import html  # local import: the file's import block is outside this view

    candidates = list(paragraphs)
    if threshold_weight is None:
        # Positional call style: the threshold is the trailing numeric value.
        if candidates and isinstance(candidates[-1], (int, float)):
            threshold_weight = candidates.pop()
        else:
            threshold_weight = 0.1

    query = query or ""
    # Optional paragraph boxes arrive empty; drop them before running the model.
    candidates = [p for p in candidates if isinstance(p, str) and p.strip()]
    if not query.strip() or not candidates:
        return "Please provide both a query and at least one document paragraph.", ""

    # Loop-invariant values: the clamped threshold and the paragraph offset.
    dynamic_threshold = max(0.02, threshold_weight)
    # +2 for special tokens [CLS] and first [SEP] (assumes a BERT-style pair
    # layout; TODO confirm for the tokenizer in use).
    para_start_idx = len(tokenizer.tokenize(query)) + 2

    ranked_paragraphs = []
    for paragraph in candidates:
        # Encode the query/paragraph pair as a single model input.
        inputs = tokenizer(query, paragraph, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            output = model(**inputs, output_attentions=True)
        # Raw logit is the relevance score (no sigmoid applied).
        logit = output.logits.squeeze().item()
        # Last-layer attention averaged over dim 1, then dim 0.
        attention_scores = output.attentions[-1].mean(dim=1).mean(dim=0)
        # Stop before the final token of the encoded pair.
        para_end_idx = len(inputs["input_ids"][0]) - 1
        if para_end_idx <= para_start_idx:
            continue
        para_attention_scores = attention_scores[
            para_start_idx:para_end_idx, para_start_idx:para_end_idx
        ].mean(dim=0)
        if para_attention_scores.numel() == 0:
            continue
        # Paragraph-relative positions whose averaged attention beats the threshold.
        relevant_indices = set(
            (para_attention_scores > dynamic_threshold).nonzero(as_tuple=True)[0].tolist()
        )
        # Escape token text so user input cannot inject markup into the HTML
        # output; only the <b> wrappers added here are real tags.
        marked_tokens = []
        for idx, token in enumerate(tokenizer.tokenize(paragraph)):
            safe_token = html.escape(token)
            marked_tokens.append(f"<b>{safe_token}</b>" if idx in relevant_indices else safe_token)
        ranked_paragraphs.append({
            "logit": logit,
            "highlighted_text": tokenizer.convert_tokens_to_string(marked_tokens),
        })

    # Highest logit first.
    ranked_paragraphs.sort(key=lambda entry: entry["logit"], reverse=True)
    score_lines = [f"Relevance Score: {round(entry['logit'], 4)}" for entry in ranked_paragraphs]
    highlighted_texts = [entry["highlighted_text"] for entry in ranked_paragraphs]
    return "\n".join(score_lines), "\n\n".join(highlighted_texts)
+# Define the Gradio interface: a query, up to three paragraphs and a threshold slider as inputs; ranked scores and highlighted HTML as outputs
 interface = gr.Interface(
     fn=get_relevance_score_and_excerpt,
     inputs=[  # values are handed to fn positionally in this order: query, paragraphs 1-3, threshold
         gr.Textbox(label="Query", placeholder="Enter your search query..."),
+        gr.Textbox(label="Document Paragraph 1", placeholder="Enter a paragraph to match...", lines=4),
+        gr.Textbox(label="Document Paragraph 2 (optional)", placeholder="Enter another paragraph...", lines=4),
+        gr.Textbox(label="Document Paragraph 3 (optional)", placeholder="Enter another paragraph...", lines=4),
         gr.Slider(minimum=0.02, maximum=0.5, value=0.1, step=0.01, label="Attention Threshold")  # lower bound matches the 0.02 floor applied inside the function
     ],
     outputs=[
+        gr.Textbox(label="Relevance Scores (Logits)"),
+        gr.HTML(label="Highlighted Document Paragraphs")
     ],
+    title="Cross-Encoder Attention Highlighting with Reranking",
+    description="Adjust the attention threshold to control token highlighting sensitivity. Multiple paragraphs can be added and reranked based on their logits.",
     allow_flagging="never",
     live=True
 )