Spaces:

B-patents
/

patent-bert

Build error

App Files Files Community

danseith committed on Feb 25, 2023

Commit

ca69fee

1 Parent(s): e3a2d6f

Added dummy temp slider and output text box with new input.

Browse files

Files changed (1) hide show

app.py +75 -40

app.py CHANGED Viewed

@@ -1,15 +1,14 @@
 import gradio as gr
 import numpy as np
 import torch
-from transformers import pipeline, Pipeline
 from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
-from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM
-unmasker = pipeline("fill-mask", model="anferico/bert-for-patents")
 # unmasker = pipeline("temp-scale", model="anferico/bert-for-patents")
-example = 'A crustless [MASK] made from two slices of baked bread'
-example_dict = {}
-example_dict['input_ids'] = example
 def add_mask(text, size=1):
     split_text = text.split()
@@ -20,7 +19,49 @@ def add_mask(text, size=1):
 class TempScalePipe(FillMaskPipeline):
-    def postprocess(self, model_outputs, top_k=3, target_ids=None):
         # Cap top_k if there are targets
         if target_ids is not None and target_ids.shape[0] < top_k:
             top_k = target_ids.shape[0]
@@ -30,14 +71,16 @@ class TempScalePipe(FillMaskPipeline):
         masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
         # Fill mask pipeline supports only one ${mask_token} per sample
-        logits = outputs[0, masked_index, :] / 1e1
         probs = logits.softmax(dim=-1)
-        indices = torch.multinomial(probs, num_samples=3)
-        probs = probs[indices]
         if target_ids is not None:
             probs = probs[..., target_ids]
-        values, predictions = probs.topk(top_k)
         result = []
         single_mask = values.shape[0] == 1
@@ -69,41 +112,33 @@ PIPELINE_REGISTRY.register_pipeline(
     pipeline_class=TempScalePipe,
     pt_model=AutoModelForMaskedLM,
 )
-def unmask(text):
     # text = add_mask(text)
-    res = unmasker(text)
     out = {item["token_str"]: item["score"] for item in res}
-    return out
 textbox = gr.Textbox(label="Type language here", lines=5)
-# import gradio as gr
-from transformers import pipeline, Pipeline
-# unmasker = pipeline("fill-mask", model="anferico/bert-for-patents")
-#
-#
-#
-#
-# def unmask(text):
-#     text = add_mask(text)
-#     res = unmasker(text)
-#     out = {item["token_str"]: item["score"] for item in res}
-#     return out
-#
-#
-# textbox = gr.Textbox(label="Type language here", lines=5)
-#
 demo = gr.Interface(
     fn=unmask,
-    inputs=textbox,
-    outputs="label",
-    examples=[example],
 )
 demo.launch()

 import gradio as gr
 import numpy as np
 import torch
+from transformers import pipeline
 from transformers.pipelines import PIPELINE_REGISTRY, FillMaskPipeline
+from transformers import AutoModelForMaskedLM
 # unmasker = pipeline("temp-scale", model="anferico/bert-for-patents")
# Sample patent-style sentences, each containing a single [MASK] token.
# Previously these were three successive assignments to ``example``, so the
# first two were dead stores; they are kept here as selectable alternatives.
EXAMPLE_SENTENCES = (
    'A crustless [MASK] made from two slices of baked bread.',
    'The invention provides a method for altering or modifying [MASK] of one or more gene products.',
    'The graphite [MASK] is composed of a two-dimensional hexagonal lattice of carbon atoms.',
)
# The last sentence is the one bound to ``example`` (same value as before).
example = EXAMPLE_SENTENCES[-1]
 def add_mask(text, size=1):
     split_text = text.split()
 class TempScalePipe(FillMaskPipeline):
+    def _sanitize_parameters(self, top_k=None, targets=None, temp=None):
+        postprocess_params = {}
+        if targets is not None:
+            target_ids = self.get_target_ids(targets, top_k)
+            postprocess_params["target_ids"] = target_ids
+        if top_k is not None:
+            postprocess_params["top_k"] = top_k
+        if temp is not None:
+            postprocess_params["temp"] = temp
+        return {}, {}, postprocess_params
+    def __call__(self, inputs, *args, **kwargs):
+        """
+        Fill the masked token in the text(s) given as inputs.
+        Args:
+            args (`str` or `List[str]`):
+                One or several texts (or one list of prompts) with masked tokens.
+            targets (`str` or `List[str]`, *optional*):
+                When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+                vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
+                resulting token will be used (with a warning, and that might be slower).
+            top_k (`int`, *optional*):
+                When passed, overrides the number of predictions to return.
+        Return:
+            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
+            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
+            - **score** (`float`) -- The corresponding probability.
+            - **token** (`int`) -- The predicted token id (to replace the masked one).
+            - **token** (`str`) -- The predicted token (to replace the masked one).
+        """
+        outputs = super().__call__(inputs, **kwargs)
+        if isinstance(inputs, list) and len(inputs) == 1:
+            return outputs[0]
+        return outputs
+    def postprocess(self, model_outputs, top_k=10, target_ids=None, temp=1):
         # Cap top_k if there are targets
         if target_ids is not None and target_ids.shape[0] < top_k:
             top_k = target_ids.shape[0]
         masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
         # Fill mask pipeline supports only one ${mask_token} per sample
+        logits = outputs[0, masked_index, :] / 1.2
         probs = logits.softmax(dim=-1)
+        sampling = False
+        if sampling:
+            predictions = torch.multinomial(probs, num_samples=3)
+            values = probs[0, predictions]
         if target_ids is not None:
             probs = probs[..., target_ids]
+        if not sampling:
+            values, predictions = probs.topk(top_k)
         result = []
         single_mask = values.shape[0] == 1
     pipeline_class=TempScalePipe,
     pt_model=AutoModelForMaskedLM,
 )
+scrambler = pipeline("temp-scale", model="anferico/bert-for-patents")
def unmask(text, temp):
    """Fill the masked word in ``text`` and sample one completion.

    Args:
        text: Input sentence containing a mask token (the bundled examples
            use ``[MASK]``).
        temp: Value of the "Creativity" slider. Accepted so the signature
            matches the Gradio inputs; it is not used by this function yet.

    Returns:
        A ``(scores, completed_text)`` tuple. ``scores`` maps each candidate
        token string to its score. ``completed_text`` is ``text`` with runs of
        whitespace collapsed to single spaces and the first word containing
        ``MASK`` filled with a candidate drawn at random in proportion to its
        score.
    """
    # text = add_mask(text)
    words = text.split()
    res = scrambler(text)
    out = {item["token_str"]: item["score"] for item in res}

    mask_pos = next((i for i, word in enumerate(words) if 'MASK' in word), None)
    if mask_pos is None or not out:
        # Nothing to substitute; hand back the scores with the text unchanged.
        return out, ' '.join(words)

    # Sample over (token, score) pairs directly. Inverting the dict to
    # score -> token would silently drop candidates that share a score.
    tokens = list(out)
    weights = np.asarray([out[token] for token in tokens], dtype=float)
    total = weights.sum()
    if total > 0:
        # The top-k scores generally do not sum to 1. Renormalise them:
        # np.random.multinomial would otherwise assign all the missing
        # probability mass to the last (least likely) candidate.
        idx = int(np.random.choice(len(tokens), p=weights / total))
    else:
        idx = 0
    new_token = tokens[idx]

    masked_word = words[mask_pos]
    if '[MASK]' in masked_word:
        # Swap only the marker so attached punctuation ("[MASK].") survives.
        words[mask_pos] = masked_word.replace('[MASK]', new_token)
    else:
        words[mask_pos] = new_token
    return out, ' '.join(words)
 textbox = gr.Textbox(label="Type language here", lines=5)
+textbox2 = gr.Textbox(placeholder="Type here...", lines=4)
+temp_slider = gr.Slider(1.0, 1.5, value=1.0, label='Creativity')
 demo = gr.Interface(
     fn=unmask,
+    inputs=[textbox, temp_slider],
+    outputs=["label", textbox2],
+    examples=[[example, 1.2]],
 )
 demo.launch()