import gradio as gr
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForCausalLM

# Base PyTorch checkpoint; optimum exports it to ONNX at load time (this
# needs a recent optimum release with phi3 export support). The pre-exported
# "microsoft/Phi-3-mini-4k-instruct-ONNX" repo is packaged for
# onnxruntime-genai and is not laid out for optimum's loader.
model_name = "microsoft/Phi-3-mini-4k-instruct"

# `export=True` replaces the deprecated `from_transformers=True` flag.
# Dynamic int8 quantization is applied after loading (see the sketch below).
model = ORTModelForCausalLM.from_pretrained(model_name, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Phi-3's tokenizer may not define a pad token; fall back to EOS so that
# padded batches tokenize cleanly.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
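
# Dynamic int8 quantization with optimum's ORTQuantizer: a minimal sketch.
# The avx512_vnni preset assumes an x86 CPU with VNNI support (other presets
# such as arm64 exist), and "phi3-mini-int8" is an arbitrary local save path.
from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

dqconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=False)
quantizer = ORTQuantizer.from_pretrained(model)
quantizer.quantize(save_dir="phi3-mini-int8", quantization_config=dqconfig)

# Reload so the app serves the quantized weights.
model = ORTModelForCausalLM.from_pretrained("phi3-mini-int8")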


def generate_text(input_text):
    # Gradio passes the textbox contents as a single string.
    inputs = tokenizer(input_text, return_tensors="pt", padding=True,
                       truncation=True, max_length=32)
    outputs = model.generate(
        **inputs,
        max_new_tokens=8,   # very short completions; raise for fuller answers
        do_sample=False,    # greedy decoding, so no temperature is needed
        num_beams=1,        # no beam search, so early_stopping does not apply
    )
    # Gradio's "text" output expects a single string, not a list.
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
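

# Phi-3-mini is instruction-tuned, so raw prompts often answer better when
# wrapped in the model's chat template. A minimal sketch; the helper name
# generate_chat and its single-turn message format are illustrative, not
# part of the app above.
def generate_chat(input_text):
    messages = [{"role": "user", "content": input_text}]
    prompt_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    )
    outputs = model.generate(prompt_ids, max_new_tokens=8, do_sample=False)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]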


# gr.Textbox takes `lines`; any value > 1 renders a multi-line textbox.
iface = gr.Interface(fn=generate_text, inputs=gr.Textbox(lines=4), outputs="text")
iface.launch()