import evaluate
import numpy as np
from datasets import load_dataset
from fastapi import FastAPI
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
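
# Assumed dependency set for the imports above (nothing is pinned in this file):
#   pip install fastapi uvicorn datasets transformers evaluate rouge_score numpy torch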

app = FastAPI()


@app.get("/")
def summarize():
    # Example: loading a dataset as part of the API
    billsum = load_dataset("billsum", split="ca_test")
    # import pandas as pd
    # df = pd.read_csv("squad_sample_train.tsv", sep="\t")
    # print(df.head())  # Debugging step
    # return {"Hello": "World!", "dataset_length": len(billsum)}
    # return df.head()
    checkpoint = "google-t5/t5-small"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # T5 is a text-to-text model; the task prefix tells it which task to run.
    prefix = "summarize: "

    def preprocess_function(examples):
        inputs = [prefix + doc for doc in examples["text"]]
        model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
        labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_billsum = billsum.map(preprocess_function, batched=True)

    # Dynamically pads inputs and labels to the longest sequence in each batch.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
    rouge = evaluate.load("rouge")

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        # Replace -100 (positions ignored by the loss) with the pad token id
        # so the labels can be decoded.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
        result["gen_len"] = np.mean(prediction_lens)
        return {k: round(v, 4) for k, v in result.items()}
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

    # Debug return while the endpoint is under construction; a collator object
    # is not a meaningful JSON response.
    return data_collator
    # return type(tokenized_billsum)
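

# The imports above pull in Seq2SeqTrainingArguments and Seq2SeqTrainer, but
# nothing here ever launches training. A minimal sketch of that missing step,
# kept in its own function so it does not run at import time; the output_dir,
# split size, and hyperparameters are assumptions, not values taken from this
# app.
def train_summarizer(model, tokenizer, tokenized_billsum, data_collator, compute_metrics):
    # Hold out 20% of the tokenized examples for evaluation (assumed split).
    splits = tokenized_billsum.train_test_split(test_size=0.2)
    training_args = Seq2SeqTrainingArguments(
        output_dir="billsum_t5_small",  # assumed directory name
        eval_strategy="epoch",  # "evaluation_strategy" on older transformers
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=4,
        predict_with_generate=True,  # so compute_metrics receives generated ids
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()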
| """from fastapi import FastAPI | |
| from datasets import load_dataset | |
| from transformers import AutoTokenizer | |
| app = FastAPI() | |
| #@app.get("/") | |
| # Load dataset and tokenizer | |
| billsum = load_dataset("billsum", split="ca_test") # Load a small sample | |
| tokenizer = AutoTokenizer.from_pretrained("t5-small") | |
| prefix = "summarize: " # Example prefix for text generation | |
| @app.get("/") | |
| def preprocess_function(examples): | |
| inputs = [prefix + doc for doc in examples["text"]] | |
| model_inputs = tokenizer(inputs, max_length=1024, truncation=True) | |
| labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True) | |
| model_inputs["labels"] = labels["input_ids"] | |
| return model_inputs | |
| #@app.get("/") | |
| def get_tokenized_data(): | |
| tokenized_billsum = billsum.map(preprocess_function, batched=True) | |
| # Convert to list of dictionaries | |
| json_serializable_output = tokenized_billsum.to_pandas().to_dict(orient="records") | |
| return {"tokenized_data": json_serializable_output} # Ensure JSON format""" | |