"""Gradio app that fine-tunes a SpanMarker NER model from a Label Studio JSONL export."""

import json
import os
import re

import gradio as gr
from datasets import Dataset
from sklearn.model_selection import train_test_split
from span_marker import SpanMarkerModel, SpanMarkerTrainer
from span_marker import SpanMarkerTrainingArguments

# Directory where checkpoints and the final model are written.
OUTPUT_DIR = "modelo_final"

# Label id 0 is reserved for tokens that belong to no entity.
OUTSIDE_LABEL = "O"

# Words and single punctuation marks become separate tokens so that a span
# ending right before a comma or period does not swallow the punctuation.
_TOKEN_RE = re.compile(r"\w+|[^\w\s]")


def _read_jsonl(jsonl_file):
    """Return the parsed JSON records contained in *jsonl_file*.

    Accepts what ``gr.File`` may hand to the callback (a file path, a
    temp-file object exposing ``.name``, or raw bytes) as well as a string
    that already holds the JSONL content. Blank lines are ignored.
    """
    if jsonl_file is None:
        raise ValueError("No JSONL file was provided.")

    if isinstance(jsonl_file, bytes):
        content = jsonl_file.decode("utf-8")
    else:
        # Temp-file wrappers carry the path in `.name`; plain paths are used as is.
        candidate = getattr(jsonl_file, "name", jsonl_file)
        if isinstance(candidate, (str, os.PathLike)) and os.path.isfile(candidate):
            with open(candidate, encoding="utf-8") as handle:
                content = handle.read()
        elif isinstance(jsonl_file, str):
            # Not an existing path: treat the string as the JSONL text itself.
            content = jsonl_file
        else:
            raise ValueError(f"Cannot read JSONL data from {jsonl_file!r}.")

    return [json.loads(line) for line in content.splitlines() if line.strip()]


def _parse_task(item):
    """Extract the text and character-level entities from one exported task.

    Returns ``None`` when the task has no annotations, so that unannotated
    tasks are not mistaken for examples without entities.
    """
    annotations = item.get("annotations") or []
    if not annotations:
        return None

    entities = []
    for result in annotations[0].get("result", []):
        value = result.get("value", {})
        labels = value.get("labels")
        # Skip results that are not labelled text spans.
        if not labels or "start" not in value or "end" not in value:
            continue
        entities.append(
            {"start": value["start"], "end": value["end"], "label": labels[0]}
        )
    return {"text": item["data"]["texto"], "entities": entities}


def _to_token_example(example, label2id):
    """Convert character-offset entities into ``tokens`` / ``ner_tags`` lists.

    A token receives the label of the first entity whose character range
    overlaps it; every other token is tagged as outside.
    """
    tokens = []
    ner_tags = []
    for match in _TOKEN_RE.finditer(example["text"]):
        tag = label2id[OUTSIDE_LABEL]
        for entity in example["entities"]:
            if match.start() < entity["end"] and match.end() > entity["start"]:
                tag = label2id[entity["label"]]
                break
        tokens.append(match.group())
        ner_tags.append(tag)
    return {"tokens": tokens, "ner_tags": ner_tags}


def entrenar(jsonl_file):
    """Train a SpanMarker NER model on a Label Studio JSONL export.

    Args:
        jsonl_file: The uploaded file as delivered by ``gr.File`` (path,
            temp-file object or bytes), or a string with the JSONL content.
            Each line must be a task with ``data.texto`` and ``annotations``.

    Returns:
        A confirmation message once training has finished and the model has
        been saved to ``OUTPUT_DIR``.

    Raises:
        ValueError: If no file is given, it cannot be read, or fewer than two
            annotated, non-empty examples are found.
    """
    parsed = [_parse_task(item) for item in _read_jsonl(jsonl_file)]
    dataset = [example for example in parsed if example is not None]

    # Collect the label set; "O" always takes index 0 and is never duplicated.
    found = {e["label"] for d in dataset for e in d["entities"]}
    labels = [OUTSIDE_LABEL] + sorted(found - {OUTSIDE_LABEL})
    label2id = {label: index for index, label in enumerate(labels)}

    # SpanMarkerTrainer expects word-level "tokens" and integer "ner_tags".
    examples = [_to_token_example(d, label2id) for d in dataset]
    examples = [ex for ex in examples if ex["tokens"]]
    if len(examples) < 2:
        raise ValueError(
            "At least two annotated examples are required to build "
            "the train and evaluation splits."
        )

    # Hugging Face datasets
    train, test = train_test_split(examples, test_size=0.2, random_state=42)
    train_ds = Dataset.from_list(train)
    test_ds = Dataset.from_list(test)

    # Model
    model = SpanMarkerModel.from_pretrained(
        "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es",
        labels=labels,
    )

    # Training arguments
    args = SpanMarkerTrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        learning_rate=5e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        save_strategy="epoch",
        evaluation_strategy="epoch",
        logging_steps=10,
    )

    trainer = SpanMarkerTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
    )

    trainer.train()
    # Per-epoch checkpoints land in sub-directories; save the final model
    # at the top level of OUTPUT_DIR so the message below is accurate.
    trainer.save_model(OUTPUT_DIR)

    return "¡Entrenamiento completado! Modelo guardado en /modelo_final"


ui = gr.Interface(
    fn=entrenar,
    inputs=gr.File(label="Sube tu JSONL exportado de Label Studio"),
    outputs="text",
    title="Entrenamiento NER Médico con SpanMarker",
)

ui.launch()