seier-brightside committed on
Commit
d26ab54
·
verified ·
1 Parent(s): 7167898

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -0
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ from span_marker import SpanMarkerModel, SpanMarkerTrainer, SpanMarkerTrainingArguments
4
+ from datasets import Dataset
5
+ from sklearn.model_selection import train_test_split
6
+
7
def _leer_jsonl(jsonl_file):
    """Return the JSONL text behind whatever the upload widget handed over.

    Accepts a filesystem path, an object exposing the path as ``.name``
    (tempfile-style wrapper), or the raw JSONL content as ``str``/``bytes``.

    Raises:
        ValueError: if nothing usable was provided.
    """
    import os

    # NOTE(review): depending on the Gradio version, gr.File passes either a
    # path string or a tempfile-like object with ``.name`` -- handle both.
    origen = getattr(jsonl_file, "name", jsonl_file)
    if isinstance(origen, (str, os.PathLike)) and os.path.isfile(origen):
        with open(origen, encoding="utf-8") as fh:
            return fh.read()
    if isinstance(jsonl_file, bytes):
        return jsonl_file.decode("utf-8")
    if isinstance(jsonl_file, str):
        # Backward compatible: the caller passed the JSONL content itself.
        return jsonl_file
    raise ValueError("No se recibió ningún archivo JSONL válido.")


def _parsear_tareas(contenido):
    """Convert Label Studio JSONL text into ``{"text", "entities"}`` records.

    Blank lines are ignored, tasks without any annotation are skipped, and
    result items that carry no ``labels`` (e.g. relations) are dropped.
    Malformed JSON raises ``json.JSONDecodeError`` (a ``ValueError``).
    """
    ejemplos = []
    for linea in contenido.splitlines():
        if not linea.strip():
            continue  # exports commonly end with an empty line
        item = json.loads(linea)
        anotaciones = item.get("annotations") or []
        if not anotaciones:
            continue  # an unannotated task is not evidence of "no entities"

        entidades = []
        for ent in anotaciones[0].get("result") or []:
            valor = ent.get("value") or {}
            etiquetas = valor.get("labels")
            if not etiquetas:
                continue
            entidades.append({
                "start": valor["start"],
                "end": valor["end"],
                "label": etiquetas[0],
            })
        ejemplos.append({"text": item["data"]["texto"], "entities": entidades})
    return ejemplos


def _a_tokens(texto, entidades, label2id):
    """Project character-level spans onto word-level ``tokens``/``ner_tags``.

    Words and punctuation marks become separate tokens; a token receives the
    id of the first entity whose character span overlaps it, otherwise 0
    ("O"). Two adjacent entities with the same label become one tagged run.
    """
    import re

    tokens, tags = [], []
    for m in re.finditer(r"\w+|[^\w\s]", texto):
        tag = 0
        for ent in entidades:
            if m.start() < ent["end"] and ent["start"] < m.end():
                tag = label2id[ent["label"]]
                break
        tokens.append(m.group())
        tags.append(tag)
    return {"tokens": tokens, "ner_tags": tags}


def entrenar(jsonl_file):
    """Fine-tune a SpanMarker NER model from a Label Studio JSONL export.

    Args:
        jsonl_file: uploaded file (path or tempfile-like object) or the raw
            JSONL content. Each line is a task with ``data.texto`` and
            ``annotations[0].result`` span labels.

    Returns:
        A status message once training has finished and the model was saved.

    Raises:
        ValueError: if the input is missing or malformed, contains fewer
            than two usable annotated examples, or has no labelled entities.
    """
    dataset = _parsear_tareas(_leer_jsonl(jsonl_file))

    # "O" always sits at index 0; guard against it also appearing as a label.
    labels = sorted({e["label"] for d in dataset for e in d["entities"]} - {"O"})
    labels.insert(0, "O")
    label2id = {etiqueta: i for i, etiqueta in enumerate(labels)}

    # SpanMarker's trainer works on word-level `tokens` / `ner_tags` columns,
    # so the character offsets from Label Studio are converted here.
    ejemplos = [_a_tokens(d["text"], d["entities"], label2id) for d in dataset]
    ejemplos = [ej for ej in ejemplos if ej["tokens"]]

    if len(ejemplos) < 2:
        raise ValueError(
            "Se necesitan al menos 2 ejemplos anotados para entrenar."
        )
    if len(labels) < 2:
        raise ValueError("No se encontraron entidades etiquetadas en el archivo.")

    train, test = train_test_split(ejemplos, test_size=0.2, random_state=42)
    train_ds = Dataset.from_list(train)
    test_ds = Dataset.from_list(test)

    model = SpanMarkerModel.from_pretrained(
        "PlanTL-GOB-ES/roberta-base-biomedical-clinical-es",
        labels=labels,
    )

    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; confirm against the pinned version.
    args = SpanMarkerTrainingArguments(
        output_dir="modelo_final",
        learning_rate=5e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=3,
        logging_steps=10,
        save_strategy="epoch",
        evaluation_strategy="epoch",
    )

    trainer = SpanMarkerTrainer(
        model=model,
        args=args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
    )

    trainer.train()
    # Epoch checkpoints land in sub-directories; write the final model to the
    # directory the status message points at.
    trainer.save_model(args.output_dir)

    return "Entrenamiento completado. El modelo está en /modelo_final"
64
+
65
# Upload widget for the Label Studio export; built separately so the
# Interface call below stays compact.
_entrada_jsonl = gr.File(label="Sube tu archivo JSONL exportado de Label Studio")

# Single-function web UI: one file in, one status string out.
ui = gr.Interface(
    title="Entrenamiento NER Médico con SpanMarker",
    fn=entrenar,
    inputs=_entrada_jsonl,
    outputs="text",
)

# Start the web server as soon as the module is executed.
ui.launch()