Jendersen committed on
Commit 458f7bb · verified · 1 Parent(s): bdd9cbf

Update app.py

Files changed (1):
  1. app.py +53 -149
app.py CHANGED
@@ -1,158 +1,62 @@
- # train.py
- #!/usr/bin/env python
- import os
- import json
- import string
- import pandas as pd
- import evaluate
- import numpy as np
- from datasets import load_dataset, DatasetDict
- from transformers import (
-     AutoTokenizer, AutoModelForSeq2SeqLM,
-     Seq2SeqTrainingArguments, Seq2SeqTrainer,
-     DataCollatorForSeq2Seq
- )
- from huggingface_hub import login
-
- # -------------------------------------------------
- # 0. HF login (set HF_TOKEN in Secrets)
- # -------------------------------------------------

- if "HF_TOKEN" in os.environ:
-     login(token=os.environ["HF_TOKEN"])
-     print("Logged in to Hugging Face via HF_TOKEN")
- else:
-     raise ValueError("HF_TOKEN not found! Add it in Space Secrets."
-     )
-
- # -------------------------------------------------
- # 1. Load dataset from Hub
- # -------------------------------------------------
- dataset = load_dataset("Jendersen/bible_welsh_cornish_breton_english")

- # Actually we load the JSON file that was uploaded:
- raw = load_dataset(
      "Jendersen/bible_welsh_cornish_breton_english",
-     data_files="parallel_corpus.json"
  )["train"]

- # Debug: See what columns we actually have
- print("Columns in dataset:", raw.column_names)
- print("First example:", raw[0])
-
- # Convert directly to DataFrame — no json.loads needed!
- df = pd.DataFrame(raw)

- print(f"Loaded {len(df)} verses with columns: {list(df.columns)}")
-
- # -------------------------------------------------
- # 2. Build English → {br, abk, cy}
- # -------------------------------------------------
  def is_valid(t):
      return bool(t and t.strip() and t.strip() not in string.punctuation)

- br = df[df.apply(lambda r: is_valid(r["niv_text"]) and is_valid(r["koad21_text"]), axis=1)][["niv_text","koad21_text"]].rename(columns={"niv_text":"en","koad21_text":"target"})
- br["language"] = "br"
-
- abk = df[df.apply(lambda r: is_valid(r["niv_text"]) and is_valid(r["abk_text"]), axis=1)][["niv_text","abk_text"]].rename(columns={"niv_text":"en","abk_text":"target"})
- abk["language"] = "abk"
-
- cy = df[df.apply(lambda r: is_valid(r["niv_text"]) and is_valid(r["bcnda_text"]), axis=1)][["niv_text","bcnda_text"]].rename(columns={"niv_text":"en","bcnda_text":"target"})
- cy["language"] = "cy"
-
- combined = pd.concat([br, abk, cy], ignore_index=True)
- print(f"Total examples: {len(combined)} (br:{len(br)}, abk:{len(abk)}, cy:{len(cy)})")
-
- # -------------------------------------------------
- # 3. Train / test split
- # -------------------------------------------------
- from datasets import Dataset
- ds = Dataset.from_pandas(combined).train_test_split(test_size=0.2, seed=42)
- raw_datasets = DatasetDict({"train": ds["train"], "test": ds["test"]})
-
- # -------------------------------------------------
- # 4. Tokenizer & Model
- # -------------------------------------------------
- model_name = "t5-small"
- tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
-
- # -------------------------------------------------
- # 5. Pre-process
- # -------------------------------------------------
- MAX_LEN = 96
-
- def preprocess(examples):
-     inputs = [f"translate English to {lang}: {en}"
-               for lang, en in zip(examples["language"], examples["en"])]
-     targets = examples["target"]
-     model_inputs = tokenizer(inputs, max_length=MAX_LEN, truncation=True, padding="max_length")
-     labels = tokenizer(targets, max_length=MAX_LEN, truncation=True, padding="max_length").input_ids
-     model_inputs["labels"] = labels
-     return model_inputs
-
- tokenized = raw_datasets.map(preprocess, batched=True, remove_columns=raw_datasets["train"].column_names)
-
- # -------------------------------------------------
- # 6. Data collator & metric
- # -------------------------------------------------
- data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
-
- metric = evaluate.load("sacrebleu")
-
- def compute_metrics(eval_preds):
-     preds, labels = eval_preds
-     if isinstance(preds, tuple):
-         preds = preds[0]
-     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
-     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
-     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
-     decoded_preds = [p.strip() for p in decoded_preds]
-     decoded_labels = [[l.strip()] for l in decoded_labels]
-     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
-     return {"bleu": result["score"]}
-
- # -------------------------------------------------
- # 7. Training args
- # -------------------------------------------------
- training_args = Seq2SeqTrainingArguments(
-     output_dir="mt5-celtic-finetuned",
-     eval_strategy="epoch",
-     save_strategy="epoch",
-     learning_rate=3e-4,
-     per_device_train_batch_size=16,
-     per_device_eval_batch_size=16,
-     weight_decay=0.01,
-     num_train_epochs=3,
-     predict_with_generate=True,
-     fp16=False,  # GPU
-     #bf16=True,  # TPU (auto-enabled if on TPU)
-     logging_steps=100,
-     report_to="wandb",  # optional
-     push_to_hub=True,
-     hub_model_id="Jendersen/mt5-celtic-en-br-kw-cy",
-     hub_strategy="end",
-     load_best_model_at_end=True,
-     metric_for_best_model="bleu",
- )
-
- # -------------------------------------------------
- # 8. Trainer
- # -------------------------------------------------
- trainer = Seq2SeqTrainer(
-     model=model,
-     args=training_args,
-     train_dataset=tokenized["train"],
-     eval_dataset=tokenized["test"],
-     tokenizer=tokenizer,
-     data_collator=data_collator,
-     compute_metrics=compute_metrics,
- )
-
- trainer.train()
-
- # -------------------------------------------------
- # 9. Final push
- # -------------------------------------------------
- trainer.push_to_hub("doubleyooz/mt5-celtic-en-br-kw-cy")
- print("Model pushed to Hub!")
 
+ # 1. Install
+ !pip install -q pandas datasets huggingface_hub
+
+ # 2. Load your CSV directly from the dataset repo
+ from datasets import load_dataset
+ import pandas as pd
+
+ # Your CSV is in the dataset repo
+ ds = load_dataset(
      "Jendersen/bible_welsh_cornish_breton_english",
+     data_files="parallel_corpus.csv"
  )["train"]

+ df = ds.to_pandas()
+ print(f"Loaded {len(df)} rows, columns: {list(df.columns)}")
+
+ # 3. Helper: keep only non-empty, non-punctuation
+ import string
+
  def is_valid(t):
      return bool(t and t.strip() and t.strip() not in string.punctuation)

+ # 4. Build pairs (same logic as your script)
+ pairs = []
+
+ # Breton (br)
+ br = df[df.apply(lambda r: is_valid(r["niv_text"]) and is_valid(r["koad21_text"]), axis=1)]
+ for _, r in br.iterrows():
+     pairs.append({
+         "text": f"translate English to br: {r['niv_text']}",
+         "target": r["koad21_text"]
+     })
+
+ # Cornish (kw) – column is "abk_text"
+ kw = df[df.apply(lambda r: is_valid(r["niv_text"]) and is_valid(r["abk_text"]), axis=1)]
+ for _, r in kw.iterrows():
+     pairs.append({
+         "text": f"translate English to kw: {r['niv_text']}",
+         "target": r["abk_text"]
+     })
+
+ # Welsh (cy)
+ cy = df[df.apply(lambda r: is_valid(r["niv_text"]) and is_valid(r["bcnda_text"]), axis=1)]
+ for _, r in cy.iterrows():
+     pairs.append({
+         "text": f"translate English to cy: {r['niv_text']}",
+         "target": r["bcnda_text"]
+     })
+
+ final = pd.DataFrame(pairs)
+ print(f"Total valid pairs: {len(final)}")
+
+ # 5. Train / validation split
+ final = final.sample(frac=1, random_state=42).reset_index(drop=True)
+ split = int(0.8 * len(final))
+ train_df = final.iloc[:split]
+ valid_df = final.iloc[split:]
+
+ # 6. Save
+ train_df.to_csv("train.csv", index=False)
+ valid_df.to_csv("valid.csv", index=False)
+
+ print(f"Saved train.csv ({len(train_df)} rows) and valid.csv ({len(valid_df)} rows)")