|
|
|
|
|
from datasets import load_dataset |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
# Load the "parallel_corpus.csv" data file of the dataset repository and keep
# its "train" split.
_corpus_splits = load_dataset(
    "Jendersen/bible_welsh_cornish_breton_english",
    data_files="parallel_corpus.csv",
)
ds = _corpus_splits["train"]

# Convert to a pandas DataFrame for the filtering done below, and report what
# was loaded.
df = ds.to_pandas()
print(f"Loaded {len(df)} rows, columns: {list(df.columns)}")
|
|
|
|
|
|
|
|
import string |
|
|
def is_valid(t):
    """Return True when *t* is a string containing real textual content.

    A value is rejected (False) when it is:

    * not a ``str`` at all -- e.g. ``None`` or a float ``NaN`` standing in
      for a missing cell;
    * empty or made only of whitespace;
    * made only of ASCII punctuation (``string.punctuation``), optionally
      mixed with whitespace, such as ``"."``, ``"..."`` or ``" -- "``.

    Note that only ASCII punctuation is recognised; non-ASCII marks such as
    an ellipsis character count as content.
    """
    # Non-string cells cannot be stripped or scanned; treat them as missing
    # instead of raising AttributeError on ``.strip()``.
    if not isinstance(t, str):
        return False

    # Test each character individually.  A substring test against
    # ``string.punctuation`` would only reject runs that happen to appear
    # contiguously in that constant, letting "..." or "--" through.
    punctuation = string.punctuation
    return any(ch not in punctuation and not ch.isspace() for ch in t)
|
|
|
|
|
|
|
|
def _valid_rows(frame, target_column, source_column="niv_text"):
    """Return the rows of *frame* whose source and target cells both pass ``is_valid``."""
    # Walk the two columns directly instead of a row-wise ``apply``; the
    # ``and`` keeps the original short-circuit (the target cell is only
    # checked when the source cell is valid).
    keep = [
        is_valid(source) and is_valid(target)
        for source, target in zip(frame[source_column], frame[target_column])
    ]
    # ``.loc`` with a boolean list selects rows and keeps every column, even
    # when the list is empty.
    return frame.loc[keep]


def _to_pairs(rows, lang_code, target_column, source_column="niv_text"):
    """Build one prompt/target record per row of *rows* for *lang_code*."""
    return [
        {
            "text": f"translate English to {lang_code}: {source}",
            "target": target,
        }
        for source, target in zip(rows[source_column], rows[target_column])
    ]


# Accumulates one {"text": prompt, "target": translation} record per valid
# English/target pair, grouped by target language in the order br, kw, cy.
# NOTE(review): the codes are presumably the ISO 639-1 codes for Breton,
# Cornish and Welsh -- confirm against the dataset's column documentation.
pairs = []

br = _valid_rows(df, "koad21_text")
pairs.extend(_to_pairs(br, "br", "koad21_text"))

kw = _valid_rows(df, "abk_text")
pairs.extend(_to_pairs(kw, "kw", "abk_text"))

cy = _valid_rows(df, "bcnda_text")
pairs.extend(_to_pairs(cy, "cy", "bcnda_text"))
|
|
|
|
|
# Gather every prompt/target record into a single table.  The columns are
# named explicitly (same keys and order as the records) so the frame still
# has its "text" and "target" columns when no valid pair was collected.
final = pd.DataFrame(pairs, columns=["text", "target"])
print(f"Total valid pairs: {len(final)}")
|
|
|
|
|
|
|
|
# Shuffle all rows with a fixed seed so the split is reproducible, then
# renumber the index from zero.
final = final.sample(frac=1, random_state=42)
final = final.reset_index(drop=True)

# The first 80% of the shuffled rows form the training set; the remainder is
# held out for validation.
split = int(0.8 * len(final))
train_df, valid_df = final.iloc[:split], final.iloc[split:]
|
|
|
|
|
|
|
|
# Write both partitions to CSV in the working directory, without the index
# column, then report the row counts.
for _csv_path, _partition in (("train.csv", train_df), ("valid.csv", valid_df)):
    _partition.to_csv(_csv_path, index=False)

print(f"Saved train.csv ({len(train_df)} rows) and valid.csv ({len(valid_df)} rows)")