File size: 1,779 Bytes
458f7bb
 
 
5a79df9
458f7bb
 
30de3db
458f7bb
30de3db
 
458f7bb
 
30de3db
458f7bb
 
c09fd61
 
 
458f7bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# 2. Load your CSV directly from the dataset repo
from datasets import load_dataset
import pandas as pd

# Fetch the parallel-corpus CSV straight from the dataset repo on the Hub.
dataset_splits = load_dataset(
    "Jendersen/bible_welsh_cornish_breton_english",
    data_files="parallel_corpus.csv",
)
# The rows are read from the "train" split of the returned mapping.
ds = dataset_splits["train"]

# Work in pandas from here on.
df = ds.to_pandas()
print(f"Loaded {len(df)} rows, columns: {list(df.columns)}")

# 3. Helper: keep only non-empty, non-punctuation
import string
def is_valid(t):
    """Return True when *t* is a string with real (non-punctuation) content.

    A cell is rejected when it is:
      * not a ``str`` at all (``None``, a float NaN from pandas, ...),
      * empty or whitespace-only,
      * made up solely of ASCII punctuation and whitespace ("." or "..."
        or " ?! ").

    Args:
        t: The raw cell value to check.

    Returns:
        bool: True if the cell holds usable text, False otherwise.
    """
    # Missing cells may arrive as None or NaN; neither has .strip().
    if not isinstance(t, str):
        return False

    stripped = t.strip()
    if not stripped:
        return False

    # Check character by character: a plain `stripped in string.punctuation`
    # is a substring test and lets runs such as "..." or "--" through.
    return not all(
        ch in string.punctuation or ch.isspace() for ch in stripped
    )

# 4. Build pairs: one (prompt, target) example per row and target language.
# Each entry is (language code used in the prompt, column holding that
# language's text). Order matters: pairs are emitted language by language.
TARGET_COLUMNS = [
    ("br", "koad21_text"),  # Breton
    ("kw", "abk_text"),     # Cornish
    ("cy", "bcnda_text"),   # Welsh
]

pairs = []
for lang_code, column in TARGET_COLUMNS:
    # Walk the English column and the target column in lockstep; this avoids
    # a row-wise DataFrame.apply plus a second iterrows() pass per language.
    for source_text, target_text in zip(df["niv_text"], df[column]):
        # Keep a row only when both sides carry usable text.
        if is_valid(source_text) and is_valid(target_text):
            pairs.append({
                "text": f"translate English to {lang_code}: {source_text}",
                "target": target_text,
            })

# Name the columns explicitly so the frame has the expected schema even when
# no pair survived validation.
final = pd.DataFrame(pairs, columns=["text", "target"])
print(f"Total valid pairs: {len(final)}")

# 5. Train / validation split
# Shuffle every row with a fixed seed, then cut at the 80% mark:
# the first part becomes the training set, the remainder the validation set.
final = final.sample(frac=1, random_state=42).reset_index(drop=True)
split = int(0.8 * len(final))
train_df, valid_df = final.iloc[:split], final.iloc[split:]

# 6. Save
# Write each subset to its own CSV without the pandas index column.
for subset, csv_path in ((train_df, "train.csv"), (valid_df, "valid.csv")):
    subset.to_csv(csv_path, index=False)

print(f"Saved train.csv ({len(train_df)} rows) and valid.csv ({len(valid_df)} rows)")