bite-the-byte
/

byt5-small-deASCIIfy-TR

Token Classification

Model card Files Files and versions

emircanerol committed on May 9, 2024

Commit

67e22a8

·

verified ·

1 Parent(s): 1fa24d0

Update README.md

Files changed (1) hide show

README.md +89 -3

README.md CHANGED Viewed

@@ -1,3 +1,89 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+datasets:
+- cc100
+language:
+- tr
+library_name: peft
+pipeline_tag: token-classification
+---
+```python
import torch
from peft import PeftModel, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig, T5ForTokenClassification

# Base checkpoint that the published PEFT weights are applied on top of.
model_id = "google/byt5-small"

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Quantized base model with a two-label head (one label per input token).
model = T5ForTokenClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    num_labels=2,
)

# Order matters: the base model is prepared for k-bit use first, and only
# then wrapped with the weights stored in the Hub repository below.
model = PeftModel.from_pretrained(
    prepare_model_for_kbit_training(model),
    "bite-the-byte/byt5-small-deASCIIfy-TR",
)
+def test_mask(data):
+    """
+    Masks the padded tokens in the input.
+    Args:
+        data (list): List of strings.
+    Returns:
+        dataset (list): List of dictionaries.
+    """
+    dataset = list()
+    for sample in data:
+        new_sample = dict()
+        input_tokens = [i + 3 for i in sample.encode('utf-8')]
+        input_tokens.append(0) # eos token
+        new_sample['input_ids'] = torch.tensor([input_tokens], dtype=torch.int64)
+        # Create attention mask
+        attention_mask = [1] * len(input_tokens)  # Attend to all tokens
+        new_sample['attention_mask'] = torch.tensor([attention_mask], dtype=torch.int64)
+        dataset.append(new_sample)
+    return dataset
# ASCII byte value -> UTF-8 byte values of the matching Turkish letter,
# e.g. ord('c') == 99 -> [195, 167], the two-byte encoding of 'ç'.
# Built once at import time instead of on every call.
_EN2TR = {
    ord(ascii_char): list(turkish_char.encode())
    for ascii_char, turkish_char in zip('UIGSCOuigsco', 'ÜİĞŞÇÖüığşçö')
}


def rewrite(model, data):
    """
    Rewrites the input text with the model.

    The model is run without gradient tracking and the highest-scoring label
    is taken for every token. A token whose label is non-zero and whose byte
    has a Turkish counterpart is replaced by that counterpart; every other
    byte is copied through unchanged. Only the first sequence of the batch
    is decoded.

    Args:
        model (torch.nn.Module): Model. Must expose `.device` and return an
            object with a `.logits` tensor whose last axis holds the labels.
        data (dict): Dictionary containing 'input_ids' and 'attention_mask'.
    Returns:
        output (str): Rewritten text.
    """
    with torch.no_grad():
        data = {k: v.to(model.device) for k, v in data.items()}
        pred = torch.argmax(model(**data).logits, dim=2)

    # Undo the +3 offset so that ordinary tokens become raw byte values;
    # the special ids 0-2 (eos included) turn negative.
    byte_values = (data['input_ids'] - 3)[0].tolist()
    labels = pred[0].tolist()

    output = []  # byte values of the rewritten text
    for value, label in zip(byte_values, labels):
        if label and value in _EN2TR:
            # The model predicts a diacritic: emit the Turkish character.
            output.extend(_EN2TR[value])
        elif 0 <= value <= 255:
            # Copy real bytes only. Special ids (negative) and ids beyond
            # the byte range would otherwise make bytes() raise ValueError.
            output.append(value)
    return bytes(output).decode()
def try_it(text, model):
    """
    Run one string through the full encode -> predict -> rewrite pipeline.

    Args:
        text (str): Text to rewrite.
        model (torch.nn.Module): Model handed on to `rewrite`.
    Returns:
        output (str): Rewritten text.
    """
    # A one-element list goes in, so exactly one encoded sample comes back.
    (sample,) = test_mask([text])
    return rewrite(model, sample)


try_it('Cekoslovakyalilastiramadiklarimizdan misiniz?', model)
+```