TiberiuCristianLeon committed
Commit 6ee7af6 · verified · Parent(s): a68ab45

Update app.py

Files changed (1)
  1. app.py +329 -6
app.py CHANGED
@@ -6,7 +6,336 @@ import httpx
 
 logging.set_verbosity_error()
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+# Language options and mappings
+options = ["German", "Romanian", "English", "French", "Spanish", "Italian"]
+langs = {"English": "en", "Romanian": "ro", "German": "de", "French": "fr", "Spanish": "es", "Italian": "it"}
+models = ["Helsinki-NLP", "Argos", "t5-base", "t5-small", "t5-large", "Unbabel/Tower-Plus-2B",
+          "Unbabel/TowerInstruct-Mistral-7B-v0.2", "winninghealth/WiNGPT-Babel-2", "Google"]
+
+class Translators:
+    def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
+        self.model_name = model_name
+        self.sl, self.tl = sl, tl
+        self.input_text = input_text
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    def google(self):
+        # GCLIENT holds the endpoint and fixed query prefix; query values are not URL-encoded here
+        url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
+        response = requests.get(url)
+        return response.json()[0][0][0]
+
+    @classmethod
+    def download_argos_model(cls, from_code, to_code):
+        import argostranslate.package
+        print('Downloading model', from_code, to_code)
+        # Download and install Argos Translate package
+        argostranslate.package.update_package_index()
+        available_packages = argostranslate.package.get_available_packages()
+        package_to_install = next(
+            filter(lambda x: x.from_code == from_code and x.to_code == to_code, available_packages)
+        )
+        argostranslate.package.install_from_path(package_to_install.download())
+
+    def argos(self):
+        import argostranslate.translate, argostranslate.package
+        try:
+            Translators.download_argos_model(self.sl, self.tl)  # Download model
+            translated_text = argostranslate.translate.translate(self.input_text, self.sl, self.tl)  # Translate
+        except StopIteration:
+            packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})" for pkg in argostranslate.package.get_available_packages())
+            translated_text = f"No Argos model for {self.sl} to {self.tl}. Try another model or language combination from the available Argos models: {packages_info}."
+        except Exception as error:
+            translated_text = str(error)
+        return translated_text
+
+    def HelsinkiNLP_mulroa(self):
+        try:
+            pipe = pipeline("translation", model=self.model_name, device=self.device)
+            iso1to3 = {iso[1]: iso[3] for iso in non_empty_isos}  # {'ro': 'ron'}
+            iso3tl = iso1to3.get(self.tl)  # 'deu', 'ron', 'eng', 'fra'
+            translation = pipe(f'>>{iso3tl}<< {self.input_text}')
+            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
+        except Exception as error:
+            return f"Error translating with model {self.model_name}! Try another available language combination.", error
+
+    def HelsinkiNLP(self):
+        try:  # Standard bilingual model
+            model_name = f"Helsinki-NLP/opus-mt-{self.sl}-{self.tl}"
+            pipe = pipeline("translation", model=model_name, device=self.device)
+            translation = pipe(self.input_text)
+            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
+        except EnvironmentError:
+            try:  # Tatoeba models
+                model_name = f"Helsinki-NLP/opus-tatoeba-{self.sl}-{self.tl}"
+                pipe = pipeline("translation", model=model_name, device=self.device)
+                translation = pipe(self.input_text)
+                return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
+            except EnvironmentError:
+                self.model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"  # Last resort: try multi-to-multi
+                return self.HelsinkiNLP_mulroa()
+        except KeyError as error:
+            return f"Error: Translation direction {self.sl} to {self.tl} is not supported by the Helsinki-NLP translation models.", error
+
+    def LLaMAX(self):
+        pipe = pipeline("text-generation", model="LLaMAX/LLaMAX3-8B")
+        messages = [
+            {"role": "user", "content": f"Translate the following text from {self.sl} to {self.tl}: {self.input_text}"},
+        ]
+        return pipe(messages)[0]["generated_text"]
+
+    def LegoMT(self):
+        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)  # "Lego-MT/Lego-MT"
+        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
+        tokenizer.src_lang = self.sl
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def madlad(self):
+        model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
+        tokenizer = T5Tokenizer.from_pretrained(self.model_name)
+        text = f"<2{self.tl}> {self.input_text}"
+        # Use a pipeline as a high-level helper
+        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
+        translated_text = translator(text, max_length=512)
+        return translated_text[0]['translation_text']
+
+    def smollm(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        prompt = f"""Translate the following {self.sl} text to {self.tl}, generating only the translated text and maintaining the original meaning and tone:
+{self.input_text}
+Translation:"""
+        inputs = tokenizer(prompt, return_tensors="pt")
+        outputs = model.generate(
+            inputs.input_ids,
+            max_length=len(inputs.input_ids[0]) + 150,
+            temperature=0.3,
+            do_sample=True
+        )
+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        print(response)
+        return response.split("Translation:")[-1].strip()
+
+    def flan(self):
+        tokenizer = T5Tokenizer.from_pretrained(self.model_name, legacy=False)
+        model = T5ForConditionalGeneration.from_pretrained(self.model_name)
+        prompt = f"translate {self.sl} to {self.tl}: {self.input_text}"
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+        outputs = model.generate(input_ids)
+        return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
+
+    def tfive(self):
+        tokenizer = T5Tokenizer.from_pretrained(self.model_name)
+        model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
+        prompt = f"translate {self.sl} to {self.tl}: {self.input_text}"
+        input_ids = tokenizer.encode(prompt, return_tensors="pt")
+        output_ids = model.generate(input_ids, max_length=512)
+        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
+        return translated_text
+
+    def mbart_many_to_many(self):
+        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
+        # translate source to target
+        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(
+            **encoded,
+            forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[self.tl]]
+        )
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def mbart_one_to_many(self):
+        # translate from English
+        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name, src_lang="en_XX")
+        model_inputs = tokenizer(self.input_text, return_tensors="pt")
+        langid = languagecodes.mbart_large_languages[self.tl]
+        generated_tokens = model.generate(
+            **model_inputs,
+            forced_bos_token_id=tokenizer.lang_code_to_id[langid]
+        )
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def mbart_many_to_one(self):
+        # translate to English
+        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
+        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
+        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded)
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def mtom(self):
+        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
+        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
+        tokenizer.src_lang = self.sl
+        encoded = tokenizer(self.input_text, return_tensors="pt")
+        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
+        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+
+    def bigscience(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
+        self.input_text = self.input_text if self.input_text.endswith('.') else f'{self.input_text}.'
+        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}", return_tensors="pt")
+        outputs = model.generate(inputs)
+        translation = tokenizer.decode(outputs[0])
+        translation = translation.replace('<pad> ', '').replace('</s>', '')
+        return translation
+
+    def bloomz(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        self.input_text = self.input_text if self.input_text.endswith('.') else f'{self.input_text}.'
+        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}", return_tensors="pt")
+        outputs = model.generate(inputs)
+        translation = tokenizer.decode(outputs[0])
+        translation = translation.replace('<pad> ', '').replace('</s>', '')
+        translation = translation.split('Translation:')[-1].strip() if 'Translation:' in translation else translation.strip()
+        return translation
+
+    def nllb(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, src_lang=self.sl)
+        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
+        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
+        translated_text = translator(self.input_text, max_length=512)
+        return translated_text[0]['translation_text']
+
+    def wingpt(self):
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            torch_dtype="auto",
+            device_map="auto"
+        )
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        messages = [
+            {"role": "system", "content": f"Translate this to {self.tl} language"},
+            {"role": "user", "content": self.input_text}
+        ]
+        text = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+        generated_ids = model.generate(
+            **model_inputs,
+            max_new_tokens=512,
+            temperature=0.1
+        )
+        # Strip the prompt tokens, keeping only the newly generated ones
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+        ]
+        print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
+        output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        result = output.split('\n')[-1].strip() if '\n' in output else output.strip()
+        return result
+
+    def eurollm(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        prompt = f"{self.sl}: {self.input_text} {self.tl}:"
+        inputs = tokenizer(prompt, return_tensors="pt")
+        outputs = model.generate(**inputs, max_new_tokens=512)
+        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        print(output)
+        # Strip the prompt echo, keeping the text after the last target-language tag
+        result = output.rsplit(f'{self.tl}:')[-1].strip() if '\n' in output or f'{self.tl}:' in output else output.strip()
+        return result
+
+    def eurollm_instruct(self):
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+        model = AutoModelForCausalLM.from_pretrained(self.model_name)
+        text = f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nTranslate the following {self.sl} source text to {self.tl}:\n{self.sl}: {self.input_text} \n{self.tl}: <|im_end|>\n<|im_start|>assistant\n'
+        inputs = tokenizer(text, return_tensors="pt")
+        outputs = model.generate(**inputs, max_new_tokens=512)
+        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        if f'{self.tl}:' in output:
+            output = output.rsplit(f'{self.tl}:')[-1].strip().replace('assistant\n', '').strip()
+        return output
+
+    def teuken(self):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            trust_remote_code=True,
+            torch_dtype=torch.bfloat16,
+        )
+        model = model.to(device).eval()
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.model_name,
+            use_fast=False,
+            trust_remote_code=True,
+        )
+        translation_prompt = f"Translate the following text from {self.sl} into {self.tl}: {self.input_text}"
+        messages = [{"role": "User", "content": translation_prompt}]
+        prompt_ids = tokenizer.apply_chat_template(messages, chat_template="EN", tokenize=True, add_generation_prompt=False, return_tensors="pt")
+        prediction = model.generate(
+            prompt_ids.to(model.device),
+            max_length=512,
+            do_sample=True,
+            top_k=50,
+            top_p=0.95,
+            temperature=0.7,
+            num_return_sequences=1,
+        )
+        translation = tokenizer.decode(prediction[0].tolist())
+        return translation
+
+    def unbabel(self):
+        pipe = pipeline("text-generation", model=self.model_name, torch_dtype=torch.bfloat16, device_map="auto")
+        messages = [{"role": "user",
+                     "content": f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text}.\n{self.tl}:"}]
+        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+        tokenized_input = pipe.tokenizer(self.input_text, return_tensors="pt")
+        num_input_tokens = len(tokenized_input["input_ids"][0])
+        max_new_tokens = round(num_input_tokens + 0.25 * num_input_tokens)
+        outputs = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
+        translated_text = outputs[0]["generated_text"]
+        print(f"Input chars: {len(self.input_text)}", f"Input tokens: {num_input_tokens}", f"max_new_tokens: {max_new_tokens}",
+              "Chars to tokens ratio:", round(len(self.input_text) / num_input_tokens, 2), f"Raw translation: {translated_text}")
+        markers = ["<end_of_turn>", "<|im_end|>", "<|im_start|>assistant"]
+        for marker in markers:
+            if marker in translated_text:
+                translated_text = translated_text.split(marker)[1].strip()
+        translated_text = translated_text.replace('Answer:', '', 1).strip() if translated_text.startswith('Answer:') else translated_text
+        translated_text = translated_text.split("Translated text:")[0].strip() if "Translated text:" in translated_text else translated_text
+        split_translated_text = translated_text.split('\n')
+        translated_text = '\n'.join(split_translated_text[:self.input_text.count('\n') + 1])
+        return translated_text
+
+def bergamot(model_name: str = 'deen', sl: str = 'de', tl: str = 'en', input_text: str = 'Hallo, mein Freund'):
+    try:
+        import bergamot
+        config = bergamot.ServiceConfig(numWorkers=4)
+        service = bergamot.Service(config)
+        model = service.modelFromConfigPath(f"./{model_name}/bergamot.config.yml")
+        options = bergamot.ResponseOptions(alignment=False, qualityScores=False, HTML=False)
+        rawresponse = service.translate(model, bergamot.VectorString(input_text), options)
+        translated_text: str = next(iter(rawresponse)).target.text
+        message_text = f"Translated from {sl} to {tl} with Bergamot {model_name}."
+    except Exception as error:
+        # Return the same (text, message) shape on failure instead of raising NameError
+        translated_text, message_text = str(error), f"Bergamot {model_name} failed to translate from {sl} to {tl}."
+    return translated_text, message_text
 
 def download_argos_model(from_code, to_code):
     import argostranslate.package
     print('Downloading model', from_code, to_code)
@@ -58,12 +387,6 @@ def wingpt(model_name, sl, tl, input_text):
 st.header("Text Machine Translation")
 input_text = st.text_input("Enter text to translate:")
 
-# Language options and mappings
-options = ["German", "Romanian", "English", "French", "Spanish", "Italian"]
-langs = {"English": "en", "Romanian": "ro", "German": "de", "French": "fr", "Spanish": "es", "Italian": "it"}
-models = ["Helsinki-NLP", "Argos", "t5-base", "t5-small", "t5-large", "Unbabel/Tower-Plus-2B",
-          "Unbabel/TowerInstruct-Mistral-7B-v0.2", "winninghealth/WiNGPT-Babel-2", "Google"]
-
 # Initialize session state if not already set
 if "sselected_language" not in st.session_state:
     st.session_state["sselected_language"] = options[0]
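
A minimal usage sketch of the Translators class introduced by this commit (hypothetical driver code, not part of the diff; the sample values and dispatch table are illustrative assumptions):

    # Route the backend name selected in the Streamlit UI to the matching method.
    translator = Translators("Helsinki-NLP", sl="de", tl="en", input_text="Hallo Welt")
    dispatch = {
        "Helsinki-NLP": translator.HelsinkiNLP,  # returns (translation, message)
        "Argos": translator.argos,               # returns translation or error text
        "Google": translator.google,             # returns the translated string
    }
    result = dispatch["Helsinki-NLP"]()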