asasasaasasa committed (verified)
Commit dee3f9b · Parent: f33f1f9

Upload main.py with huggingface_hub
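For context, an upload like this is typically done with the huggingface_hub client. A minimal sketch, not taken from this commit — the repo_id and repo_type below are placeholders/assumptions:

from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="main.py",    # local file to upload
    path_in_repo="main.py",       # destination path inside the repo
    repo_id="<user>/<repo>",      # placeholder: the target repo is not named on this page
    repo_type="space",            # assumption: the app below looks like a Streamlit Space
    commit_message="Upload main.py with huggingface_hub",
)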

Files changed (1):
  1. main.py +378 -0
main.py (new file, +378 lines):
# main.py

import os
import streamlit as st
import logging
from dotenv import load_dotenv

# Load environment variables first, before any other code
load_dotenv()

# Import configuration defaults (after loading .env to prioritize environment variables)
from config import ENV_DEFAULTS, DEFAULT_CONFIG

# Configure logging based on configuration
log_level = os.environ.get('LOGLEVEL', DEFAULT_CONFIG['LOGLEVEL']).upper()
logging.basicConfig(
    level=getattr(logging, log_level),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Log to the console unless the level is WARNING, in which case discard output
        logging.StreamHandler() if log_level != 'WARNING' else logging.NullHandler()
    ]
)

# Configure app
st.set_page_config(page_title="Translator & Readability", layout="wide")

# Check for missing environment variables and use defaults from config
for var, default in ENV_DEFAULTS.items():
    if var not in os.environ:
        logging.debug(f"Environment variable {var} not found, using default: {default}")
        os.environ[var] = default

# Model configuration from default config (no hardcoded defaults)
MODEL_CONFIG = {
    "gemma_gpu_layers": DEFAULT_CONFIG["GEMMA_GPU_LAYERS"],
    "gemma_context_size": DEFAULT_CONFIG["GEMMA_CONTEXT_SIZE"],
    "max_parallel_models": DEFAULT_CONFIG["MAX_PARALLEL_MODELS"],
    "session_timeout": DEFAULT_CONFIG["SESSION_TIMEOUT"],
    "allow_gpu": DEFAULT_CONFIG["ALLOW_GPU"]
}

# Initialize model semaphore for limiting concurrent model usage
import threading
model_semaphore = threading.Semaphore(MODEL_CONFIG["max_parallel_models"])

import tempfile
import io
from docx import Document
import uuid
import traceback

from models.nltk_resources import setup_nltk
from utils.file_readers import read_file
from utils.text_processing import detect_language
from utils.readability_indices import (
    flesch_reading_ease,
    flesch_kincaid_grade_level,
    gunning_fog_index,
    smog_index,
    highlight_complex_text
)
from utils.formatting import color_code_index
from utils.tilmash_translation import tilmash_translate, display_tilmash_streaming_translation
from utils.gemma_translation import gemma_translate, display_streaming_translation

# Initialize session state for user identification
if 'session_id' not in st.session_state:
    st.session_state.session_id = str(uuid.uuid4())

if 'translation_lock' not in st.session_state:
    st.session_state.translation_lock = False

def handle_translation():
    st.header("Перевод (Kazakh, Russian, English)")

    # Show session ID in sidebar for debugging
    with st.sidebar.expander("Session Info", expanded=False):
        st.write(f"Session ID: {st.session_state.session_id}")

    # Add GPU usage option (admin only)
    if MODEL_CONFIG["allow_gpu"]:
        st.session_state.use_gpu = st.checkbox("Use GPU (faster)", value=False)
    else:
        st.session_state.use_gpu = False
        st.write("GPU mode disabled by configuration")

    translate_input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
    input_text = ""

    if translate_input_method == "Загрузить файл":
        uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
        if uploaded_file is not None:
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getbuffer())
                temp_file_path = tmp_file.name
            input_text = read_file(temp_file_path)
            os.remove(temp_file_path)
            st.write("**Содержимое файла:**")
            st.write(input_text)
    else:
        input_text = st.text_area("Вставьте ваш текст здесь", height=200)

    if input_text:
        auto_detect = st.checkbox("Автоматически определить язык", value=True)
        src_lang = None
        if auto_detect:
            detected_lang = detect_language(input_text)
            if detected_lang in ['ru','en','kk']:
                st.info(f"Определён язык: {detected_lang}")
                src_lang = detected_lang
            else:
                st.warning("Не удалось определить язык. Выберите вручную.")
                src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])
        else:
            src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])

        if src_lang == "ru":
            tgt_options = ["en","kk"]
        elif src_lang == "en":
            tgt_options = ["ru","kk"]
        else:
            tgt_options = ["ru","en"]

        tgt_lang = st.selectbox("Перевод на:", tgt_options)

        # Select translation model
        model_option = st.radio("Выберите модель перевода:", ["Gemma 3", "Tilmash"])

        if st.button("Перевести"):
            # Prevent multiple concurrent translations from same session
            if st.session_state.translation_lock:
                st.warning("Перевод уже выполняется. Пожалуйста, дождитесь завершения.")
                return

            # Set translation lock
            st.session_state.translation_lock = True

            try:
                # Use the model semaphore to limit concurrent model access
                acquired = model_semaphore.acquire(blocking=False)
                if not acquired:
                    st.warning("Максимальное количество параллельных моделей достигнуто. Пожалуйста, попробуйте позже.")
                    st.session_state.translation_lock = False
                    return

                try:
                    if model_option == "Tilmash":
                        st.subheader("Результат перевода:")
                        # Get the approximate size of the text to determine if chunking is needed
                        approx_text_size = len(input_text) / 4  # rough approximation (4 chars ≈ 1 token)
                        needs_chunking = approx_text_size > 500  # If text is likely over 500 tokens

                        # Display appropriate spinner message
                        spinner_message = "Processing text in chunks..." if needs_chunking else "Processing translation..."

                        try:
                            # Create a dedicated translator instance for this session
                            from utils.tilmash_translation import TilmashTranslator
                            translator = TilmashTranslator()

                            with st.spinner(spinner_message):
                                try:
                                    # Use direct streaming approach with session-specific translator
                                    result = ""
                                    translation_placeholder = st.empty()

                                    # Stream translation
                                    for chunk in translator.translate_streaming(input_text, src_lang, tgt_lang):
                                        result += chunk
                                        translation_placeholder.markdown(result)

                                except Exception as e:
                                    st.error(f"Translation error: {str(e)}")
                                    logging.error(f"Tilmash translation error: {traceback.format_exc()}")
                                    result = None

                            if result:
                                # Prepare download capability
                                doc = Document()
                                doc.add_paragraph(result)
                                doc_io = io.BytesIO()
                                doc.save(doc_io)
                                doc_io.seek(0)

                                st.download_button(
                                    label="Скачать переведённый текст (.docx)",
                                    data=doc_io,
                                    file_name="translated_text.docx",
                                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                                )
                            else:
                                st.warning("Не удалось выполнить перевод.")

                            # Unload Tilmash model after use
                            try:
                                if translator.initialized:
                                    translator.unload_model()
                            except Exception as unload_error:
                                logging.error(f"Error unloading Tilmash model: {str(unload_error)}")
                        except Exception as tilmash_error:
                            st.error(f"Tilmash model error: {str(tilmash_error)}")
                            logging.error(f"Tilmash model error: {traceback.format_exc()}")
                    else:  # Gemma 3
                        st.subheader("Результат перевода (Gemma 3):")
                        # Create a translator instance to check if text needs chunking
                        from utils.gemma_translation import GemmaTranslator

                        try:
                            # Determine if we should use GPU based on user preference and permissions
                            gpu_layers = MODEL_CONFIG["gemma_gpu_layers"] if getattr(st.session_state, 'use_gpu', False) else 0

                            # Create a new translator instance for this session
                            translator = GemmaTranslator()  # Each session gets its own model instance
                            if not translator.initialized:
                                # Use custom settings from environment
                                translator.load_model(
                                    n_gpu_layers=gpu_layers,
                                    context_size=MODEL_CONFIG["gemma_context_size"]
                                )
                            needs_chunking = translator.is_text_too_large(input_text)

                            # Display appropriate spinner message based on whether chunking is needed
                            gpu_mode = "GPU" if translator.using_gpu else "CPU"
                            spinner_message = f"Processing text in chunks ({gpu_mode} mode)..." if needs_chunking else f"Processing translation ({gpu_mode} mode)..."

                            with st.spinner(spinner_message):
                                try:
                                    # Instead of using the global display_streaming_translation function,
                                    # use a direct approach that keeps the translator instance
                                    result = ""
                                    translation_placeholder = st.empty()

                                    # Stream translation tokens and update UI
                                    for token in translator.translate_streaming(
                                        input_text, src_lang, tgt_lang,
                                        temperature=0.1, top_p=0.95
                                    ):
                                        result += token
                                        translation_placeholder.markdown(result)

                                except Exception as e:
                                    st.error(f"Translation error: {str(e)}")
                                    logging.error(f"Gemma translation error: {traceback.format_exc()}")
                                    result = None

                            if result:
                                # Prepare download capability
                                doc = Document()
                                doc.add_paragraph(result)
                                doc_io = io.BytesIO()
                                doc.save(doc_io)
                                doc_io.seek(0)

                                st.download_button(
                                    label="Скачать переведённый текст (.docx)",
                                    data=doc_io,
                                    file_name="gemma_translated_text.docx",
                                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                                )
                            else:
                                st.warning("Не удалось выполнить перевод с Gemma 3.")

                            # Always unload model after use for better multi-user experience
                            translator.unload_model()
                        except Exception as model_error:
                            st.error(f"Model error: {str(model_error)}")
                            logging.error(f"Model error details: {traceback.format_exc()}")
                finally:
                    # Release the semaphore
                    model_semaphore.release()
            except Exception as outer_error:
                st.error(f"Unexpected error: {str(outer_error)}")
                logging.error(f"Unexpected error: {traceback.format_exc()}")
            finally:
                # Release translation lock
                st.session_state.translation_lock = False

def handle_readability_analysis():
    st.header("Анализ удобочитаемости текста")
    input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
    text = ""

    if input_method == "Загрузить файл":
        uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
        if uploaded_file is not None:
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getbuffer())
                temp_file_path = tmp_file.name
            text = read_file(temp_file_path)
            os.remove(temp_file_path)
            st.write("**Содержимое файла:**")
            st.write(text)
    else:
        text = st.text_area("Вставьте ваш текст здесь", height=200)

    if text:
        auto_detect = st.checkbox("Определить язык автоматически", value=True)
        if auto_detect:
            detected_lang = detect_language(text)
            st.info(f"Определён язык: {detected_lang}")
            lang_code = detected_lang if detected_lang in ['ru','en','kk'] else 'en'
        else:
            lang_code = st.selectbox("Язык текста", ["ru", "en", "kk"])

        if st.button("Анализировать"):
            # Prevent multiple concurrent analyses
            if 'analysis_lock' in st.session_state and st.session_state.analysis_lock:
                st.warning("Анализ уже выполняется. Пожалуйста, дождитесь завершения.")
                return

            # Set analysis lock
            st.session_state.analysis_lock = True

            try:
                # Use the model semaphore for consistency with translation
                acquired = model_semaphore.acquire(blocking=False)
                if not acquired:
                    st.warning("Система загружена. Пожалуйста, попробуйте позже.")
                    st.session_state.analysis_lock = False
                    return

                try:
                    with st.spinner("Выполняется анализ..."):
                        fre = flesch_reading_ease(text, lang_code)
                        fkgl = flesch_kincaid_grade_level(text, lang_code)
                        fog = gunning_fog_index(text, lang_code)
                        smog = smog_index(text, lang_code)
                        highlighted_text, complex_words_list = highlight_complex_text(text, lang_code)

                        st.subheader("Результаты удобочитаемости")
                        st.markdown(
                            f"**Индекс удобочитаемости Флеша:** {color_code_index('Flesch Reading Ease', fre)}",
                            unsafe_allow_html=True
                        )
                        st.markdown(
                            f"**Индекс Флеша-Кинкейда:** {color_code_index('Flesch-Kincaid Grade Level', fkgl)}",
                            unsafe_allow_html=True
                        )
                        st.markdown(
                            f"**Индекс тумана Ганнинга:** {color_code_index('Gunning Fog Index', fog)}",
                            unsafe_allow_html=True
                        )
                        st.markdown(
                            f"**Индекс SMOG:** {color_code_index('SMOG Index', smog)}",
                            unsafe_allow_html=True
                        )

                        st.subheader("Сложные слова")
                        st.write(", ".join(set(complex_words_list)))
                finally:
                    # Release the semaphore
                    model_semaphore.release()
            finally:
                # Release analysis lock
                st.session_state.analysis_lock = False

def main():
    setup_nltk()

    # Log the model configuration only once per session
    if 'model_config_logged' not in st.session_state:
        logging.info(f"Using model configuration: {MODEL_CONFIG}")
        st.session_state.model_config_logged = True

    st.title("Translation & Readability Analysis")
    st.sidebar.header("Функциональность")
    functionality = st.sidebar.radio("Выберите режим:", ["Перевод", "Анализ удобочитаемости"])

    if functionality == "Перевод":
        handle_translation()
    elif functionality == "Анализ удобочитаемости":
        handle_readability_analysis()

if __name__ == "__main__":
    main()
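
Note that main.py imports ENV_DEFAULTS and DEFAULT_CONFIG from a config module that is not part of this commit. A minimal sketch of what that module might define, inferred only from the keys read above; every value is an illustrative assumption, not the project's actual configuration:

# config.py (hypothetical sketch; not included in this commit)

# Values copied into os.environ by main.py when a variable is absent
ENV_DEFAULTS = {
    "LOGLEVEL": "WARNING",
}

# Defaults read directly by main.py; only keys referenced there are listed
DEFAULT_CONFIG = {
    "LOGLEVEL": "WARNING",
    "GEMMA_GPU_LAYERS": 20,      # layers offloaded to GPU when the user enables GPU mode
    "GEMMA_CONTEXT_SIZE": 4096,  # context window passed to the Gemma loader
    "MAX_PARALLEL_MODELS": 1,    # size of the model semaphore
    "SESSION_TIMEOUT": 3600,     # stored in MODEL_CONFIG but not otherwise used in main.py
    "ALLOW_GPU": False,          # gates the "Use GPU (faster)" checkbox
}

With a config module like this in place, the app starts the usual Streamlit way: streamlit run main.py.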