asasasaasasa committed (verified)
Commit dee3f9b · Parent: f33f1f9

Upload main.py with huggingface_hub
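For context, an upload like this is typically done with the huggingface_hub client. A minimal sketch, not taken from this commit — the repo_id and repo_type below are placeholders/assumptions:

from huggingface_hub import HfApi

api = HfApi()  # uses the token from `huggingface-cli login` or the HF_TOKEN env var
api.upload_file(
    path_or_fileobj="main.py",    # local file to upload
    path_in_repo="main.py",       # destination path inside the repo
    repo_id="<user>/<repo>",      # placeholder: the target repo is not named on this page
    repo_type="space",            # assumption: the app below looks like a Streamlit Space
    commit_message="Upload main.py with huggingface_hub",
)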

Files changed (1):
  1. main.py +378 -0
main.py (new file, +378 lines):
# main.py

import os
import streamlit as st
import logging
from dotenv import load_dotenv

# Load environment variables first, before any other code
load_dotenv()

# Import configuration defaults (after loading .env to prioritize environment variables)
from config import ENV_DEFAULTS, DEFAULT_CONFIG

# Configure logging based on configuration
log_level = os.environ.get('LOGLEVEL', DEFAULT_CONFIG['LOGLEVEL']).upper()
logging.basicConfig(
    level=getattr(logging, log_level),
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Log to the console unless the level is WARNING, in which case discard output
        logging.StreamHandler() if log_level != 'WARNING' else logging.NullHandler()
    ]
)

# Configure app
st.set_page_config(page_title="Translator & Readability", layout="wide")

# Check for missing environment variables and use defaults from config
for var, default in ENV_DEFAULTS.items():
    if var not in os.environ:
        logging.debug(f"Environment variable {var} not found, using default: {default}")
        os.environ[var] = default

# Model configuration from default config (no hardcoded defaults)
MODEL_CONFIG = {
    "gemma_gpu_layers": DEFAULT_CONFIG["GEMMA_GPU_LAYERS"],
    "gemma_context_size": DEFAULT_CONFIG["GEMMA_CONTEXT_SIZE"],
    "max_parallel_models": DEFAULT_CONFIG["MAX_PARALLEL_MODELS"],
    "session_timeout": DEFAULT_CONFIG["SESSION_TIMEOUT"],
    "allow_gpu": DEFAULT_CONFIG["ALLOW_GPU"]
}

# Initialize model semaphore for limiting concurrent model usage
import threading
model_semaphore = threading.Semaphore(MODEL_CONFIG["max_parallel_models"])

import tempfile
import io
from docx import Document
import uuid
import traceback

from models.nltk_resources import setup_nltk
from utils.file_readers import read_file
from utils.text_processing import detect_language
from utils.readability_indices import (
    flesch_reading_ease,
    flesch_kincaid_grade_level,
    gunning_fog_index,
    smog_index,
    highlight_complex_text
)
from utils.formatting import color_code_index
from utils.tilmash_translation import tilmash_translate, display_tilmash_streaming_translation
from utils.gemma_translation import gemma_translate, display_streaming_translation

# Initialize session state for user identification
if 'session_id' not in st.session_state:
    st.session_state.session_id = str(uuid.uuid4())

if 'translation_lock' not in st.session_state:
    st.session_state.translation_lock = False

def handle_translation():
    st.header("Перевод (Kazakh, Russian, English)")

    # Show session ID in sidebar for debugging
    with st.sidebar.expander("Session Info", expanded=False):
        st.write(f"Session ID: {st.session_state.session_id}")

    # Add GPU usage option (admin only)
    if MODEL_CONFIG["allow_gpu"]:
        st.session_state.use_gpu = st.checkbox("Use GPU (faster)", value=False)
    else:
        st.session_state.use_gpu = False
        st.write("GPU mode disabled by configuration")

    translate_input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
    input_text = ""

    if translate_input_method == "Загрузить файл":
        uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
        if uploaded_file is not None:
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getbuffer())
                temp_file_path = tmp_file.name
            input_text = read_file(temp_file_path)
            os.remove(temp_file_path)
            st.write("**Содержимое файла:**")
            st.write(input_text)
    else:
        input_text = st.text_area("Вставьте ваш текст здесь", height=200)

    if input_text:
        auto_detect = st.checkbox("Автоматически определить язык", value=True)
        src_lang = None
        if auto_detect:
            detected_lang = detect_language(input_text)
            if detected_lang in ['ru','en','kk']:
                st.info(f"Определён язык: {detected_lang}")
                src_lang = detected_lang
            else:
                st.warning("Не удалось определить язык. Выберите вручную.")
                src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])
        else:
            src_lang = st.selectbox("Язык текста", ["ru", "en", "kk"])

        if src_lang == "ru":
            tgt_options = ["en","kk"]
        elif src_lang == "en":
            tgt_options = ["ru","kk"]
        else:
            tgt_options = ["ru","en"]

        tgt_lang = st.selectbox("Перевод на:", tgt_options)

        # Select translation model
        model_option = st.radio("Выберите модель перевода:", ["Gemma 3", "Tilmash"])

        if st.button("Перевести"):
            # Prevent multiple concurrent translations from same session
            if st.session_state.translation_lock:
                st.warning("Перевод уже выполняется. Пожалуйста, дождитесь завершения.")
                return

            # Set translation lock
            st.session_state.translation_lock = True

            try:
                # Use the model semaphore to limit concurrent model access
                acquired = model_semaphore.acquire(blocking=False)
                if not acquired:
                    st.warning("Максимальное количество параллельных моделей достигнуто. Пожалуйста, попробуйте позже.")
                    st.session_state.translation_lock = False
                    return

                try:
                    if model_option == "Tilmash":
                        st.subheader("Результат перевода:")
                        # Get the approximate size of the text to determine if chunking is needed
                        approx_text_size = len(input_text) / 4  # rough approximation (4 chars ≈ 1 token)
                        needs_chunking = approx_text_size > 500  # If text is likely over 500 tokens

                        # Display appropriate spinner message
                        spinner_message = "Processing text in chunks..." if needs_chunking else "Processing translation..."

                        try:
                            # Create a dedicated translator instance for this session
                            from utils.tilmash_translation import TilmashTranslator
                            translator = TilmashTranslator()

                            with st.spinner(spinner_message):
                                try:
                                    # Use direct streaming approach with session-specific translator
                                    result = ""
                                    translation_placeholder = st.empty()

                                    # Stream translation
                                    for chunk in translator.translate_streaming(input_text, src_lang, tgt_lang):
                                        result += chunk
                                        translation_placeholder.markdown(result)

                                except Exception as e:
                                    st.error(f"Translation error: {str(e)}")
                                    logging.error(f"Tilmash translation error: {traceback.format_exc()}")
                                    result = None

                            if result:
                                # Prepare download capability
                                doc = Document()
                                doc.add_paragraph(result)
                                doc_io = io.BytesIO()
                                doc.save(doc_io)
                                doc_io.seek(0)

                                st.download_button(
                                    label="Скачать переведённый текст (.docx)",
                                    data=doc_io,
                                    file_name="translated_text.docx",
                                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                                )
                            else:
                                st.warning("Не удалось выполнить перевод.")

                            # Unload Tilmash model after use
                            try:
                                if translator.initialized:
                                    translator.unload_model()
                            except Exception as unload_error:
                                logging.error(f"Error unloading Tilmash model: {str(unload_error)}")
                        except Exception as tilmash_error:
                            st.error(f"Tilmash model error: {str(tilmash_error)}")
                            logging.error(f"Tilmash model error: {traceback.format_exc()}")
                    else:  # Gemma 3
                        st.subheader("Результат перевода (Gemma 3):")
                        # Create a translator instance to check if text needs chunking
                        from utils.gemma_translation import GemmaTranslator

                        try:
                            # Determine if we should use GPU based on user preference and permissions
                            gpu_layers = MODEL_CONFIG["gemma_gpu_layers"] if getattr(st.session_state, 'use_gpu', False) else 0

                            # Create a new translator instance for this session
                            translator = GemmaTranslator()  # Each session gets its own model instance
                            if not translator.initialized:
                                # Use custom settings from environment
                                translator.load_model(
                                    n_gpu_layers=gpu_layers,
                                    context_size=MODEL_CONFIG["gemma_context_size"]
                                )
                            needs_chunking = translator.is_text_too_large(input_text)

                            # Display appropriate spinner message based on whether chunking is needed
                            gpu_mode = "GPU" if translator.using_gpu else "CPU"
                            spinner_message = f"Processing text in chunks ({gpu_mode} mode)..." if needs_chunking else f"Processing translation ({gpu_mode} mode)..."

                            with st.spinner(spinner_message):
                                try:
                                    # Instead of using the global display_streaming_translation function,
                                    # use a direct approach that keeps the translator instance
                                    result = ""
                                    translation_placeholder = st.empty()

                                    # Stream translation tokens and update UI
                                    for token in translator.translate_streaming(
                                        input_text, src_lang, tgt_lang,
                                        temperature=0.1, top_p=0.95
                                    ):
                                        result += token
                                        translation_placeholder.markdown(result)

                                except Exception as e:
                                    st.error(f"Translation error: {str(e)}")
                                    logging.error(f"Gemma translation error: {traceback.format_exc()}")
                                    result = None

                            if result:
                                # Prepare download capability
                                doc = Document()
                                doc.add_paragraph(result)
                                doc_io = io.BytesIO()
                                doc.save(doc_io)
                                doc_io.seek(0)

                                st.download_button(
                                    label="Скачать переведённый текст (.docx)",
                                    data=doc_io,
                                    file_name="gemma_translated_text.docx",
                                    mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
                                )
                            else:
                                st.warning("Не удалось выполнить перевод с Gemma 3.")

                            # Always unload model after use for better multi-user experience
                            translator.unload_model()
                        except Exception as model_error:
                            st.error(f"Model error: {str(model_error)}")
                            logging.error(f"Model error details: {traceback.format_exc()}")
                finally:
                    # Release the semaphore
                    model_semaphore.release()
            except Exception as outer_error:
                st.error(f"Unexpected error: {str(outer_error)}")
                logging.error(f"Unexpected error: {traceback.format_exc()}")
            finally:
                # Release translation lock
                st.session_state.translation_lock = False

def handle_readability_analysis():
    st.header("Анализ удобочитаемости текста")
    input_method = st.radio("Способ ввода текста:", ["Загрузить файл", "Вставить текст"])
    text = ""

    if input_method == "Загрузить файл":
        uploaded_file = st.file_uploader("Выберите файл (.txt, .docx, .pdf)", type=["txt", "docx", "pdf"])
        if uploaded_file is not None:
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                tmp_file.write(uploaded_file.getbuffer())
                temp_file_path = tmp_file.name
            text = read_file(temp_file_path)
            os.remove(temp_file_path)
            st.write("**Содержимое файла:**")
            st.write(text)
    else:
        text = st.text_area("Вставьте ваш текст здесь", height=200)

    if text:
        auto_detect = st.checkbox("Определить язык автоматически", value=True)
        if auto_detect:
            detected_lang = detect_language(text)
            st.info(f"Определён язык: {detected_lang}")
            lang_code = detected_lang if detected_lang in ['ru','en','kk'] else 'en'
        else:
            lang_code = st.selectbox("Язык текста", ["ru", "en", "kk"])

        if st.button("Анализировать"):
            # Prevent multiple concurrent analyses
            if 'analysis_lock' in st.session_state and st.session_state.analysis_lock:
                st.warning("Анализ уже выполняется. Пожалуйста, дождитесь завершения.")
                return

            # Set analysis lock
            st.session_state.analysis_lock = True

            try:
                # Use the model semaphore for consistency with translation
                acquired = model_semaphore.acquire(blocking=False)
                if not acquired:
                    st.warning("Система загружена. Пожалуйста, попробуйте позже.")
                    st.session_state.analysis_lock = False
                    return

                try:
                    with st.spinner("Выполняется анализ..."):
                        fre = flesch_reading_ease(text, lang_code)
                        fkgl = flesch_kincaid_grade_level(text, lang_code)
                        fog = gunning_fog_index(text, lang_code)
                        smog = smog_index(text, lang_code)
                        highlighted_text, complex_words_list = highlight_complex_text(text, lang_code)

                        st.subheader("Результаты удобочитаемости")
                        st.markdown(
                            f"**Индекс удобочитаемости Флеша:** {color_code_index('Flesch Reading Ease', fre)}",
                            unsafe_allow_html=True
                        )
                        st.markdown(
                            f"**Индекс Флеша-Кинкейда:** {color_code_index('Flesch-Kincaid Grade Level', fkgl)}",
                            unsafe_allow_html=True
                        )
                        st.markdown(
                            f"**Индекс тумана Ганнинга:** {color_code_index('Gunning Fog Index', fog)}",
                            unsafe_allow_html=True
                        )
                        st.markdown(
                            f"**Индекс SMOG:** {color_code_index('SMOG Index', smog)}",
                            unsafe_allow_html=True
                        )

                        st.subheader("Сложные слова")
                        st.write(", ".join(set(complex_words_list)))
                finally:
                    # Release the semaphore
                    model_semaphore.release()
            finally:
                # Release analysis lock
                st.session_state.analysis_lock = False

def main():
    setup_nltk()

    # Log the model configuration only once per session
    if 'model_config_logged' not in st.session_state:
        logging.info(f"Using model configuration: {MODEL_CONFIG}")
        st.session_state.model_config_logged = True

    st.title("Translation & Readability Analysis")
    st.sidebar.header("Функциональность")
    functionality = st.sidebar.radio("Выберите режим:", ["Перевод", "Анализ удобочитаемости"])

    if functionality == "Перевод":
        handle_translation()
    elif functionality == "Анализ удобочитаемости":
        handle_readability_analysis()

if __name__ == "__main__":
    main()
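
Note that main.py imports ENV_DEFAULTS and DEFAULT_CONFIG from a config module that is not part of this commit. A minimal sketch of what that module might define, inferred only from the keys read above; every value is an illustrative assumption, not the project's actual configuration:

# config.py (hypothetical sketch; not included in this commit)

# Values copied into os.environ by main.py when a variable is absent
ENV_DEFAULTS = {
    "LOGLEVEL": "WARNING",
}

# Defaults read directly by main.py; only keys referenced there are listed
DEFAULT_CONFIG = {
    "LOGLEVEL": "WARNING",
    "GEMMA_GPU_LAYERS": 20,      # layers offloaded to GPU when the user enables GPU mode
    "GEMMA_CONTEXT_SIZE": 4096,  # context window passed to the Gemma loader
    "MAX_PARALLEL_MODELS": 1,    # size of the model semaphore
    "SESSION_TIMEOUT": 3600,     # stored in MODEL_CONFIG but not otherwise used in main.py
    "ALLOW_GPU": False,          # gates the "Use GPU (faster)" checkbox
}

With a config module like this in place, the app starts the usual Streamlit way: streamlit run main.py.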