Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
from transformers import AutoTokenizer
|
| 3 |
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
|
| 4 |
from huggingface_hub import login
|
| 5 |
from PyPDF2 import PdfReader
|
|
@@ -7,6 +7,7 @@ from docx import Document
|
|
| 7 |
import csv
|
| 8 |
import json
|
| 9 |
import os
|
|
|
|
| 10 |
|
| 11 |
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
| 12 |
|
|
@@ -27,17 +28,57 @@ def load_llm():
|
|
| 27 |
|
| 28 |
llm_engine_hf, tokenizer = load_llm()
|
| 29 |
|
| 30 |
-
|
| 31 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
def
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
def handle_uploaded_file(uploaded_file):
|
| 43 |
try:
|
|
@@ -65,9 +106,23 @@ def handle_uploaded_file(uploaded_file):
|
|
| 65 |
except Exception as e:
|
| 66 |
return str(e)
|
| 67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
# Entrada del usuario
|
| 69 |
user_input = st.text_input("T煤: ", "")
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
# Manejo de archivos subidos
|
| 72 |
uploaded_files = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True)
|
| 73 |
|
|
@@ -76,13 +131,28 @@ if st.button("Enviar"):
|
|
| 76 |
response = generate_response(user_input)
|
| 77 |
st.session_state.generated.append({"user": user_input, "bot": response})
|
| 78 |
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
for chat in st.session_state["generated"]:
|
| 81 |
st.write(f"T煤: {chat['user']}")
|
| 82 |
-
st.write(f"Chatbot: {chat['bot']}")
|
| 83 |
-
|
| 84 |
-
if uploaded_files:
|
| 85 |
-
for uploaded_file in uploaded_files:
|
| 86 |
-
st.write(f"Archivo subido: {uploaded_file.name}")
|
| 87 |
-
file_content = handle_uploaded_file(uploaded_file)
|
| 88 |
-
st.write(file_content)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 3 |
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
|
| 4 |
from huggingface_hub import login
|
| 5 |
from PyPDF2 import PdfReader
|
|
|
|
| 7 |
import csv
|
| 8 |
import json
|
| 9 |
import os
|
| 10 |
+
import torch
|
| 11 |
|
| 12 |
huggingface_token = os.getenv('HUGGINGFACE_TOKEN')
|
| 13 |
|
|
|
|
| 28 |
|
| 29 |
llm_engine_hf, tokenizer = load_llm()
|
| 30 |
|
| 31 |
+
# Classification model setup: Spanish legal-domain Longformer used to label
# uploaded documents. Cached so Streamlit loads it once per process.
@st.cache_resource
def load_classification_model():
    """Load and cache the Spanish legal-text classification model.

    Returns:
        tuple: ``(model, tokenizer)`` — a sequence-classification model in
        eval mode and its matching tokenizer.
    """
    tokenizer = AutoTokenizer.from_pretrained("mrm8488/legal-longformer-base-8192-spanish")
    # num_labels must match the id2label mapping used downstream (5 classes).
    # Without it the checkpoint gets a default 2-label head, so classes 2-4
    # ("contratos", "denuncias", "otros") could never be predicted.
    # NOTE(review): this checkpoint looks like a base (non-fine-tuned) model,
    # so the classification head is randomly initialized — confirm intent.
    model = AutoModelForSequenceClassification.from_pretrained(
        "mrm8488/legal-longformer-base-8192-spanish",
        num_labels=5,
    )
    model.eval()  # inference only; disable dropout once at load time
    return model, tokenizer


classification_model, classification_tokenizer = load_classification_model()
|
| 39 |
+
|
| 40 |
+
# Mapping from the classifier's output indices to human-readable labels.
id2label = {0: "multas", 1: "politicas_de_privacidad", 2: "contratos", 3: "denuncias", 4: "otros"}


def classify_text(text):
    """Classify a Spanish legal document and return an annotated string.

    Args:
        text: plain-text content of the document.

    Returns:
        str: a "Clasificación: <label>" header followed by the original text.
    """
    # Longformer accepts long inputs; truncate/pad to 4096 tokens.
    inputs = classification_tokenizer(
        text, return_tensors="pt", max_length=4096, truncation=True, padding="max_length"
    )
    classification_model.eval()
    with torch.no_grad():  # pure inference — no gradients needed
        outputs = classification_model(**inputs)
    predicted_class_id = outputs.logits.argmax(dim=-1).item()
    # .get() guards against a model head whose label count disagrees with
    # id2label (would otherwise raise KeyError mid-request).
    predicted_label = id2label.get(predicted_class_id, "otros")
    return f"Clasificación: {predicted_label}\n\nDocumento:\n{text}"
|
| 51 |
|
| 52 |
+
def translate(text, target_language):
    """Ask the chat LLM to translate a document.

    Args:
        text: document content to translate.
        target_language: target language name in Spanish (e.g. "inglés").

    Returns:
        str: the model's translation (``.content`` of the LLM response).
    """
    template = '''
Por favor, traduzca el siguiente documento al {LANGUAGE}:
<document>
{TEXT}
</document>
Asegúrese de que la traducción sea precisa y conserve el significado original del documento.
'''
    # str.replace (rather than str.format) so literal braces inside the
    # document cannot break the prompt.
    formatted_prompt = template.replace("{TEXT}", text).replace("{LANGUAGE}", target_language)
    # The original also ran `tokenizer(formatted_prompt, return_tensors="pt")`
    # here but never used the result — dead work, removed.
    outputs = llm_engine_hf.invoke(formatted_prompt)
    return outputs.content
|
| 67 |
+
|
| 68 |
+
def summarize(text, length):
    """Ask the chat LLM for a summary of a document.

    Args:
        text: document content to summarize.
        length: human-readable length spec injected into the prompt
            (e.g. "de aproximadamente 50 palabras").

    Returns:
        str: the model's summary (``.content`` of the LLM response).
    """
    prompt = f'''
Por favor, haga un resumen {length} del siguiente documento:
<document>
{text}
</document>
Asegúrese de que el resumen sea conciso y conserve el significado original del documento.
'''
    # The original also ran `tokenizer(template, return_tensors="pt")` here
    # but never used the result — dead work, removed.
    outputs = llm_engine_hf.invoke(prompt)
    return outputs.content
|
| 82 |
|
| 83 |
def handle_uploaded_file(uploaded_file):
|
| 84 |
try:
|
|
|
|
| 106 |
except Exception as e:
|
| 107 |
return str(e)
|
| 108 |
|
| 109 |
+
st.title("LexAIcon")
st.write("Puedes conversar con este chatbot basado en Mistral7B-Instruct y subir archivos para que el chatbot los procese.")

# Conversation history lives in the Streamlit session so it survives reruns.
if "generated" not in st.session_state:
    st.session_state["generated"] = []
if "past" not in st.session_state:
    st.session_state["past"] = []

# User input
user_input = st.text_input("Tú: ", "")

# Translation options
target_language = st.selectbox("Selecciona el idioma de traducción", ["español", "inglés", "francés", "alemán"])

# Summary options
summary_length = st.selectbox("Selecciona la longitud del resumen", ["corto", "medio", "largo"])

# Uploaded-file handling
uploaded_files = st.file_uploader("Sube un archivo", type=["txt", "pdf", "docx", "csv", "json"], accept_multiple_files=True)
|
| 128 |
|
|
|
|
| 131 |
response = generate_response(user_input)
|
| 132 |
st.session_state.generated.append({"user": user_input, "bot": response})
|
| 133 |
|
| 134 |
+
# Botones de Resumir, Traducir y Explicar
|
| 135 |
+
operation = st.radio("Selecciona una operaci贸n", ["Resumir", "Traducir", "Explicar"])
|
| 136 |
+
|
| 137 |
+
if st.button("Ejecutar"):
|
| 138 |
+
if uploaded_files:
|
| 139 |
+
for uploaded_file in uploaded_files:
|
| 140 |
+
file_content = handle_uploaded_file(uploaded_file)
|
| 141 |
+
if operation == "Resumir":
|
| 142 |
+
if summary_length == "corto":
|
| 143 |
+
length = "de aproximadamente 50 palabras"
|
| 144 |
+
elif summary_length == "medio":
|
| 145 |
+
length = "de aproximadamente 100 palabras"
|
| 146 |
+
elif summary_length == "largo":
|
| 147 |
+
length = "de aproximadamente 500 palabras"
|
| 148 |
+
result = summarize(file_content, length)
|
| 149 |
+
elif operation == "Traducir":
|
| 150 |
+
result = translate(file_content, target_language)
|
| 151 |
+
elif operation == "Explicar":
|
| 152 |
+
result = classify_text(file_content)
|
| 153 |
+
st.write(result)
|
| 154 |
+
|
| 155 |
+
if st.session_state.get("generated"):
|
| 156 |
for chat in st.session_state["generated"]:
|
| 157 |
st.write(f"T煤: {chat['user']}")
|
| 158 |
+
st.write(f"Chatbot: {chat['bot']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|