update db
makcrx committed · Commit 9647155 · 1 Parent(s): 3334b06
Files changed:
- app.py +1 -1
- faiss_qa_2023-08-09/index.faiss +3 -0
- faiss_qa_2023-08-09/index.pkl +3 -0
- test.ipynb +12 -10
- test.py +64 -0
app.py CHANGED
@@ -5,7 +5,7 @@ import reranking
 from extract_keywords import init_keyword_extractor, extract_keywords
 
 embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
-db = FAISS.load_local('
+db = FAISS.load_local('faiss_qa_2023-08-09', embeddings)
 init_keyword_extractor()
 
 def main(query):
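This one-line change points app.py at the index directory added in this commit, so the Space loads the freshly rebuilt vectors at startup. A minimal sketch of how that index can be queried through LangChain's FAISS wrapper; the query string, the k value, and the similarity_search_with_score call are illustrative assumptions, not part of the commit:

# Sketch only: querying the index that app.py now loads.
from langchain.vectorstores import FAISS
from langchain.embeddings import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-MiniLM-L6-cos-v1")
db = FAISS.load_local('faiss_qa_2023-08-09', embeddings)

# Each hit carries the answer and articleId that test.py stores as document metadata.
for doc, score in db.similarity_search_with_score('яндекс доставка', k=3):
    print(score, doc.metadata['articleId'], doc.metadata['answer'])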
faiss_qa_2023-08-09/index.faiss ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7690fb7aa21b8d325e3ce1a9f8fb241dc597aa06df042bf242c522433243b93f
+size 576045
faiss_qa_2023-08-09/index.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:09f3705caef79861035ed026855cefb872c386e87938d73a5deb57479507364d
+size 265781
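Both index files are committed as Git LFS pointers: only the three-line pointer (spec version, sha256 oid, byte size) is stored in git, while the actual binaries (576045 and 265781 bytes) live in LFS storage. A small sketch, assuming nothing beyond the standard pointer layout shown above, of reading one back:

# Sketch: parse a git-lfs pointer file into its oid and size fields.
def parse_lfs_pointer(path):
    with open(path) as f:
        fields = dict(line.strip().split(' ', 1) for line in f if ' ' in line)
    return fields['oid'], int(fields['size'])

oid, size = parse_lfs_pointer('faiss_qa_2023-08-09/index.faiss')
print(oid, size)  # sha256:7690fb7a... 576045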
test.ipynb CHANGED
@@ -9,6 +9,7 @@
 "import sqlite3, json\n",
 "from contextlib import closing\n",
 "\n",
+"# use test.py to update questions in db!!!\n",
 "def load_questions(sqlite_filename):\n",
 "    all_questions = []\n",
 "    with closing(sqlite3.connect(sqlite_filename)) as db:\n",
@@ -82,30 +83,31 @@
 },
 {
 "cell_type": "code",
-"execution_count":
+"execution_count": 5,
 "metadata": {},
 "outputs": [
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
+"/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
+" warnings.warn(\n",
 "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n",
 " warnings.warn(\n"
 ]
 },
 {
-"
-
-
-]
-
-
-"metadata": {},
-"output_type": "execute_result"
 }
 ],
 "source": [
-"extract_keywords('
+"print(extract_keywords('яд экспресс'))\n",
+"print(extract_keywords('яндекс.доставка'))"
 ]
 },
 {
test.py ADDED
@@ -0,0 +1,64 @@
+import sqlite3, json
+from contextlib import closing
+
+# change THIS
+output_dir = 'faiss_qa_2023-08-09'
+model_name = "multi-qa-MiniLM-L6-cos-v1"
+
+punctuation = '!"#\'(),:;?[]^`}{'
+punctuation2 = '-/&._~+*=@<>[]\\'
+remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)
+
+def add_special_questions(questions):
+    questions.append({
+        "question": "Позови человека/менеджера",
+        "query": "Позови человека/менеджера",
+        "answer": "Переключаю на сотрудника, ожидайте",
+        "articleId": 0,
+    })
+
+def load_questions(sqlite_filename):
+    all_questions = []
+    with closing(sqlite3.connect(sqlite_filename)) as db:
+        db.row_factory = sqlite3.Row
+        with closing(db.cursor()) as cursor:
+            results = cursor.execute(
+                "SELECT id, articleId, title, category, section, questions FROM articles WHERE articleType = ? AND doNotUse IS NULL OR doNotUse = 0",
+                ('article',)
+            ).fetchall()
+
+            for res in results:
+
+                questions = json.loads(res['questions'])
+                for q in questions:
+                    q['query'] = " ".join(res['section'].split() + res['title'].split() + q['question'].split()).translate(remove_punctuation).lower()
+                    q['articleId'] = res['articleId']
+                all_questions += questions
+
+    add_special_questions(all_questions)
+
+    return all_questions
+
+print("Loading questions from db...")
+questions = load_questions("omnidesk-ai-chatgpt-questions.sqlite")
+
+# print(questions[0])
+
+from langchain.vectorstores import FAISS
+from langchain.docstore.document import Document
+from langchain.embeddings import SentenceTransformerEmbeddings
+
+docs = [
+    Document(page_content=q['query'], metadata={ 'answer': q['answer'], 'articleId': q['articleId'] })
+    for q in questions
+]
+
+print(f"Loading embeddings model {model_name}...")
+embeddings = SentenceTransformerEmbeddings(model_name=model_name)
+
+print("embedding documents...")
+
+db = FAISS.from_documents(docs, embeddings)
+db.save_local(output_dir)
+
+print('Saved!')
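Two notes on test.py, which the notebook comment above marks as the way to rebuild the index. First, the remove_punctuation table does double duty: characters in punctuation2 become spaces (so 'яндекс.доставка' still splits into two tokens), while characters in punctuation are deleted outright; '[' and ']' appear in both strings, and the deletion set takes precedence in str.maketrans. A quick illustration of that behavior:

# Illustration of the normalization test.py applies to every query string.
punctuation = '!"#\'(),:;?[]^`}{'
punctuation2 = '-/&._~+*=@<>[]\\'
remove_punctuation = str.maketrans(punctuation2, ' ' * len(punctuation2), punctuation)

print('Яндекс.Доставка (экспресс)!'.translate(remove_punctuation).lower())
# -> 'яндекс доставка экспресс'

Second, the rebuild flow implied by this commit appears to be: refresh omnidesk-ai-chatgpt-questions.sqlite, run test.py to regenerate the faiss_qa_* directory, and commit that directory alongside the updated load_local path in app.py.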