Commit e12639d
1 Parent(s): 6010e28

Simple Recommender complete

Files changed:
- App/app.py +0 -0
- App/tfidfrecommender.py +4 -1
- app.py +26 -5
App/app.py
DELETED
File without changes
App/tfidfrecommender.py
CHANGED

@@ -82,7 +82,7 @@ class TfidfRecommender:
             lambda x: self.__clean_text(x, for_BERT)
         )

-    def tokenize_text(self, ngram_range=(1, 3), min_df=0):
+    def tokenize_text(self, ngram_range=(1, 3), min_df=0.0):
         """Tokenize the input text.

         Args:
@@ -181,11 +181,14 @@ class TfidfRecommender:
         return self.stop_words

     def recommend_k_items(self, title, k):
+        print("jjj")
         idx = self.df[self.df['title'] == title].index[0]
+        print("ppp")
         cosine_sim = cosine_similarity(self.tfidf_matrix[int(idx)], self.tfidf_matrix)
         similarity_scores = list(enumerate(cosine_sim[0]))
         similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
         similarity_scores = similarity_scores[1: k + 1]
+        print("lol")
         movie_indices = [i[0] for i in similarity_scores]
         return self.df.iloc[movie_indices]['id']
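For context, a minimal standalone sketch of the idea behind recommend_k_items: build a TF-IDF matrix over the item descriptions, rank every row by cosine similarity to the row matching the query title, and return the top-k ids. The toy DataFrame, its column names, and the example titles below are illustrative assumptions, not the Space's actual data or the library implementation.

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy catalog standing in for the descriptions.csv used by the app.
df = pd.DataFrame({
    "id": [1, 2, 3],
    "title": ["Alien", "Aliens", "Toy Story"],
    "description": [
        "crew of a space ship encounters a deadly alien creature",
        "marines return to the alien planet to fight the creatures",
        "toys come to life and go on an adventure",
    ],
})

# Same vectorizer settings as the changed tokenize_text signature (1-3 grams, min_df=0.0).
tfidf_matrix = TfidfVectorizer(ngram_range=(1, 3), min_df=0.0).fit_transform(df["description"])

def recommend_k(title, k):
    # Row index of the query title, then cosine similarity against all rows.
    idx = df[df["title"] == title].index[0]
    sims = cosine_similarity(tfidf_matrix[int(idx)], tfidf_matrix)[0]
    # Sort by similarity descending and keep the k best matches.
    order = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)[1: k + 1]
    return df.iloc[[i for i, _ in order]]["id"]

print(recommend_k("Alien", 2).to_list())  # expected to rank "Aliens" first

The [1: k + 1] slice skips position 0, which is the query item itself (similarity 1.0), so the item never recommends itself.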
app.py
CHANGED

@@ -10,24 +10,45 @@ desc = pd.read_csv('App/data/descriptions.csv')

 rec = TfidfRecommender(desc, 'id', 'description', "none")
 def initialize_and_tokenize(tokenizer):
+    print("tok")
     rec.tokenization_method = tokenizer
     rec.tokenize_text()

 names = []
 def recommend(movies, tok):
-
+    rec.tokenization_method = tok
+    tf, vecs = rec.tokenize_text()
+    rec.fit(tf, vecs)
+    print("rec")
     pool = concurrent.futures.ThreadPoolExecutor(max_workers=10)
     futures = [pool.submit(rec.recommend_k_items, movie, 5) for movie in movies]
-    idss = [
+    idss = []
+    print("after submit")
+    for i in range(len(futures)):
+        print("res")
+        idss.append(futures[i].result())
+    print("shutdown")
+    pool.shutdown(wait=True)
     ids = [id for ids in idss for id in ids]
     ids = list(set(ids))
     names = desc[desc['id'].isin(ids)]['title'].to_list()
     return ', '.join(names)

-
-
+def recom(movies, tok):
+    rec.tokenization_method = tok
+    tf, vecs = rec.tokenize_text()
+    rec.fit(tf, vecs)
+    print(movies[0])
+    ids = rec.recommend_k_items(movies[0], 5)
+    print("reccc")
+    # ids = list(set(ids))
+    names = desc[desc['id'].isin(ids)]['title'].to_list()
+    return ', '.join(names)
+
+demo = gr.Interface(fn=recom,
+                    inputs=[gr.Dropdown(choices=list(desc['title'][:20]), multiselect=True, max_choices=3, label="Movies"),
                             gr.Radio(["bert", "scibert", "nltk", "none"], value="none", label="Tokenization and text preprocess")],
-                    outputs=gr.Textbox())
+                    outputs=gr.Textbox(label="Recommended"))
 demo.launch()

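For context, a minimal runnable sketch of the Gradio wiring this commit ends up with: a multi-select movie Dropdown plus a tokenizer Radio feed a function that returns one comma-separated string into a Textbox. The placeholder titles and the dummy recommendation logic below are illustrative assumptions standing in for desc['title'][:20] and the TfidfRecommender calls.

import gradio as gr

# Stand-in for the first 20 titles loaded from descriptions.csv.
titles = ["Toy Story", "Jumanji", "Heat", "Casino"]

def recom(movies, tok):
    # Real app: set rec.tokenization_method = tok, refit the TF-IDF model,
    # then call rec.recommend_k_items on the first selected title.
    movies = movies or []  # multiselect Dropdown yields a list (or None)
    return ", ".join(f"{m} (via {tok})" for m in movies)

demo = gr.Interface(
    fn=recom,
    inputs=[
        gr.Dropdown(choices=titles, multiselect=True, max_choices=3, label="Movies"),
        gr.Radio(["bert", "scibert", "nltk", "none"], value="none", label="Tokenization and text preprocess"),
    ],
    outputs=gr.Textbox(label="Recommended"),
)

if __name__ == "__main__":
    demo.launch()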