Ankerkraut committed on
Commit
c5c778a
·
1 Parent(s): da45a72
Files changed (1)
  1. app.py +99 -109
app.py CHANGED
@@ -12,116 +12,106 @@ For more information on `huggingface_hub` Inference API support, please check th
 """
 client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
-def init():
-    product_strings = []
-    with open('../Data/products.json', 'r', encoding='utf-8') as f:
-        for product in json.load(f)['products']:
-            product_json = json.dumps(product, indent=4, ensure_ascii=False)
-            tags_ohne = [tag for tag in product['tags'] if "Eigenschaften_ohne" in tag]
-            tags_ohne = [tag.split(" ")[1] for tag in tags_ohne]
-            tags_zutaten = [tag for tag in product['tags'] if "Zutaten_" in tag]
-            tags_zutaten = [tag.split("_")[1] for tag in tags_zutaten]
-            tags_geeignet = [tag for tag in product['tags'] if "Geeignet zum_" in tag]
-            tags_geeignet = [tag.split("_")[1] for tag in tags_geeignet]
-            tags_landestypisch = [tag for tag in product['tags'] if "Landestypisch für_" in tag]
-            tags_landestypisch = [tag.split("_")[1] for tag in tags_landestypisch]
-            tags_geschmack = [tag for tag in product['tags'] if "Geschmack_" in tag]
-            tags_geschmack = [tag.split("_")[1] for tag in tags_geschmack]
-            tags_passtzu = [tag for tag in product['tags'] if "Passt zu_" in tag]
-            tags_passtzu = [tag.split("_")[1] for tag in tags_passtzu]
-            tags_merkmale = [tag for tag in product['tags'] if "Merkmale_" in tag]
-            tags_merkmale = [tag.split("_")[1] for tag in tags_merkmale]
-            tags_sonstige = [tag for tag in product['tags'] if not any(sub in tag for sub in ["Eigenschaften_ohne", "Zutaten_", "Geeignet zum_", "Landestypisch für_", "Geschmack_", "Passt zu_", "Merkmale_"])]
-            tags_ohne_str = ",".join(tags_ohne) if tags_ohne else "nicht bekannt"
-            tags_zutaten = ",".join(tags_zutaten) if tags_zutaten else "nicht bekannt"
-            tags_geeignet = ",".join(tags_geeignet) if tags_geeignet else "nicht bekannt"
-            tags_landestypisch = ",".join(tags_landestypisch) if tags_landestypisch else "nicht bekannt"
-            tags_geschmack = ",".join(tags_geschmack) if tags_geschmack else "nicht bekannt"
-            tags_passtzu = ",".join(tags_passtzu) if tags_passtzu else "nicht bekannt"
-            tags_merkmale = ",".join(tags_merkmale) if tags_merkmale else "nicht bekannt"
-            tags_sonstige = ",".join(tags_sonstige) if tags_sonstige else "nicht bekannt"
-
-            product_string = f"""{product["title"]}; Beschreibung: {product["description"]}; Eigenschaften: Ohne: {tags_ohne_str}; Zutaten: {tags_zutaten}; Geeignet zum: {tags_geeignet}; Landestypisch für: {tags_landestypisch}; Geschmack: {tags_geschmack}; Passt zu: {tags_passtzu}; Merkmale: {tags_merkmale}; Sonstige: {tags_sonstige}; Erstellt am: {product["createdAt"]}"""
-            product_strings.append(product_string)
-    product_strings
-    blogs = []
-    recipe_strings = []
-    with open('../Data/blogs_and_recipes.json', 'r', encoding='utf-8') as f:
-        data = json.load(f)
-        for blog in data['blogs']:
-            if 'Rezepte' in blog['title']:
-                for recipe in blog['articles']:
-                    new_recipe = ""
-                    recipe["body"] = bs4.BeautifulSoup(recipe["body"], 'html.parser')
-                    for metafield in recipe['metafields']:
-                        if metafield['namespace'] == 'recipekit':
-                            metafield['value'] = bs4.BeautifulSoup(metafield['value'], 'html.parser')
-                            value_json = json.loads(metafield['value'].text.replace(",,", ",").replace(",]", "]").replace(",}", "}").strip(","))
-                            title = value_json['recipe_title']
-                            description = value_json['recipe_description']
-                            ingredients = value_json['recipe_ingredients']
-                            category = value_json.get('recipe_category', 'Unbekannt')
-                            cuisine = value_json.get('recipe_cuisine', 'Unbekannt')
-                            ingredients = [ingredient['ingredient'] for ingredient in value_json['recipe_ingredients'] if 'ingredient' in ingredient]
-                            directions = [direction['direction'] for direction in value_json['recipe_directions']]
-                            serving_size = value_json['serving_size']
-                            prep_time = value_json['prep_time']
-                            cook_time = value_json['cook_time']
-                            rating = value_json.get('recipe_rating', 'Keine Bewertung')
-                            new_recipe = f"Titel:{title},\n{description}\nZutaten:{','.join(ingredients)},\nAnweisungen:{' '.join(directions)},\nKategorie:{category},\nKüche:{cuisine},\nPortionen:{serving_size},\nVorbereitungszeit:{prep_time},\nKochzeit:{cook_time},\nBewertung:{rating}"
-                            recipe_strings.append(new_recipe)
-    recipe_strings
-
-    client = QdrantClient(":memory:") #QdrantClient("localhost:6333")
-    client.set_model("sentence-transformers/all-MiniLM-L6-v2")
-    client.set_sparse_model("prithivida/Splade_PP_en_v1")
-
-    client.delete_collection(collection_name="products")
-    client.create_collection(
-        collection_name="products",
-        vectors_config=client.get_fastembed_vector_params(),
-        sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
-    )
-    client.delete_collection(collection_name="recipes")
-    client.create_collection(
-        collection_name="recipes",
-        vectors_config=client.get_fastembed_vector_params(),
-        sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
-    )
-
-    client.add(collection_name="products",
-               documents=product_strings)
-
-    client.add(collection_name="recipes",
-               documents=recipe_strings)
-
-    model_name = "LeoLM/leo-hessianai-13b-chat"
-
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True, # Use 4-bit quantization
-        bnb_4bit_compute_dtype=torch.float16, # Reduce memory usage
-        bnb_4bit_use_double_quant=True,
-        llm_int8_enable_fp32_cpu_offload=True
-    )
-
-    ankerbot_model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map="cuda:0",
-        torch_dtype=torch.float16,
-        use_cache=True,
-        offload_folder="../offload"
-    )
-
-    ankerbot_tokenizer = AutoTokenizer.from_pretrained(model_name,
-                                                       torch_dtype=torch.float16,
-                                                       truncation=True,
-                                                       padding=True, )
-
-    prompt_format = "<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
-
-    generator = pipeline(task="text-generation", model=ankerbot_model, tokenizer=ankerbot_tokenizer, torch_dtype=torch.float16, trust_remote_code=True) # True for flash-attn2 else False
-    generator_mini = pipeline(task="text-generation", model=ankerbot_model, tokenizer=ankerbot_tokenizer, torch_dtype=torch.float16, trust_remote_code=True) # True for flash-attn2 else False
+product_strings = []
+with open('../Data/products.json', 'r', encoding='utf-8') as f:
+    for product in json.load(f)['products']:
+        product_json = json.dumps(product, indent=4, ensure_ascii=False)
+        tags_ohne = [tag for tag in product['tags'] if "Eigenschaften_ohne" in tag]
+        tags_ohne = [tag.split(" ")[1] for tag in tags_ohne]
+        tags_zutaten = [tag for tag in product['tags'] if "Zutaten_" in tag]
+        tags_zutaten = [tag.split("_")[1] for tag in tags_zutaten]
+        tags_geeignet = [tag for tag in product['tags'] if "Geeignet zum_" in tag]
+        tags_geeignet = [tag.split("_")[1] for tag in tags_geeignet]
+        tags_landestypisch = [tag for tag in product['tags'] if "Landestypisch für_" in tag]
+        tags_landestypisch = [tag.split("_")[1] for tag in tags_landestypisch]
+        tags_geschmack = [tag for tag in product['tags'] if "Geschmack_" in tag]
+        tags_geschmack = [tag.split("_")[1] for tag in tags_geschmack]
+        tags_passtzu = [tag for tag in product['tags'] if "Passt zu_" in tag]
+        tags_passtzu = [tag.split("_")[1] for tag in tags_passtzu]
+        tags_merkmale = [tag for tag in product['tags'] if "Merkmale_" in tag]
+        tags_merkmale = [tag.split("_")[1] for tag in tags_merkmale]
+        tags_sonstige = [tag for tag in product['tags'] if not any(sub in tag for sub in ["Eigenschaften_ohne", "Zutaten_", "Geeignet zum_", "Landestypisch für_", "Geschmack_", "Passt zu_", "Merkmale_"])]
+        tags_ohne_str = ",".join(tags_ohne) if tags_ohne else "nicht bekannt"
+        tags_zutaten = ",".join(tags_zutaten) if tags_zutaten else "nicht bekannt"
+        tags_geeignet = ",".join(tags_geeignet) if tags_geeignet else "nicht bekannt"
+        tags_landestypisch = ",".join(tags_landestypisch) if tags_landestypisch else "nicht bekannt"
+        tags_geschmack = ",".join(tags_geschmack) if tags_geschmack else "nicht bekannt"
+        tags_passtzu = ",".join(tags_passtzu) if tags_passtzu else "nicht bekannt"
+        tags_merkmale = ",".join(tags_merkmale) if tags_merkmale else "nicht bekannt"
+        tags_sonstige = ",".join(tags_sonstige) if tags_sonstige else "nicht bekannt"
+        product_string = f"""{product["title"]}; Beschreibung: {product["description"]}; Eigenschaften: Ohne: {tags_ohne_str}; Zutaten: {tags_zutaten}; Geeignet zum: {tags_geeignet}; Landestypisch für: {tags_landestypisch}; Geschmack: {tags_geschmack}; Passt zu: {tags_passtzu}; Merkmale: {tags_merkmale}; Sonstige: {tags_sonstige}; Erstellt am: {product["createdAt"]}"""
+        product_strings.append(product_string)
+product_strings
+blogs = []
+recipe_strings = []
+with open('../Data/blogs_and_recipes.json', 'r', encoding='utf-8') as f:
+    data = json.load(f)
+    for blog in data['blogs']:
+        if 'Rezepte' in blog['title']:
+            for recipe in blog['articles']:
+                new_recipe = ""
+                recipe["body"] = bs4.BeautifulSoup(recipe["body"], 'html.parser')
+                for metafield in recipe['metafields']:
+                    if metafield['namespace'] == 'recipekit':
+                        metafield['value'] = bs4.BeautifulSoup(metafield['value'], 'html.parser')
+                        value_json = json.loads(metafield['value'].text.replace(",,", ",").replace(",]", "]").replace(",}", "}").strip(","))
+                        title = value_json['recipe_title']
+                        description = value_json['recipe_description']
+                        ingredients = value_json['recipe_ingredients']
+                        category = value_json.get('recipe_category', 'Unbekannt')
+                        cuisine = value_json.get('recipe_cuisine', 'Unbekannt')
+                        ingredients = [ingredient['ingredient'] for ingredient in value_json['recipe_ingredients'] if 'ingredient' in ingredient]
+                        directions = [direction['direction'] for direction in value_json['recipe_directions']]
+                        serving_size = value_json['serving_size']
+                        prep_time = value_json['prep_time']
+                        cook_time = value_json['cook_time']
+                        rating = value_json.get('recipe_rating', 'Keine Bewertung')
+                        new_recipe = f"Titel:{title},\n{description}\nZutaten:{','.join(ingredients)},\nAnweisungen:{' '.join(directions)},\nKategorie:{category},\nKüche:{cuisine},\nPortionen:{serving_size},\nVorbereitungszeit:{prep_time},\nKochzeit:{cook_time},\nBewertung:{rating}"
+                        recipe_strings.append(new_recipe)
+recipe_strings
+
+client = QdrantClient(":memory:") #QdrantClient("localhost:6333")
+client.set_model("sentence-transformers/all-MiniLM-L6-v2")
+client.set_sparse_model("prithivida/Splade_PP_en_v1")
+client.delete_collection(collection_name="products")
+client.create_collection(
+    collection_name="products",
+    vectors_config=client.get_fastembed_vector_params(),
+    sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
+)
+client.delete_collection(collection_name="recipes")
+client.create_collection(
+    collection_name="recipes",
+    vectors_config=client.get_fastembed_vector_params(),
+    sparse_vectors_config=client.get_fastembed_sparse_vector_params(),
+)
+client.add(collection_name="products",
+           documents=product_strings)
+client.add(collection_name="recipes",
+           documents=recipe_strings)
+model_name = "LeoLM/leo-hessianai-13b-chat"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True, # Use 4-bit quantization
+    bnb_4bit_compute_dtype=torch.float16, # Reduce memory usage
+    bnb_4bit_use_double_quant=True,
+    llm_int8_enable_fp32_cpu_offload=True
+)
+ankerbot_model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    quantization_config=bnb_config,
+    device_map="cuda:0",
+    torch_dtype=torch.float16,
+    use_cache=True,
+    offload_folder="../offload"
+)
+ankerbot_tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                                   torch_dtype=torch.float16,
+                                                   truncation=True,
+                                                   padding=True, )
+prompt_format = "<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
+generator = pipeline(task="text-generation", model=ankerbot_model, tokenizer=ankerbot_tokenizer, torch_dtype=torch.float16, trust_remote_code=True) # True for flash-attn2 else False
+generator_mini = pipeline(task="text-generation", model=ankerbot_model, tokenizer=ankerbot_tokenizer, torch_dtype=torch.float16, trust_remote_code=True) # True for flash-attn2 else False
 
 def generate_response(query, context, prompts, max_tokens, temperature, top_p):
     system_message_support = f"""<|im_start|>system
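
A note on the recipe parsing this hunk keeps: the recipekit metafields arrive as HTML-wrapped, slightly malformed JSON, and the chained `str.replace` calls patch up double and trailing commas before `json.loads`. A standalone sketch of that cleanup step on a made-up metafield value (the sample string is hypothetical, not from the Ankerkraut data):

```python
import json

import bs4

# Hypothetical recipekit value with the HTML wrapper plus the comma
# defects (",," and ",]") that the chained replaces are meant to repair.
raw = '<div>{"recipe_title": "Chili sin Carne",, "recipe_ingredients": [{"ingredient": "Chili"},]}</div>'

# Same cleanup chain as in app.py: strip HTML, then fix the commas.
text = bs4.BeautifulSoup(raw, "html.parser").text
cleaned = text.replace(",,", ",").replace(",]", "]").replace(",}", "}").strip(",")
value_json = json.loads(cleaned)
print(value_json["recipe_title"])  # -> Chili sin Carne
```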
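
The two Qdrant collections are set up for hybrid retrieval: `set_model` configures the dense encoder, `set_sparse_model` the SPLADE sparse encoder, and `client.add` embeds each document with both. A minimal sketch of how such a collection is queried through the same fastembed integration (requires `qdrant-client[fastembed]`; the sample document and query text are hypothetical):

```python
from qdrant_client import QdrantClient

client = QdrantClient(":memory:")
client.set_model("sentence-transformers/all-MiniLM-L6-v2")
client.set_sparse_model("prithivida/Splade_PP_en_v1")

# add() embeds with both models and creates the collection if needed.
client.add(
    collection_name="products",
    documents=["Beispiel-Gewürz; Beschreibung: hypothetisches Testprodukt"],
)

# query() embeds the query text the same way and fuses dense and
# sparse scores, returning QueryResponse objects.
hits = client.query(
    collection_name="products",
    query_text="Gewürz für Pulled Pork",
    limit=3,
)
for hit in hits:
    print(hit.score, hit.document)
```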
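
For context on the `BitsAndBytesConfig`: loading the 13B LeoLM checkpoint in 4-bit roughly quarters the weight memory relative to fp16, which is why it fits on a single GPU with some CPU offload. A back-of-the-envelope estimate (rough numbers that ignore activations and quantization overhead):

```python
# Rough weight-memory estimate for a 13B-parameter model.
n_params = 13e9
print(f"fp16:  {n_params * 2.0 / 1e9:.1f} GB")  # ~26.0 GB (2 bytes/weight)
print(f"4-bit: {n_params * 0.5 / 1e9:.1f} GB")  # ~6.5 GB (0.5 bytes/weight)
```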
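
`prompt_format` is the ChatML template the LeoLM chat models were tuned on. A hedged sketch of how the `generator` pipeline defined above would be invoked with it (the question and sampling parameters are illustrative assumptions, not values from this commit):

```python
# Assumes the generator pipeline and prompt_format from app.py above.
prompt = prompt_format.format(prompt="Welches Gewürz passt zu Pulled Pork?")

output = generator(
    prompt,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    return_full_text=False,  # return only the assistant completion
)
print(output[0]["generated_text"])
```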