Spaces:

isl-research
/

books-discovery

Runtime error

App Files Files Community

books-discovery / SmartSearch /hybrid_search.py

teddyllm

Upload 50 files

b917edb verified 10 months ago

raw

history blame contribute delete

3.73 kB

	import os
	from typing import List, Dict, Union, Optional, Any
	import numpy as np

	from .embedding_provider import EmbeddingProvider
	from .database.annoydb import AnnoyDB
	from .keyword_search_provider import KeywordSearchProvider

	class HybridSearch:
	    """Rank documents by a weighted blend of semantic and keyword search.

	    On construction every document is embedded with the supplied embedding
	    provider and added to an ``AnnoyDB`` vector index; a
	    ``KeywordSearchProvider`` is built over the same documents. Queries are
	    answered by running both searches and summing their weighted scores
	    per document.
	    """

	    def __init__(
	        self,
	        embedding_provider: "EmbeddingProvider",
	        documents: Optional[List[str]] = None,
	        ann_filepath: Optional[str] = None,
	        semantic_weight: float = 0.7,
	        keyword_weight: float = 0.3,
	    ) -> None:
	        """Build the vector index and the keyword index over ``documents``.

	        Args:
	            embedding_provider: Object exposing ``embed_documents`` and
	                ``embed_query``.
	            documents: Texts to index. Required in practice; the ``None``
	                default is kept only for signature compatibility.
	            ann_filepath: Optional path to an existing index file (see the
	                review note in the body; it is not loaded at present).
	            semantic_weight: Multiplier applied to semantic search scores.
	            keyword_weight: Multiplier applied to keyword search scores.

	        Raises:
	            ValueError: If ``documents`` is ``None``.
	        """
	        # Fail early with a clear message: without documents the index
	        # build below cannot work (zip() over None raises TypeError).
	        if documents is None:
	            raise ValueError(
	                "documents must be provided to build the search index"
	            )

	        self.embedding_provider = embedding_provider
	        self.documents = documents

	        if ann_filepath and os.path.exists(ann_filepath):
	            # NOTE(review): this stores the AnnoyDB *class*, not a loaded
	            # index, and the index is still rebuilt from `documents` below.
	            # Presumably a saved index was meant to be loaded here -
	            # confirm against AnnoyDB's API before changing.
	            self.index = AnnoyDB

	        self.embeddings = self.embedding_provider.embed_documents(documents)

	        # The index dimensionality is taken from the second axis of the
	        # embeddings returned by the provider.
	        self.vector_db = AnnoyDB(embedding_dim=self.embeddings.shape[1])
	        for emb, doc in zip(self.embeddings, documents):
	            self.vector_db.add_data(emb, doc)
	        self.vector_db.build()

	        # Keyword search setup
	        self.keyword_search = KeywordSearchProvider(documents)

	        # Weights for hybrid search
	        self.semantic_weight = semantic_weight
	        self.keyword_weight = keyword_weight

	    def hybrid_search(
	        self, query: str, top_k: int = 5
	    ) -> List[Dict[str, Union[str, float]]]:
	        """Run semantic and keyword search and merge the weighted scores.

	        Each backend is asked for ``top_k`` results; every result is read
	        as a mapping with ``'document'`` and ``'score'`` keys. Documents are
	        used as dictionary keys while merging, so they must be hashable.

	        Args:
	            query: The search query text.
	            top_k: Number of results requested from each backend and the
	                maximum number of merged results returned.

	        Returns:
	            Up to ``top_k`` dicts with the keys ``'document'``,
	            ``'semantic_score'``, ``'keyword_score'`` and ``'hybrid_score'``,
	            ordered by ``'hybrid_score'`` from highest to lowest.
	        """
	        query_embedding = self.embedding_provider.embed_query(query)

	        semantic_results = self.vector_db.search(query_embedding, top_k)
	        keyword_results = self.keyword_search.search(query, top_k)

	        # Per-document score breakdown, keyed by the document itself.
	        combined_results: Dict[Any, Dict[str, float]] = {}

	        for result in semantic_results:
	            weighted = result['score'] * self.semantic_weight
	            combined_results[result['document']] = {
	                'semantic_score': weighted,
	                'keyword_score': 0,
	                'hybrid_score': weighted,
	            }

	        for result in keyword_results:
	            doc = result['document']
	            weighted = result['score'] * self.keyword_weight
	            if doc in combined_results:
	                # Found by both searches: add the keyword contribution.
	                combined_results[doc]['keyword_score'] = weighted
	                combined_results[doc]['hybrid_score'] += weighted
	            else:
	                combined_results[doc] = {
	                    'semantic_score': 0,
	                    'keyword_score': weighted,
	                    'hybrid_score': weighted,
	                }

	        # Flatten each entry into one dict. The previous expression built a
	        # set of two dicts, which raises TypeError (dicts are unhashable).
	        sorted_results = sorted(
	            (
	                {'document': doc, **scores}
	                for doc, scores in combined_results.items()
	            ),
	            key=lambda entry: entry['hybrid_score'],
	            reverse=True,
	        )

	        return sorted_results[:top_k]

	    def set_weights(self, semantic_weight: float, keyword_weight: float) -> None:
	        """
	        Dynamically update search weights.

	        Args:
	            semantic_weight: New weight for semantic search
	            keyword_weight: New weight for keyword search

	        Raises:
	            ValueError: If either weight is outside [0, 1] or the two
	                weights do not sum to 1.0 (within floating-point tolerance).
	        """
	        if not (0 <= semantic_weight <= 1 and 0 <= keyword_weight <= 1):
	            raise ValueError("Weights must be between 0 and 1")

	        if not np.isclose(semantic_weight + keyword_weight, 1.0):
	            raise ValueError("Semantic and keyword weights must sum to 1.0")

	        self.semantic_weight = semantic_weight
	        self.keyword_weight = keyword_weight