"""
RAG retrieval: embed the query and search the vector store.
"""

from .config import load_rag_config, RAGConfig, get_service_config
from .embedding import embed_single, embed_texts
from .vector_store import QdrantVectorStore


def _resolve_search_options(
|
|
sensor_uuid: str | None = None,
|
|
config: RAGConfig | None = None,
|
|
kb_name: str | None = None,
|
|
service_id: str | None = None,
|
|
use_user_embeddings: bool | None = None,
|
|
) -> tuple[RAGConfig, list[str], list[str]]:
|
|
cfg = config or load_rag_config()
|
|
service = get_service_config(service_id, cfg) if service_id else None
|
|
resolved_kb_name = kb_name or (service.knowledge_base if service else None)
|
|
include_user_embeddings = (
|
|
use_user_embeddings
|
|
if use_user_embeddings is not None
|
|
else (service.use_user_embeddings if service else True)
|
|
)
|
|
|
|
sensor_filters = ["__global__"]
|
|
if include_user_embeddings and sensor_uuid:
|
|
sensor_filters.insert(0, sensor_uuid)
|
|
|
|
kb_filters = [resolved_kb_name] if resolved_kb_name else []
|
|
if include_user_embeddings:
|
|
kb_filters.append("__all__")
|
|
|
|
return cfg, sensor_filters, kb_filters
|
|
|
|
|
|
def search_with_query(
|
|
query: str,
|
|
sensor_uuid: str | None = None,
|
|
limit: int = 5,
|
|
score_threshold: float | None = None,
|
|
config: RAGConfig | None = None,
|
|
kb_name: str | None = None,
|
|
service_id: str | None = None,
|
|
use_user_embeddings: bool | None = None,
|
|
) -> list[dict]:
|
|
"""
|
|
کوئری را embed میکند و در vector store جستجو میکند.
|
|
فقط chunks مربوط به sensor_uuid یا __global__ برمیگردد (ایزولهسازی کاربر).
|
|
kb_name: اختیاری — فیلتر بر اساس پایگاه دانش.
|
|
|
|
Args:
|
|
sensor_uuid: شناسه سنسور کاربر — اجباری برای امنیت
|
|
kb_name: نام پایگاه دانش (chat/irrigation/fertilization)
|
|
|
|
Returns:
|
|
لیست نتایج با id, score, text, metadata
|
|
"""
|
|
cfg, sensor_filters, kb_filters = _resolve_search_options(
|
|
sensor_uuid=sensor_uuid,
|
|
config=config,
|
|
kb_name=kb_name,
|
|
service_id=service_id,
|
|
use_user_embeddings=use_user_embeddings,
|
|
)
|
|
|
|
query_vector = embed_single(query, config=cfg)
|
|
store = QdrantVectorStore(config=cfg)
|
|
return store.search(
|
|
query_vector=query_vector,
|
|
limit=limit,
|
|
score_threshold=score_threshold,
|
|
sensor_uuids=sensor_filters,
|
|
kb_names=kb_filters,
|
|
)
|
|
|
|
|
|
def search_with_texts(
|
|
texts: list[str],
|
|
sensor_uuid: str | None = None,
|
|
limit: int = 8,
|
|
per_text_limit: int = 3,
|
|
score_threshold: float | None = None,
|
|
config: RAGConfig | None = None,
|
|
kb_name: str | None = None,
|
|
service_id: str | None = None,
|
|
use_user_embeddings: bool | None = None,
|
|
) -> list[dict]:
|
|
"""
|
|
چند متن را embed میکند و نتیجه جستجوها را به صورت dedupe شده برمیگرداند.
|
|
برای حالتی مناسب است که هم پیام کاربر و هم دادههای مزرعه را علیه KB جستجو کنیم.
|
|
"""
|
|
normalized_texts = [text.strip() for text in texts if text and text.strip()]
|
|
if not normalized_texts:
|
|
return []
|
|
|
|
cfg, sensor_filters, kb_filters = _resolve_search_options(
|
|
sensor_uuid=sensor_uuid,
|
|
config=config,
|
|
kb_name=kb_name,
|
|
service_id=service_id,
|
|
use_user_embeddings=use_user_embeddings,
|
|
)
|
|
|
|
store = QdrantVectorStore(config=cfg)
|
|
vectors = embed_texts(normalized_texts, config=cfg)
|
|
merged_results: dict[str, dict] = {}
|
|
|
|
for vector in vectors:
|
|
results = store.search(
|
|
query_vector=vector,
|
|
limit=per_text_limit,
|
|
score_threshold=score_threshold,
|
|
sensor_uuids=sensor_filters,
|
|
kb_names=kb_filters,
|
|
)
|
|
for item in results:
|
|
current = merged_results.get(item["id"])
|
|
if current is None or item["score"] > current["score"]:
|
|
merged_results[item["id"]] = item
|
|
|
|
return sorted(
|
|
merged_results.values(),
|
|
key=lambda item: item["score"],
|
|
reverse=True,
|
|
)[:limit]
|