Add Qdrant and ChromaDB support to the project
- Added Qdrant service to both docker-compose files for production and development. - Updated environment variables in .env.example and settings.py to include Qdrant configuration. - Included necessary dependencies for Qdrant and ChromaDB in requirements.txt. - Updated .gitignore to exclude ChromaDB data files.
This commit is contained in:
@@ -13,3 +13,11 @@ DB_PORT=3306
|
|||||||
|
|
||||||
# Optional: for running manage.py from host (local DB)
|
# Optional: for running manage.py from host (local DB)
|
||||||
# DB_HOST=127.0.0.1
|
# DB_HOST=127.0.0.1
|
||||||
|
|
||||||
|
# Qdrant Vector DB (RAG)
|
||||||
|
QDRANT_HOST=qdrant
|
||||||
|
QDRANT_PORT=6333
|
||||||
|
|
||||||
|
# Avalai Embedding API (OpenAI-compatible)
|
||||||
|
AVALAI_API_KEY=your-avalai-api-key
|
||||||
|
# AVALAI_BASE_URL=https://api.avalai.ir/v1 # optional, default
|
||||||
|
|||||||
@@ -47,6 +47,9 @@ media/
|
|||||||
staticfiles/
|
staticfiles/
|
||||||
*.pot
|
*.pot
|
||||||
|
|
||||||
|
# RAG / ChromaDB
|
||||||
|
data/chromadb/
|
||||||
|
|
||||||
# Testing / Coverage
|
# Testing / Coverage
|
||||||
.coverage
|
.coverage
|
||||||
htmlcov/
|
htmlcov/
|
||||||
|
|||||||
@@ -0,0 +1,3 @@
|
|||||||
|
# پایگاه دانش CropLogic

فایل‌های `.txt` و `.md` این پوشه به‌صورت خودکار embed و به Qdrant اضافه می‌شوند.
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
# دانش پایه خاک برای کشاورزی
|
||||||
|
|
||||||
|
## انواع خاک
|
||||||
|
خاکها بر اساس بافت (نسبت رس، سیلت و شن) دستهبندی میشوند. خاک رسی زهکشی ضعیفتری دارد و خاک شنی زهکشی سریع. خاک لومی ترکیبی متعادل از هر سه است و برای اغلب گیاهان مناسب است.
|
||||||
|
|
||||||
|
## pH خاک
|
||||||
|
مقیاس pH از ۰ تا ۱۴ است؛ مقدار ۷ خنثی است. خاکهای اسیدی (زیر ۷) و قلیایی (بالای ۷) بر جذب عناصر غذایی تأثیر میگذارند. بیشتر گیاهان زراعی pH حدود ۶ تا ۷.۵ را ترجیح میدهند.
|
||||||
|
|
||||||
|
## رطوبت خاک
|
||||||
|
رطوبت خاک بر رشد ریشه و جذب آب و مواد غذایی تأثیر مستقیم دارد. رطوبت بیش از حد باعث خفگی ریشه و کمبود اکسیژن میشود؛ رطوبت کم باعث تنش آبی و کاهش عملکرد میشود.
|
||||||
|
|
||||||
|
## NPK و عناصر غذایی
|
||||||
|
نیتروژن (N) برای رشد سبزینه و برگها ضروری است. فسفر (P) برای ریشهزایی و گلدهی مهم است. پتاسیم (K) مقاومت به خشکی و بیماری را افزایش میدهد. مقادیر این عناصر در خاک با آزمون خاک قابل اندازهگیری است.
|
||||||
|
|
||||||
|
## هدایت الکتریکی (EC)
|
||||||
|
EC نشاندهنده شوری خاک است. EC بالا یعنی نمک زیاد و میتواند به ریشه گیاه آسیب برساند. واحد آن معمولاً dS/m یا mS/cm است.
|
||||||
|
|
||||||
|
## عمق خاک
|
||||||
|
دادههای خاک معمولاً در اعماق ۰–۵، ۵–۱۵ و ۱۵–۳۰ سانتیمتر اندازهگیری میشوند. لایه سطحی برای جوانهزنی و ریشههای سطحی مهم است؛ لایههای عمیقتر برای گیاهان ریشهعمیق اهمیت دارند.
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
# تنظیمات RAG برای پایگاه دانش CropLogic
|
||||||
|
|
||||||
|
embedding:
|
||||||
|
provider: "avalai" # Avalai API (OpenAI-compatible)
|
||||||
|
model: "text-embedding-3-small"
|
||||||
|
base_url: "https://api.avalai.ir/v1"
|
||||||
|
api_key_env: "AVALAI_API_KEY"
|
||||||
|
batch_size: 32
|
||||||
|
|
||||||
|
# فاز یک: Qdrant بهعنوان vector store
|
||||||
|
qdrant:
|
||||||
|
host: "localhost" # یا qdrant در Docker
|
||||||
|
port: 6333
|
||||||
|
collection_name: "croplogic_kb"
|
||||||
|
vector_size: 1536 # متناسب با text-embedding-3-small
|
||||||
|
|
||||||
|
chunking:
|
||||||
|
max_chunk_tokens: 500
|
||||||
|
overlap_tokens: 50
|
||||||
|
|
||||||
|
tone_file: "config/tone.txt"
|
||||||
|
knowledge_base_path: "config/knowledge_base"
|
||||||
|
user_info_path: "config/user_info"
|
||||||
@@ -20,6 +20,7 @@ INSTALLED_APPS = [
|
|||||||
"django.contrib.staticfiles",
|
"django.contrib.staticfiles",
|
||||||
"rest_framework",
|
"rest_framework",
|
||||||
"corsheaders",
|
"corsheaders",
|
||||||
|
"rag",
|
||||||
"tasks",
|
"tasks",
|
||||||
"soil_data",
|
"soil_data",
|
||||||
"sensor_data",
|
"sensor_data",
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
# فایل لحن / سبک پاسخهای RAG
|
||||||
|
|
||||||
|
لحن و سبک پاسخها:
|
||||||
|
- سطح: دوستانه و تخصصی؛ با کشاورز به زبان ساده و علمی صحبت کن.
|
||||||
|
- واژگان: از اصطلاحات رایج کشاورزی و خاکشناسی استفاده کن، در صورت نیاز معادل فارسی بیاور.
|
||||||
|
- طول: پاسخها مختصر و کاربردی؛ در صورت لزوم با بولت یا شماره ساختاربندی کن.
|
||||||
|
- هشدار: اگر موضوع ایمنی یا سلامتی گیاه/خاک باشد، صریحاً هشدار بده.
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
# اطلاعات کاربران

فایل‌های `.txt` و `.md` این پوشه به‌عنوان اطلاعات هر کاربر embed و ذخیره می‌شوند.
|
||||||
@@ -0,0 +1,44 @@
|
|||||||
|
{
|
||||||
|
"farm": {
|
||||||
|
"name": "مزرعه نمونه گلستان",
|
||||||
|
"location": {
|
||||||
|
"latitude": 36.2,
|
||||||
|
"longitude": 52.5
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"soil_data": {
|
||||||
|
"0-5cm": {
|
||||||
|
"phh2o": 7.2,
|
||||||
|
"clay": 25,
|
||||||
|
"sand": 45,
|
||||||
|
"silt": 30,
|
||||||
|
"soc": 1.4,
|
||||||
|
"nitrogen": 0.12
|
||||||
|
},
|
||||||
|
"5-15cm": {
|
||||||
|
"phh2o": 7.4,
|
||||||
|
"clay": 28,
|
||||||
|
"sand": 42,
|
||||||
|
"silt": 30,
|
||||||
|
"soc": 1.1,
|
||||||
|
"nitrogen": 0.09
|
||||||
|
},
|
||||||
|
"15-30cm": {
|
||||||
|
"phh2o": 7.5,
|
||||||
|
"clay": 30,
|
||||||
|
"sand": 40,
|
||||||
|
"silt": 30,
|
||||||
|
"soc": 0.8,
|
||||||
|
"nitrogen": 0.07
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"sensor_readings": {
|
||||||
|
"soil_moisture": 32,
|
||||||
|
"soil_temperature": 24.5,
|
||||||
|
"soil_ph": 7.1,
|
||||||
|
"electrical_conductivity": 2.1,
|
||||||
|
"nitrogen": 15,
|
||||||
|
"phosphorus": 8,
|
||||||
|
"potassium": 180
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -38,6 +38,16 @@ services:
|
|||||||
container_name: ai-redis
|
container_name: ai-redis
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
|
qdrant:
|
||||||
|
image: qdrant/qdrant:latest
|
||||||
|
container_name: ai-qdrant
|
||||||
|
ports:
|
||||||
|
- "6333:6333"
|
||||||
|
- "6334:6334"
|
||||||
|
volumes:
|
||||||
|
- qdrant_data:/qdrant/storage
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
web:
|
web:
|
||||||
build: .
|
build: .
|
||||||
container_name: ai-web
|
container_name: ai-web
|
||||||
@@ -47,11 +57,15 @@ services:
|
|||||||
DB_HOST: db
|
DB_HOST: db
|
||||||
CELERY_BROKER_URL: redis://redis:6379/0
|
CELERY_BROKER_URL: redis://redis:6379/0
|
||||||
CELERY_RESULT_BACKEND: redis://redis:6379/0
|
CELERY_RESULT_BACKEND: redis://redis:6379/0
|
||||||
|
QDRANT_HOST: qdrant
|
||||||
|
QDRANT_PORT: 6333
|
||||||
depends_on:
|
depends_on:
|
||||||
db:
|
db:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_started
|
condition: service_started
|
||||||
|
qdrant:
|
||||||
|
condition: service_started
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
ports:
|
ports:
|
||||||
- "8020:8000"
|
- "8020:8000"
|
||||||
@@ -75,3 +89,4 @@ services:
|
|||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
ai_mysql_data:
|
ai_mysql_data:
|
||||||
|
qdrant_data:
|
||||||
|
|||||||
@@ -37,6 +37,16 @@ services:
|
|||||||
ports:
|
ports:
|
||||||
- "6380:6379" # host:container — سرویسها داخل شبکه از redis:6379 استفاده میکنند
|
- "6380:6379" # host:container — سرویسها داخل شبکه از redis:6379 استفاده میکنند
|
||||||
|
|
||||||
|
qdrant:
|
||||||
|
image: qdrant/qdrant:latest
|
||||||
|
container_name: ai-qdrant
|
||||||
|
ports:
|
||||||
|
- "6333:6333" # REST API
|
||||||
|
- "6334:6334" # gRPC
|
||||||
|
volumes:
|
||||||
|
- qdrant_data:/qdrant/storage
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
web:
|
web:
|
||||||
build: .
|
build: .
|
||||||
container_name: ai-web
|
container_name: ai-web
|
||||||
@@ -51,11 +61,15 @@ services:
|
|||||||
DB_HOST: db
|
DB_HOST: db
|
||||||
CELERY_BROKER_URL: redis://redis:6379/0
|
CELERY_BROKER_URL: redis://redis:6379/0
|
||||||
CELERY_RESULT_BACKEND: redis://redis:6379/0
|
CELERY_RESULT_BACKEND: redis://redis:6379/0
|
||||||
|
QDRANT_HOST: qdrant
|
||||||
|
QDRANT_PORT: 6333
|
||||||
depends_on:
|
depends_on:
|
||||||
db:
|
db:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_started
|
condition: service_started
|
||||||
|
qdrant:
|
||||||
|
condition: service_started
|
||||||
|
|
||||||
celery:
|
celery:
|
||||||
build: .
|
build: .
|
||||||
@@ -78,3 +92,4 @@ services:
|
|||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
ai_mysql_data:
|
ai_mysql_data:
|
||||||
|
qdrant_data:
|
||||||
|
|||||||
@@ -0,0 +1,7 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class KnowledgeBaseConfig(AppConfig):
    """Django app configuration for the ``knowledge_base`` app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "knowledge_base"
    verbose_name = "Knowledge Base"
|
||||||
@@ -0,0 +1,202 @@
|
|||||||
|
"""
|
||||||
|
تولید chunk متنی از دادههای sensor_data، soil_data و فایل لحن.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
from django.db.models import Prefetch
|
||||||
|
|
||||||
|
from sensor_data.models import SensorData
|
||||||
|
from soil_data.models import SoilDepthData, SoilLocation
|
||||||
|
|
||||||
|
|
||||||
|
# Persian display labels for the canonical soil depth bands.
DEPTH_LABELS_FA = {
    "0-5cm": "۰–۵ سانتیمتر",
    "5-15cm": "۵–۱۵ سانتیمتر",
    "15-30cm": "۱۵–۳۰ سانتیمتر",
}

# Persian display names for soil property codes used on SoilDepthData
# (bdod, cec, phh2o, … — presumably SoilGrids-style codes; verify against
# the soil_data models).
SOIL_FIELD_NAMES_FA = {
    "bdod": "چگالی توده خاک",
    "cec": "ظرفیت تبادل کاتیونی",
    "cfvo": "حجم کسر ریزدانه",
    "clay": "رس",
    "nitrogen": "نیتروژن",
    "ocd": "کربن آلی خاک",
    "ocs": "ذخیره کربن آلی",
    "phh2o": "pH خاک",
    "sand": "ماسه",
    "silt": "لای",
    "soc": "کربن آلی خاک",
    "wv0010": "آب موجود در ۱۰ kPa",
    "wv0033": "آب موجود در ۳۳ kPa",
    "wv1500": "آب موجود در ۱۵۰۰ kPa",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _fmt(val: float | None) -> str:
|
||||||
|
if val is None:
|
||||||
|
return "ندارد"
|
||||||
|
return f"{val:.2f}"
|
||||||
|
|
||||||
|
|
||||||
|
def _soil_depth_to_text(depth: SoilDepthData) -> str:
    """Render one SoilDepthData record as a Persian "name=value" summary."""
    field_order = ("phh2o", "nitrogen", "clay", "sand", "silt", "cec", "soc", "bdod")
    pairs = ((attr, getattr(depth, attr, None)) for attr in field_order)
    parts = [
        f"{SOIL_FIELD_NAMES_FA.get(attr, attr)}={_fmt(value)}"
        for attr, value in pairs
        if value is not None
    ]
    return "، ".join(parts) if parts else "داده خاک موجود نیست."
|
||||||
|
|
||||||
|
|
||||||
|
def _location_to_text(location: SoilLocation) -> str:
    """
    Render a SoilLocation — including its depth records and its related
    sensor readings — as a multi-line Persian description.
    """
    lat = float(location.latitude)
    lon = float(location.longitude)
    lines = [f"موقعیت جغرافیایی: عرض {lat}، طول {lon}."]

    # One line per soil-depth record, using the Persian depth label when known.
    depths = list(location.depths.order_by("depth_label"))
    for d in depths:
        label_fa = DEPTH_LABELS_FA.get(d.depth_label, d.depth_label)
        lines.append(f"دادههای خاک عمق {label_fa}: {_soil_depth_to_text(d)}.")

    # One line per sensor reading; only non-null measurements are included.
    sensors = list(location.sensor_data.all())
    if sensors:
        for s in sensors:
            parts = []
            if s.soil_moisture is not None:
                parts.append(f"رطوبت خاک={_fmt(s.soil_moisture)}")
            if s.soil_temperature is not None:
                parts.append(f"دما={_fmt(s.soil_temperature)}")
            if s.soil_ph is not None:
                parts.append(f"pH={_fmt(s.soil_ph)}")
            if s.electrical_conductivity is not None:
                parts.append(f"هدایت الکتریکی={_fmt(s.electrical_conductivity)}")
            if s.nitrogen is not None:
                parts.append(f"نیتروژن={_fmt(s.nitrogen)}")
            if s.phosphorus is not None:
                parts.append(f"فسفر={_fmt(s.phosphorus)}")
            if s.potassium is not None:
                parts.append(f"پتاسیم={_fmt(s.potassium)}")
            if parts:
                lines.append(
                    f"داده سنسور (location_id={location.id}): "
                    + "، ".join(parts)
                    + "."
                )

    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_tone_file(path: str | Path) -> str:
|
||||||
|
"""بارگذاری محتوای فایل لحن."""
|
||||||
|
path = Path(path)
|
||||||
|
if not path.exists():
|
||||||
|
return ""
|
||||||
|
return path.read_text(encoding="utf-8").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _simple_token_count(text: str) -> int:
|
||||||
|
"""تخمین تعداد توکن با تقسیم بر حدود ۴ کاراکتر."""
|
||||||
|
return max(1, len(text) // 4)
|
||||||
|
|
||||||
|
|
||||||
|
def _chunk_text(
    text: str,
    max_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[str]:
    """
    Split text into chunks based on an estimated token count.

    Blank-line-separated paragraphs are the chunk boundaries; when a chunk
    fills up, its trailing paragraphs (up to ``overlap_tokens`` worth) are
    carried over to the start of the next chunk for context overlap.

    NOTE(review): a single paragraph longer than ``max_tokens`` is never
    split, so individual chunks may exceed the cap — confirm this is intended.
    """
    if not text.strip():
        return []
    # Fast path: the whole text already fits in one chunk.
    if _simple_token_count(text) <= max_tokens:
        return [text.strip()]

    chunks = []
    paragraphs = re.split(r"\n\s*\n", text)
    current = []          # paragraphs accumulated for the chunk in progress
    current_tokens = 0    # estimated token total of `current`

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue
        pt = _simple_token_count(para)
        if current_tokens + pt > max_tokens and current:
            # Emit the full chunk, then seed the next chunk with trailing
            # paragraphs whose combined size fits within the overlap budget.
            chunks.append("\n\n".join(current))
            overlap_text = []
            overlap_sofar = 0
            for p in reversed(current):
                if overlap_sofar + _simple_token_count(p) > overlap_tokens:
                    break
                overlap_text.insert(0, p)
                overlap_sofar += _simple_token_count(p)
            current = overlap_text
            current_tokens = overlap_sofar
        current.append(para)
        current_tokens += pt

    # Flush the last partial chunk.
    if current:
        chunks.append("\n\n".join(current))
    return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def iter_soil_chunks() -> Iterator[tuple[str, dict]]:
    """
    Yield (text, metadata) chunks built from soil_data and sensor_data.

    One chunk is produced per SoilLocation; depth records and sensor
    readings are prefetched to avoid per-location queries.
    """
    locations = (
        SoilLocation.objects.prefetch_related(
            Prefetch("depths", queryset=SoilDepthData.objects.order_by("depth_label")),
            "sensor_data",
        )
        .order_by("id")
    )

    for loc in locations:
        text = _location_to_text(loc)
        if not text.strip():
            # Skip locations that produced no usable text.
            continue
        yield text, {
            "source": "soil_data",
            "location_id": loc.id,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def iter_tone_chunks(tone_path: str | Path, max_tokens: int = 500, overlap: int = 50) -> Iterator[tuple[str, dict]]:
    """Yield (chunk_text, metadata) pairs produced from the tone file."""
    content = _load_tone_file(tone_path)
    if content:
        for piece in _chunk_text(content, max_tokens=max_tokens, overlap_tokens=overlap):
            yield piece, {"source": "tone"}
|
||||||
|
|
||||||
|
|
||||||
|
def build_all_chunks(
    tone_path: str | Path,
    max_chunk_tokens: int = 500,
    overlap_tokens: int = 50,
) -> list[tuple[str, dict]]:
    """
    Collect every (text, metadata) chunk: first soil/sensor data, then the
    tone file.

    Returns:
        List of (text, metadata) tuples, soil chunks before tone chunks.
    """
    soil_chunks = list(iter_soil_chunks())
    tone_chunks = list(
        iter_tone_chunks(tone_path, max_tokens=max_chunk_tokens, overlap_tokens=overlap_tokens)
    )
    return soil_chunks + tone_chunks
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
# نمونه تنظیمات RAG برای پایگاه دانش CropLogic
|
||||||
|
# کپی به rag_config.yaml و در صورت نیاز ویرایش کنید
|
||||||
|
|
||||||
|
embedding:
|
||||||
|
provider: "sentence_transformers" # یا openai
|
||||||
|
model: "paraphrase-multilingual-MiniLM-L12-v2"
|
||||||
|
# برای OpenAI:
|
||||||
|
# provider: "openai"
|
||||||
|
# model: "text-embedding-3-small"
|
||||||
|
# api_key_env: "OPENAI_API_KEY"
|
||||||
|
batch_size: 32
|
||||||
|
|
||||||
|
chromadb:
|
||||||
|
persist_directory: "data/chromadb"
|
||||||
|
collection_name: "croplogic_kb"
|
||||||
|
|
||||||
|
chunking:
|
||||||
|
max_chunk_tokens: 500
|
||||||
|
overlap_tokens: 50
|
||||||
|
|
||||||
|
tone_file: "config/tone.txt"
|
||||||
@@ -0,0 +1,84 @@
|
|||||||
|
"""
|
||||||
|
لایه Embedding سازگار با چند provider (sentence_transformers، openai).
|
||||||
|
"""
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
from .rag_settings import EmbeddingConfig, RAGConfig
|
||||||
|
|
||||||
|
|
||||||
|
class Embedder(Protocol):
    """Structural interface every embedding backend must satisfy."""

    def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]:
        """Embed *texts* and return one float vector per input string."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class SentenceTransformerEmbedder:
    """Embedder backed by the sentence-transformers library."""

    def __init__(self, model_name: str):
        # Lazy import: the dependency is only required when this provider
        # is actually selected.
        from sentence_transformers import SentenceTransformer

        self._model = SentenceTransformer(model_name)

    def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]:
        """Embed *texts*; a progress bar is shown only for >50 inputs."""
        embeddings = self._model.encode(
            texts,
            batch_size=batch_size or 32,
            show_progress_bar=len(texts) > 50,
            convert_to_numpy=True,
        )
        return embeddings.tolist()
|
||||||
|
|
||||||
|
|
||||||
|
class OpenAIEmbedder:
    """Embedder backed by the OpenAI embeddings API."""

    def __init__(self, model_name: str, api_key: str | None = None):
        # Lazy imports: only needed when this provider is selected.
        import os

        from openai import OpenAI

        key = api_key or os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError(
                "OpenAI API key required. Set OPENAI_API_KEY env or pass api_key."
            )
        self._client = OpenAI(api_key=key)
        self._model = model_name

    def encode(self, texts: list[str], batch_size: int | None = None) -> list[list[float]]:
        """Embed *texts*, batching requests to stay under API input limits."""
        # OpenAI limits batch size (max ~2048 inputs); we use smaller batches
        batch_size = min(batch_size or 100, 100)
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch = texts[i : i + batch_size]
            resp = self._client.embeddings.create(
                model=self._model,
                input=batch,
            )
            # NOTE(review): results are appended in response order — assumes
            # the API preserves input order; confirm for the SDK in use.
            for e in resp.data:
                all_embeddings.append(e.embedding)
        return all_embeddings
|
||||||
|
|
||||||
|
|
||||||
|
def get_embedder(config: RAGConfig | EmbeddingConfig) -> Embedder:
    """
    Return the embedder implementation selected by *config*.

    Accepts either a full RAGConfig (its ``embedding`` section is used) or
    an EmbeddingConfig directly. Raises ValueError for unknown providers.
    """
    cfg = config.embedding if isinstance(config, RAGConfig) else config

    provider = cfg.provider
    if provider == "sentence_transformers":
        return SentenceTransformerEmbedder(model_name=cfg.model)

    if provider == "openai":
        import os

        # The API key is read from the env var named by the config, if any.
        key = os.environ.get(cfg.api_key_env) if cfg.api_key_env else None
        return OpenAIEmbedder(model_name=cfg.model, api_key=key)

    raise ValueError(f"Unknown embedding provider: {cfg.provider}")
|
||||||
@@ -0,0 +1,90 @@
|
|||||||
|
"""
|
||||||
|
منطق اصلی indexing: embed کردن chunks و ذخیره در ChromaDB.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .chunks import build_all_chunks
|
||||||
|
from .rag_settings import RAGConfig
|
||||||
|
from .embeddings import get_embedder
|
||||||
|
|
||||||
|
|
||||||
|
# Fallback collection name used when the config does not provide one.
COLLECTION_NAME = "croplogic_kb"


def build_index(config: RAGConfig) -> int:
    """
    Build / fully rebuild the knowledge-base index.

    Generates chunks from soil_data, sensor_data and the tone file, embeds
    them, and stores everything in a fresh ChromaDB collection (any existing
    collection with the same name is dropped first).

    Returns:
        Number of documents added.
    """
    tone_path = Path(config.tone_file)

    chunks = build_all_chunks(
        tone_path=tone_path,
        max_chunk_tokens=config.chunking.max_chunk_tokens,
        overlap_tokens=config.chunking.overlap_tokens,
    )

    if not chunks:
        return 0

    texts = [t for t, _ in chunks]
    metadatas = [m for _, m in chunks]

    # ChromaDB metadata values must be scalars (str/int/float/bool):
    # drop None values and stringify everything else.
    def _serialize_meta(m: dict) -> dict:
        out = {}
        for k, v in m.items():
            if v is None:
                continue
            if isinstance(v, (str, int, float, bool)):
                out[k] = v
            else:
                out[k] = str(v)
        return out

    metadatas = [_serialize_meta(m) for m in metadatas]

    # Embed all chunk texts in config-sized batches.
    embedder = get_embedder(config)
    batch_size = config.embedding.batch_size

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        embs = embedder.encode(batch, batch_size=batch_size)
        all_embeddings.extend(embs)

    # Ensure the ChromaDB persistence directory exists.
    persist_dir = Path(config.chromadb.persist_directory)
    persist_dir.mkdir(parents=True, exist_ok=True)

    # Imported lazily so chromadb is only required when indexing runs.
    import chromadb
    from chromadb.config import Settings as ChromaSettings

    client = chromadb.PersistentClient(
        path=str(persist_dir),
        settings=ChromaSettings(anonymized_telemetry=False),
    )

    collection_name = config.chromadb.collection_name or COLLECTION_NAME
    # Rebuild from scratch: delete any previous collection, ignoring
    # "collection does not exist" errors.
    try:
        client.delete_collection(collection_name)
    except Exception:
        pass

    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )

    # Sequential ids; stable only within a single full rebuild.
    ids = [f"doc_{i}" for i in range(len(texts))]
    collection.add(
        ids=ids,
        embeddings=all_embeddings,
        documents=texts,
        metadatas=metadatas,
    )

    return len(texts)
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
"""
|
||||||
|
دستور CLI برای ساخت index پایگاه دانش.
|
||||||
|
"""
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from knowledge_base.rag_settings import RAGConfig
|
||||||
|
from knowledge_base.indexer import build_index
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command that (re)builds the RAG knowledge-base index."""

    help = "ساخت/بازسازی پایگاه دانش RAG از sensor_data، soil_data و فایل لحن"

    def add_arguments(self, parser):
        """Register the ``--config`` option (path to the YAML RAG config)."""
        parser.add_argument(
            "--config",
            type=str,
            default="config/rag_config.yaml",
            help="مسیر فایل config یامل (پیشفرض: config/rag_config.yaml)",
        )

    def handle(self, *args, **options):
        """Load the config file and run the full index build."""
        config_path = options["config"]
        path = Path(config_path)

        # Resolve relative paths against the current working directory.
        if not path.is_absolute():
            path = Path.cwd() / config_path

        if not path.exists():
            # Report the problem and exit without raising — the command is
            # meant to be re-run after the user creates the config file.
            self.stderr.write(
                self.style.ERROR(f"فایل config یافت نشد: {path}")
            )
            self.stderr.write(
                "یک فایل config از روی config/rag_config.yaml بسازید یا از config/rag_config.example.yaml کپی کنید."
            )
            return

        self.stdout.write("در حال بارگذاری config...")
        config = RAGConfig.load(path)

        self.stdout.write("در حال تولید chunks از soil_data و sensor_data...")
        count = build_index(config)

        self.stdout.write(
            self.style.SUCCESS(f"پایگاه دانش با {count} سند ساخته شد.")
        )
|
||||||
@@ -0,0 +1,25 @@
|
|||||||
|
"""
|
||||||
|
ماژول RAG — پایگاه دانش CropLogic
|
||||||
|
فاز یک: Qdrant بهعنوان vector store
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .chunker import chunk_text, chunk_texts
|
||||||
|
from .client import get_qdrant_client
|
||||||
|
from .config import load_rag_config
|
||||||
|
from .embedding import embed_single, embed_texts
|
||||||
|
from .ingest import ingest, load_sources
|
||||||
|
from .retrieve import search_with_query
|
||||||
|
from .vector_store import QdrantVectorStore
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"chunk_text",
|
||||||
|
"chunk_texts",
|
||||||
|
"embed_single",
|
||||||
|
"embed_texts",
|
||||||
|
"get_qdrant_client",
|
||||||
|
"ingest",
|
||||||
|
"load_rag_config",
|
||||||
|
"load_sources",
|
||||||
|
"QdrantVectorStore",
|
||||||
|
"search_with_query",
|
||||||
|
]
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
from django.apps import AppConfig
|
||||||
|
|
||||||
|
|
||||||
|
class RagConfig(AppConfig):
    """Django app configuration for the Qdrant-backed ``rag`` app."""

    default_auto_field = "django.db.models.BigAutoField"
    name = "rag"
    verbose_name = "RAG - پایگاه دانش"
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
"""
|
||||||
|
تکهتکه کردن متن (Chunking) برای RAG
|
||||||
|
"""
|
||||||
|
from .config import load_rag_config, RAGConfig
|
||||||
|
|
||||||
|
|
||||||
|
# تقریب: هر توکن حدود ۳–۴ نویسه برای فارسی/انگلیسی
|
||||||
|
CHARS_PER_TOKEN = 3.5
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_text(
|
||||||
|
text: str,
|
||||||
|
config: RAGConfig | None = None,
|
||||||
|
max_chunk_tokens: int | None = None,
|
||||||
|
overlap_tokens: int | None = None,
|
||||||
|
) -> list[str]:
|
||||||
|
"""
|
||||||
|
تکهتکه کردن متن بر اساس توکن (تقریبی با نویسه).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: متن ورودی
|
||||||
|
config: تنظیمات RAG
|
||||||
|
max_chunk_tokens: حداکثر توکن هر چانک (override)
|
||||||
|
overlap_tokens: تعداد توکن همپوشانی بین چانکها (override)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
لیست چانکها
|
||||||
|
"""
|
||||||
|
cfg = config or load_rag_config()
|
||||||
|
max_tok = max_chunk_tokens if max_chunk_tokens is not None else cfg.chunking.max_chunk_tokens
|
||||||
|
overlap = overlap_tokens if overlap_tokens is not None else cfg.chunking.overlap_tokens
|
||||||
|
|
||||||
|
max_chars = int(max_tok * CHARS_PER_TOKEN)
|
||||||
|
overlap_chars = int(overlap * CHARS_PER_TOKEN)
|
||||||
|
step = max_chars - overlap_chars
|
||||||
|
|
||||||
|
if step <= 0:
|
||||||
|
step = max_chars
|
||||||
|
|
||||||
|
text = text.strip()
|
||||||
|
if not text:
|
||||||
|
return []
|
||||||
|
|
||||||
|
chunks: list[str] = []
|
||||||
|
start = 0
|
||||||
|
while start < len(text):
|
||||||
|
end = start + max_chars
|
||||||
|
chunk = text[start:end].strip()
|
||||||
|
if chunk:
|
||||||
|
chunks.append(chunk)
|
||||||
|
start += step
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def chunk_texts(
|
||||||
|
texts: list[str],
|
||||||
|
config: RAGConfig | None = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> list[str]:
|
||||||
|
"""چند متن را تکهتکه میکند و همه چانکها را برمیگرداند."""
|
||||||
|
all_chunks: list[str] = []
|
||||||
|
for t in texts:
|
||||||
|
all_chunks.extend(chunk_text(t, config=config, **kwargs))
|
||||||
|
return all_chunks
|
||||||
@@ -0,0 +1,19 @@
|
|||||||
|
"""
|
||||||
|
کلاینت Qdrant — اتصال به دیتابیس وکتور
|
||||||
|
"""
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.http import models as qmodels
|
||||||
|
|
||||||
|
from .config import QdrantConfig, load_rag_config
|
||||||
|
|
||||||
|
|
||||||
|
def get_qdrant_client(config: QdrantConfig | None = None) -> QdrantClient:
    """
    Create a Qdrant client.

    When *config* is None, settings are loaded from rag_config.yaml via
    ``load_rag_config`` (which applies QDRANT_HOST / QDRANT_PORT env
    overrides).
    """
    if config is None:
        rag = load_rag_config()
        config = rag.qdrant

    return QdrantClient(host=config.host, port=config.port)
|
||||||
@@ -0,0 +1,93 @@
|
|||||||
|
"""
|
||||||
|
بارگذاری تنظیمات RAG از rag_config.yaml
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EmbeddingConfig:
    # Embedding backend settings.
    provider: str                    # e.g. "avalai", "openai", "sentence_transformers"
    model: str                       # embedding model name
    batch_size: int = 32             # inputs per embedding request
    api_key_env: str | None = None   # name of the env var holding the API key
    base_url: str | None = None      # override for OpenAI-compatible endpoints


@dataclass
class QdrantConfig:
    # Qdrant connection and collection settings.
    host: str = "localhost"
    port: int = 6333
    collection_name: str = "croplogic_kb"
    # NOTE(review): should match the embedding model's output dimension;
    # load_rag_config() uses 1536 as its default, not 384 — confirm intended.
    vector_size: int = 384


@dataclass
class ChunkingConfig:
    # Token budgets used by the chunker (approximate, character-based).
    max_chunk_tokens: int = 500
    overlap_tokens: int = 50


@dataclass
class RAGConfig:
    # Top-level RAG configuration loaded from rag_config.yaml.
    embedding: EmbeddingConfig
    qdrant: QdrantConfig
    chunking: ChunkingConfig
    tone_file: str = "config/tone.txt"
    knowledge_base_path: str = "config/knowledge_base"
    user_info_path: str = "config/user_info"
    # Optional ChromaDB section kept as a raw mapping (not parsed further here).
    chromadb: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
def load_rag_config(config_path: str | Path | None = None) -> RAGConfig:
    """
    Load RAG settings from YAML, with environment overrides.

    QDRANT_HOST and QDRANT_PORT env vars take precedence over the YAML
    values. When *config_path* is None, ``<package parent>/config/
    rag_config.yaml`` is used (two levels above this module).

    Raises:
        FileNotFoundError: if the config file does not exist.
    """
    if config_path is None:
        base = Path(__file__).resolve().parent.parent
        config_path = base / "config" / "rag_config.yaml"

    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"RAG config not found: {path}")

    with open(path, encoding="utf-8") as f:
        # safe_load is used deliberately; an empty file yields {}.
        data = yaml.safe_load(f) or {}

    emb = data.get("embedding", {})
    embedding = EmbeddingConfig(
        provider=emb.get("provider", "sentence_transformers"),
        model=emb.get("model", "text-embedding-3-small"),
        batch_size=emb.get("batch_size", 32),
        api_key_env=emb.get("api_key_env"),
        base_url=emb.get("base_url"),
    )

    qd = data.get("qdrant", {})
    qdrant = QdrantConfig(
        # Env vars override the YAML values.
        host=os.environ.get("QDRANT_HOST", qd.get("host", "localhost")),
        port=int(os.environ.get("QDRANT_PORT", qd.get("port", 6333))),
        collection_name=qd.get("collection_name", "croplogic_kb"),
        # NOTE(review): default 1536 here differs from QdrantConfig's
        # dataclass default of 384 — confirm which is intended.
        vector_size=qd.get("vector_size", 1536),
    )

    ch = data.get("chunking", {})
    chunking = ChunkingConfig(
        max_chunk_tokens=ch.get("max_chunk_tokens", 500),
        overlap_tokens=ch.get("overlap_tokens", 50),
    )

    return RAGConfig(
        embedding=embedding,
        qdrant=qdrant,
        chunking=chunking,
        tone_file=data.get("tone_file", "config/tone.txt"),
        knowledge_base_path=data.get("knowledge_base_path", "config/knowledge_base"),
        user_info_path=data.get("user_info_path", "config/user_info"),
        chromadb=data.get("chromadb", {}),
    )
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
"""
|
||||||
|
سرویس تعبیهسازی متن با Avalai API (OpenAI-compatible)
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
from typing import overload
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
from .config import load_rag_config, RAGConfig
|
||||||
|
|
||||||
|
|
||||||
|
def _get_avalai_client(config: RAGConfig | None) -> OpenAI:
    """Build an OpenAI-compatible client pointed at the Avalai API.

    Args:
        config: RAG settings; when None, loaded via load_rag_config().

    Returns:
        An OpenAI client configured with the Avalai key and base URL.

    Raises:
        RuntimeError: if the API-key environment variable is unset. Failing
            here names the missing variable, instead of the OpenAI client
            later raising a generic error (or silently falling back to
            OPENAI_API_KEY, which would hit the wrong service).
    """
    cfg = config or load_rag_config()
    emb = cfg.embedding
    env_var = emb.api_key_env or "AVALAI_API_KEY"
    api_key = os.environ.get(env_var)
    if not api_key:
        raise RuntimeError(
            f"Avalai API key not found: set the {env_var} environment variable"
        )
    # Explicit config base_url wins; then the env override; then the public default.
    base_url = emb.base_url or os.environ.get(
        "AVALAI_BASE_URL", "https://api.avalai.ir/v1"
    )
    return OpenAI(api_key=api_key, base_url=base_url)
|
||||||
|
|
||||||
|
|
||||||
|
def embed_texts(
    texts: list[str],
    config: RAGConfig | None = None,
    model: str | None = None,
    dimensions: int | None = None,
) -> list[list[float]]:
    """Embed a list of texts via the Avalai API.

    Args:
        texts: input strings.
        config: RAG settings (defaults to load_rag_config()).
        model: model name override (falls back to the configured model).
        dimensions: embedding dimensionality, only for models that support it.

    Returns:
        One embedding vector per input text, in input order.
    """
    if not texts:
        return []

    cfg = config or load_rag_config()
    client = _get_avalai_client(cfg)
    chosen_model = model or cfg.embedding.model
    step = cfg.embedding.batch_size

    request_kwargs: dict = {}
    if dimensions is not None:
        request_kwargs["dimensions"] = dimensions

    vectors: list[list[float]] = []
    for start in range(0, len(texts), step):
        response = client.embeddings.create(
            model=chosen_model,
            input=texts[start : start + step],
            **request_kwargs,
        )
        # Items may come back out of order; restore input order within the batch.
        ordered = sorted(response.data, key=lambda item: item.index)
        vectors.extend(item.embedding for item in ordered)

    return vectors
|
||||||
|
|
||||||
|
|
||||||
|
def embed_single(text: str, config: RAGConfig | None = None, **kwargs) -> list[float]:
    """Embed a single text; the return value is the vector itself (or [] if none)."""
    result = embed_texts([text], config=config, **kwargs)
    if not result:
        return []
    return result[0]
|
||||||
+148
@@ -0,0 +1,148 @@
|
|||||||
|
"""
|
||||||
|
پایپلاین ورودی RAG: خواندن، چانک، embed و ذخیره در vector store
|
||||||
|
|
||||||
|
سه منبع:
|
||||||
|
۱. لحن (tone)
|
||||||
|
۲. پایگاه دانش (knowledge base)
|
||||||
|
۳. اطلاعات هر کاربر (user info)
|
||||||
|
"""
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .chunker import chunk_text, chunk_texts
|
||||||
|
from .config import load_rag_config, RAGConfig
|
||||||
|
from .embedding import embed_texts
|
||||||
|
from .vector_store import QdrantVectorStore
|
||||||
|
|
||||||
|
# پسوندهای قابل خواندن
|
||||||
|
TEXT_EXTENSIONS = {".txt", ".md", ".rst", ".json"}
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_path(base: Path, p: str) -> Path:
    """Return *p* as a Path, anchoring relative paths at the project *base*."""
    candidate = Path(p)
    return candidate if candidate.is_absolute() else base / candidate
|
||||||
|
|
||||||
|
|
||||||
|
def _load_file(path: Path) -> str | None:
    """Read a text file as UTF-8 and strip surrounding whitespace.

    Returns None when the path is missing, is not a regular file, or is
    unreadable — ingestion should skip bad files rather than crash.
    """
    if not (path.exists() and path.is_file()):
        return None
    try:
        return path.read_text(encoding="utf-8").strip()
    except Exception:
        # Best-effort read: undecodable or unreadable files are skipped.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _load_files_from_dir(dir_path: Path, prefix: str = "kb") -> list[tuple[str, str]]:
    """Collect every readable text file under *dir_path*, recursively.

    Returns:
        [(source_id, content), ...] where source_id is "<prefix>:<relative path>".
        Empty/unreadable files and non-text extensions are skipped.
    """
    if not (dir_path.exists() and dir_path.is_dir()):
        return []
    results: list[tuple[str, str]] = []
    for entry in sorted(dir_path.rglob("*")):
        if not entry.is_file():
            continue
        if entry.suffix.lower() not in TEXT_EXTENSIONS:
            continue
        text = _load_file(entry)
        if text:
            results.append((f"{prefix}:{entry.relative_to(dir_path)}", text))
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _collect_source(base: Path, raw_path: str, prefix: str) -> list[tuple[str, str]]:
    """Gather (source_id, content) pairs from a path that may be a directory or a single file."""
    path = _resolve_path(base, raw_path)
    found = _load_files_from_dir(path, prefix=prefix)
    if path.is_file():
        content = _load_file(path)
        if content:
            found.append((f"{prefix}:{path.name}", content))
    return found


def load_sources(config: RAGConfig | None = None) -> list[tuple[str, str]]:
    """Load the three RAG sources: tone, knowledge base, and per-user info.

    Args:
        config: RAG settings (defaults to load_rag_config()).

    Returns:
        [(source_id, content), ...]
        source_id examples: "tone", "kb:file.txt", "user:profile.txt".
    """
    cfg = config or load_rag_config()
    # Project base dir: parent of the package containing this module.
    base = Path(__file__).resolve().parent.parent
    sources: list[tuple[str, str]] = []

    # 1. Tone — always a single file with a fixed source id.
    tone = _load_file(_resolve_path(base, cfg.tone_file))
    if tone:
        sources.append(("tone", tone))

    # 2. Knowledge base (directory of files, or a single file).
    sources.extend(_collect_source(base, cfg.knowledge_base_path, "kb"))

    # 3. Per-user info (directory of files, or a single file).
    sources.extend(_collect_source(base, cfg.user_info_path, "user"))

    return sources
|
||||||
|
|
||||||
|
|
||||||
|
def ingest(recreate: bool = False, config: RAGConfig | None = None) -> dict:
    """Full ingestion: read sources, chunk them, embed, and push to the vector store.

    Args:
        recreate: when True, drop and recreate the collection first.
        config: RAG settings.

    Returns:
        Ingestion stats (chunk count, source ids, and an error message on failure).
    """
    cfg = config or load_rag_config()
    store = QdrantVectorStore(config=cfg)
    if recreate:
        store.ensure_collection(recreate=True)

    sources = load_sources(config=cfg)
    if not sources:
        return {"chunks_added": 0, "sources": [], "error": "هیچ منبعی یافت نشد"}

    ids: list[str] = []
    texts: list[str] = []
    metas: list[dict] = []
    for source_id, content in sources:
        for index, piece in enumerate(chunk_text(content, config=cfg)):
            ids.append(str(uuid.uuid4()))
            texts.append(piece)
            metas.append({"source": source_id, "chunk_index": index})

    source_ids = [s[0] for s in sources]
    if not texts:
        return {"chunks_added": 0, "sources": source_ids, "error": "هیچ چانکی ساخته نشد"}

    embeddings = embed_texts(texts, config=cfg)
    if len(embeddings) != len(texts):
        # Embedding count must match chunk count one-to-one before upserting.
        return {
            "chunks_added": 0,
            "sources": source_ids,
            "error": f"تعداد embed با چانکها مطابقت ندارد: {len(embeddings)} vs {len(texts)}",
        }

    store.add_documents(
        ids=ids,
        embeddings=embeddings,
        documents=texts,
        metadatas=metas,
    )
    return {"chunks_added": len(texts), "sources": source_ids}
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
"""
|
||||||
|
ورودی RAG: لحن، پایگاه دانش و اطلاعات کاربر را embed و به Qdrant میفرستد.
|
||||||
|
اجرا: python manage.py rag_ingest [--recreate]
|
||||||
|
"""
|
||||||
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
from rag.ingest import ingest
|
||||||
|
|
||||||
|
|
||||||
|
class Command(BaseCommand):
    """Management command: embed tone, knowledge base and user info into Qdrant."""

    help = "Embed لحن، پایگاه دانش و اطلاعات کاربر و ذخیره در Qdrant"

    def add_arguments(self, parser):
        """Register the --recreate flag (drop and rebuild the collection)."""
        parser.add_argument(
            "--recreate",
            action="store_true",
            help="collection را از نو بساز (حذف و ایجاد مجدد)",
        )

    def handle(self, *args, **options):
        """Run ingestion and report the outcome on stdout/stderr."""
        result = ingest(recreate=options.get("recreate", False))
        if "error" in result:
            self.stderr.write(self.style.ERROR(result["error"]))
            return
        message = f"✓ {result['chunks_added']} چانک از منابع {result['sources']} ذخیره شد."
        self.stdout.write(self.style.SUCCESS(message))
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
"""
|
||||||
|
بازیابی RAG: embed کوئری و جستجو در vector store
|
||||||
|
"""
|
||||||
|
from .config import load_rag_config, RAGConfig
|
||||||
|
from .embedding import embed_single
|
||||||
|
from .vector_store import QdrantVectorStore
|
||||||
|
|
||||||
|
|
||||||
|
def search_with_query(
    query: str,
    limit: int = 5,
    score_threshold: float | None = None,
    config: RAGConfig | None = None,
) -> list[dict]:
    """Embed *query* and run a similarity search against the vector store.

    Args:
        query: free-text search query.
        limit: maximum number of hits to return.
        score_threshold: optional minimum similarity score.
        config: RAG settings (defaults to load_rag_config()).

    Returns:
        Result dicts with id, score, text and metadata keys.
    """
    cfg = config or load_rag_config()
    vector = embed_single(query, config=cfg)
    store = QdrantVectorStore(config=cfg)
    return store.search(
        query_vector=vector,
        limit=limit,
        score_threshold=score_threshold,
    )
|
||||||
@@ -0,0 +1,117 @@
|
|||||||
|
"""
|
||||||
|
Qdrant Vector Store — ذخیره و جستجوی وکتورها
|
||||||
|
"""
|
||||||
|
from qdrant_client import QdrantClient
|
||||||
|
from qdrant_client.http import models as qmodels
|
||||||
|
|
||||||
|
from .client import get_qdrant_client
|
||||||
|
from .config import load_rag_config, RAGConfig
|
||||||
|
|
||||||
|
|
||||||
|
class QdrantVectorStore:
    """Store and search document vectors in a Qdrant collection."""

    def __init__(self, config: RAGConfig | None = None):
        self.config = config or load_rag_config()
        self.qdrant = self.config.qdrant
        # Created lazily so importing this module needs no running Qdrant.
        self._client: QdrantClient | None = None

    @property
    def client(self) -> QdrantClient:
        """Connected Qdrant client, created on first access."""
        if self._client is None:
            self._client = get_qdrant_client(self.qdrant)
        return self._client

    def _create_collection(self) -> None:
        """Create the collection with the configured vector size (cosine distance)."""
        self.client.create_collection(
            collection_name=self.qdrant.collection_name,
            vectors_config=qmodels.VectorParams(
                size=self.qdrant.vector_size,
                distance=qmodels.Distance.COSINE,
            ),
        )

    def ensure_collection(self, recreate: bool = False) -> None:
        """Make sure the collection exists; optionally drop and rebuild it.

        Args:
            recreate: when True, delete any existing collection first.
        """
        name = self.qdrant.collection_name
        try:
            self.client.get_collection(name)
        except Exception:
            # NOTE(review): broad catch — assumes any failure means the
            # collection is missing; a connection error would surface again
            # in create_collection below. Consider catching the specific
            # qdrant_client "not found" exception instead.
            self._create_collection()
            return
        if recreate:
            self.client.delete_collection(name)
            self._create_collection()

    def add_documents(
        self,
        ids: list[str],
        embeddings: list[list[float]],
        documents: list[str],
        metadatas: list[dict] | None = None,
    ) -> int:
        """Upsert documents with their embeddings into the collection.

        Metadata values of type str, int, float and bool are kept as-is;
        None values are dropped and anything else is stringified.

        Args:
            ids: point ids (parallel to embeddings/documents).
            embeddings: one vector per document.
            documents: raw chunk texts, stored under the "text" payload key.
            metadatas: optional per-document metadata dicts.

        Returns:
            Number of points upserted.
        """
        self.ensure_collection()
        # Distinct dicts per document — the previous `[{}] * n` aliased one
        # shared dict, so a later mutation would leak into every document.
        metas = metadatas or [{} for _ in ids]

        def _serialize(meta: dict) -> dict:
            clean = {}
            for key, value in meta.items():
                if value is None:
                    continue
                clean[key] = value if isinstance(value, (str, int, float, bool)) else str(value)
            return clean

        payloads = [
            {"text": doc, "doc_id": pid, **_serialize(meta)}
            for doc, meta, pid in zip(documents, metas, ids)
        ]

        self.client.upsert(
            collection_name=self.qdrant.collection_name,
            points=[
                qmodels.PointStruct(id=pid, vector=vec, payload=payload)
                for pid, vec, payload in zip(ids, embeddings, payloads)
            ],
        )
        return len(ids)

    def search(
        self,
        query_vector: list[float],
        limit: int = 5,
        score_threshold: float | None = None,
    ) -> list[dict]:
        """Similarity search by query vector.

        Args:
            query_vector: the embedded query.
            limit: maximum number of hits.
            score_threshold: optional minimum similarity score.

        Returns:
            Dicts with id, score, text, and the remaining payload as metadata.
        """
        hits = self.client.search(
            collection_name=self.qdrant.collection_name,
            query_vector=query_vector,
            limit=limit,
            score_threshold=score_threshold,
        )
        return [
            {
                "id": str(hit.id),
                "score": hit.score,
                "text": hit.payload.get("text", ""),
                "metadata": {k: v for k, v in hit.payload.items() if k != "text"},
            }
            for hit in hits
        ]
|
||||||
@@ -8,3 +8,7 @@ python-dotenv>=1.0,<2
|
|||||||
celery[redis]>=5.4,<6
|
celery[redis]>=5.4,<6
|
||||||
redis>=5.0,<6
|
redis>=5.0,<6
|
||||||
requests>=2.31,<3
|
requests>=2.31,<3
|
||||||
|
openai>=1.0,<2
|
||||||
|
chromadb>=0.4,<0.5
|
||||||
|
qdrant-client>=1.7,<2
|
||||||
|
pyyaml>=6.0
|
||||||
|
|||||||
Reference in New Issue
Block a user