UPDATE
This commit is contained in:
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
Text chunking for RAG.
|
||||
"""
|
||||
from .config import load_rag_config, RAGConfig
|
||||
|
||||
|
||||
# Approximation: one token is roughly 3–4 characters for Persian/English text.
|
||||
CHARS_PER_TOKEN = 3.5
|
||||
|
||||
|
||||
def chunk_text(
    text: str,
    config: RAGConfig | None = None,
    max_chunk_tokens: int | None = None,
    overlap_tokens: int | None = None,
) -> list[str]:
    """Split text into overlapping chunks sized by an approximate token count.

    Token counts are converted to character counts with ``CHARS_PER_TOKEN``
    and the text is cut with a fixed-size sliding window; word and sentence
    boundaries are not taken into account.

    Args:
        text: Input text. Leading and trailing whitespace is stripped first.
        config: RAG settings that supply the defaults. When omitted,
            ``load_rag_config()`` is called, but only if at least one of the
            overrides below is missing.
        max_chunk_tokens: Maximum tokens per chunk (overrides the config).
        overlap_tokens: Tokens shared by consecutive chunks (overrides the
            config). Negative values are treated as 0; an overlap that would
            leave no forward progress is ignored.

    Returns:
        The non-empty, whitespace-stripped chunks in document order, or an
        empty list when the text is empty or whitespace only.

    Raises:
        ValueError: If the text is non-empty and the maximum chunk size
            resolves to less than one character.
    """
    max_tok = max_chunk_tokens
    overlap = overlap_tokens
    if max_tok is None or overlap is None:
        # Only touch the configuration when a default is actually needed.
        cfg = config or load_rag_config()
        if max_tok is None:
            max_tok = cfg.chunking.max_chunk_tokens
        if overlap is None:
            overlap = cfg.chunking.overlap_tokens

    max_chars = int(max_tok * CHARS_PER_TOKEN)
    # A negative overlap would make the step larger than the window and
    # silently skip text between chunks, so clamp it to zero.
    overlap_chars = max(0, int(overlap * CHARS_PER_TOKEN))

    text = text.strip()
    if not text:
        return []

    if max_chars <= 0:
        # Without this guard the window never advances and the loop below
        # would never terminate.
        raise ValueError(
            f"max_chunk_tokens must yield at least one character, got {max_tok!r}"
        )

    step = max_chars - overlap_chars
    if step <= 0:
        # Overlap is as large as the chunk itself: fall back to no overlap.
        step = max_chars

    chunks: list[str] = []
    text_len = len(text)
    for start in range(0, text_len, step):
        end = start + max_chars
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        if end >= text_len:
            # The window already reached the end of the text; any further
            # window would only repeat a suffix of this chunk.
            break

    return chunks
|
||||
|
||||
|
||||
def chunk_texts(
    texts: list[str],
    config: RAGConfig | None = None,
    **kwargs,
) -> list[str]:
    """Chunk several texts and return all chunks as one flat list.

    Each text is passed to ``chunk_text`` with the same ``config`` and extra
    keyword arguments; the resulting chunks are concatenated in input order.
    """
    return [
        piece
        for document in texts
        for piece in chunk_text(document, config=config, **kwargs)
    ]
|
||||
Reference in New Issue
Block a user