# langchain-learning-kit/app/utils/text_splitter.py

"""
用于文档分块的文本分割工具
"""
from typing import List, Optional

from langchain.schema import Document as LangChainDocument
from langchain.text_splitter import RecursiveCharacterTextSplitter
class TextSplitter:
    """Wrapper around LangChain's RecursiveCharacterTextSplitter with
    configurable chunking parameters.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Initialize the text splitter.

        Args:
            chunk_size: Maximum size of each text chunk.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Separators are tried in order: paragraph breaks first, then line
        # breaks, then spaces, finally splitting at any character ("").
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

    def split_text(self, text: str) -> List[str]:
        """Split raw text into chunks.

        Args:
            text: The text to split.

        Returns:
            List[str]: List of text chunks.
        """
        return self.splitter.split_text(text)

    def split_documents(self, documents: List[LangChainDocument]) -> List[LangChainDocument]:
        """Split documents into chunks.

        Args:
            documents: List of LangChain documents.

        Returns:
            List[LangChainDocument]: List of chunked documents.
        """
        return self.splitter.split_documents(documents)

    def create_documents(
        self,
        texts: List[str],
        # Fixed annotation: the default is None, so the type must be Optional.
        metadatas: Optional[List[dict]] = None,
    ) -> List[LangChainDocument]:
        """Create LangChain documents from texts and split them.

        Args:
            texts: List of texts.
            metadatas: Optional list of metadata dicts, one per text.

        Returns:
            List[LangChainDocument]: List of chunked documents.
        """
        return self.splitter.create_documents(texts, metadatas=metadatas)