"""Text splitting utilities for document chunking."""
from typing import List, Optional

from langchain_core.documents import Document as LangChainDocument
from langchain_text_splitters import RecursiveCharacterTextSplitter
class TextSplitter:
    """Wrapper around LangChain's ``RecursiveCharacterTextSplitter`` with configurable parameters."""

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Initialize the text splitter.

        Args:
            chunk_size: Maximum size of each text chunk.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Separators are tried in order (paragraph, line, word, character),
        # so chunks break at the most natural boundary that fits chunk_size.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )

    def split_text(self, text: str) -> List[str]:
        """Split raw text into chunks.

        Args:
            text: The text to split.

        Returns:
            List[str]: The list of text chunks.
        """
        return self.splitter.split_text(text)

    def split_documents(self, documents: List[LangChainDocument]) -> List[LangChainDocument]:
        """Split documents into chunks.

        Args:
            documents: List of LangChain documents.

        Returns:
            List[LangChainDocument]: The chunked documents.
        """
        return self.splitter.split_documents(documents)

    def create_documents(
        self,
        texts: List[str],
        # Fixed annotation: the default is None, so the type must be Optional.
        metadatas: Optional[List[dict]] = None,
    ) -> List[LangChainDocument]:
        """Create LangChain documents from texts and split them.

        Args:
            texts: List of raw texts.
            metadatas: Optional list of per-text metadata dicts.

        Returns:
            List[LangChainDocument]: The chunked documents.
        """
        return self.splitter.create_documents(texts, metadatas=metadatas)