diff --git a/lightrag/lightrag.py b/lightrag/lightrag.py index 9905ee74..596fbdbf 100644 --- a/lightrag/lightrag.py +++ b/lightrag/lightrag.py @@ -187,6 +187,10 @@ class LightRAG: # Add new field for document status storage type doc_status_storage: str = field(default="JsonDocStatusStorage") + # Custom Chunking Function + chunking_func: callable = chunking_by_token_size + chunking_func_kwargs: dict = field(default_factory=dict) + def __post_init__(self): log_file = os.path.join("lightrag.log") set_logger(log_file) @@ -388,13 +392,14 @@ async def ainsert( **dp, "full_doc_id": doc_id, } - for dp in chunking_by_token_size( + for dp in self.chunking_func( doc["content"], split_by_character=split_by_character, split_by_character_only=split_by_character_only, overlap_token_size=self.chunk_overlap_token_size, max_token_size=self.chunk_token_size, tiktoken_model=self.tiktoken_model_name, + **self.chunking_func_kwargs, ) } diff --git a/lightrag/operate.py b/lightrag/operate.py index 09871659..7216c07f 100644 --- a/lightrag/operate.py +++ b/lightrag/operate.py @@ -39,6 +39,7 @@ def chunking_by_token_size( overlap_token_size=128, max_token_size=1024, tiktoken_model="gpt-4o", + **kwargs, ): tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model) results = []