Skip to content

Commit

Permalink
Merge pull request #560 from AdiKalra/main
Browse files: browse the repository at this point in the history
Add custom chunking function.
  • Loading branch information
LarFii authored Jan 9, 2025
2 parents 92b7e33 + acde4ed commit a65f002
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 1 deletion.
7 changes: 6 additions & 1 deletion lightrag/lightrag.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ class LightRAG:
# Add new field for document status storage type
doc_status_storage: str = field(default="JsonDocStatusStorage")

# Custom Chunking Function
chunking_func: callable = chunking_by_token_size
chunking_func_kwargs: dict = field(default_factory=dict)

def __post_init__(self):
log_file = os.path.join("lightrag.log")
set_logger(log_file)
Expand Down Expand Up @@ -388,13 +392,14 @@ async def ainsert(
**dp,
"full_doc_id": doc_id,
}
- for dp in chunking_by_token_size(
+ for dp in self.chunking_func(
doc["content"],
split_by_character=split_by_character,
split_by_character_only=split_by_character_only,
overlap_token_size=self.chunk_overlap_token_size,
max_token_size=self.chunk_token_size,
tiktoken_model=self.tiktoken_model_name,
**self.chunking_func_kwargs,
)
}

Expand Down
1 change: 1 addition & 0 deletions lightrag/operate.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def chunking_by_token_size(
overlap_token_size=128,
max_token_size=1024,
tiktoken_model="gpt-4o",
**kwargs,
):
tokens = encode_string_by_tiktoken(content, model_name=tiktoken_model)
results = []
Expand Down

0 comments on commit a65f002

Please sign in to comment.