#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Document loading and chunking utilities.
Provides simple functions to load documents from files and chunk text.
Includes ai4rag-inspired patterns:
- Auto-generated document IDs via SHA256 hash
- Sequence numbering for chunk ordering
- Deduplication via content hashing
"""
import hashlib
import re
from pathlib import Path
from ragit.core.experiment.experiment import Chunk, Document
def generate_document_id(content: str) -> str:
    """
    Derive a short, deterministic document ID from the document text.

    The text is encoded with the default codec (UTF-8), hashed with SHA256,
    and only the leading part of the hex digest is kept.
    Pattern from ai4rag langchain_chunker.py.

    Parameters
    ----------
    content : str
        Document content to hash.

    Returns
    -------
    str
        16-character hex string (first 64 bits of SHA256).

    Examples
    --------
    >>> doc_id = generate_document_id("Hello, world!")
    >>> len(doc_id)
    16
    """
    hasher = hashlib.sha256()
    hasher.update(content.encode())
    full_hex = hasher.hexdigest()
    # 16 hex characters == 64 bits of the digest.
    return full_hex[:16]
def deduplicate_documents(documents: list[Document]) -> list[Document]:
    """
    Drop documents whose content hash has already been seen.

    Each document is identified by ``generate_document_id(doc.content)``;
    input order is preserved and the first document for each hash wins.
    Pattern from ai4rag chroma.py.

    Parameters
    ----------
    documents : list[Document]
        Documents to deduplicate.

    Returns
    -------
    list[Document]
        Unique documents (first occurrence kept).

    Examples
    --------
    >>> unique_docs = deduplicate_documents(docs)
    >>> print(f"Removed {len(docs) - len(unique_docs)} duplicates")
    """
    first_by_hash: dict[str, Document] = {}
    for candidate in documents:
        # setdefault stores the document only when its hash is new, so the
        # earliest occurrence is the one retained (dicts keep insertion order).
        first_by_hash.setdefault(generate_document_id(candidate.content), candidate)
    return list(first_by_hash.values())
[docs]
def load_text(path: str | Path) -> Document:
    """
    Read one UTF-8 text file and wrap it in a Document.

    The document ID is the file name without its final suffix, and the
    metadata records the path (``source``) and the file name (``filename``).

    Parameters
    ----------
    path : str or Path
        Path to the text file (.txt, .md, .rst, etc.)

    Returns
    -------
    Document
        Document with file content and metadata.

    Examples
    --------
    >>> doc = load_text("docs/tutorial.rst")
    >>> print(doc.id, len(doc.content))
    """
    source = Path(path)
    file_info = {"source": str(source), "filename": source.name}
    return Document(
        id=source.stem,
        content=source.read_text(encoding="utf-8"),
        metadata=file_info,
    )
[docs]
def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = False) -> list[Document]:
    """
    Load every file under a directory that matches a glob pattern.

    Matching paths are sorted before loading, and entries that are not
    regular files (e.g. directories) are skipped.

    Parameters
    ----------
    path : str or Path
        Directory path.
    pattern : str
        Glob pattern for files (default: "*.txt").
    recursive : bool
        If True, search recursively (default: False).

    Returns
    -------
    list[Document]
        List of loaded documents.

    Examples
    --------
    >>> docs = load_directory("docs/", "*.rst")
    >>> docs = load_directory("docs/", "**/*.md", recursive=True)
    """
    root = Path(path)
    if recursive:
        matches = root.rglob(pattern)
    else:
        matches = root.glob(pattern)
    return [load_text(entry) for entry in sorted(matches) if entry.is_file()]
[docs]
def chunk_text(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split text into overlapping chunks with rich metadata.

    A window of ``chunk_size`` characters slides over the text in steps of
    ``chunk_size - chunk_overlap``. Each window is whitespace-stripped and
    windows that are empty after stripping are skipped.

    Includes ai4rag-inspired metadata:

    - document_id: SHA256 hash for deduplication and window search
    - sequence_number: Order within the document
    - chunk_start/chunk_end: Character positions of the window in the
      original text (taken before whitespace is stripped from the content)

    Parameters
    ----------
    text : str
        Text to chunk.
    chunk_size : int
        Maximum characters per chunk (default: 512).
    chunk_overlap : int
        Overlap between chunks (default: 50). Must satisfy
        ``0 <= chunk_overlap < chunk_size``.
    doc_id : str, optional
        Document ID for the chunks. If None (or empty), generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of text chunks with metadata.

    Raises
    ------
    ValueError
        If ``chunk_overlap`` is negative or not smaller than ``chunk_size``.

    Examples
    --------
    >>> chunks = chunk_text("Long document...", chunk_size=256)
    >>> print(chunks[0].metadata)
    {'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}
    """
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be less than chunk_size")
    # A negative overlap would leave gaps of unchunked text between windows and
    # would also let a non-positive chunk_size slip past the check above.
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must be non-negative")

    # Generate document ID if not provided
    effective_doc_id = doc_id or generate_document_id(text)

    text_length = len(text)
    # Both checks above guarantee 0 <= chunk_overlap < chunk_size, so stride >= 1.
    stride = chunk_size - chunk_overlap
    chunks: list[Chunk] = []
    for start in range(0, text_length, stride):
        end = min(start + chunk_size, text_length)
        chunk_content = text[start:end].strip()
        if chunk_content:
            # Only emitted chunks are numbered, so the index equals len(chunks).
            sequence_number = len(chunks)
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": sequence_number,
                    "chunk_start": start,
                    "chunk_end": end,
                }
            chunks.append(
                Chunk(
                    content=chunk_content,
                    doc_id=effective_doc_id,
                    chunk_index=sequence_number,
                    metadata=metadata,
                )
            )
        # Once a window reaches the end of the text, any further window would
        # lie entirely inside the overlap region, so stop here.
        if end == text_length:
            break
    return chunks
[docs]
def chunk_document(
    doc: Document,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Chunk a Document's content and attach the document's own metadata.

    The content is split with ``chunk_text`` using ``doc.id`` as the document
    ID. When metadata is requested and the document has any, each chunk's
    metadata becomes the document metadata overlaid with the chunk metadata
    (chunk keys win on conflict).

    Parameters
    ----------
    doc : Document
        Document to chunk.
    chunk_size : int
        Maximum characters per chunk.
    chunk_overlap : int
        Overlap between chunks.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of chunks from the document with metadata.
    """
    pieces = chunk_text(
        doc.content,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        doc_id=doc.id,
        include_metadata=include_metadata,
    )

    # Nothing to merge when the document carries no metadata or the caller
    # asked for bare chunks.
    if not doc.metadata or not include_metadata:
        return pieces

    for piece in pieces:
        combined = dict(doc.metadata)
        combined.update(piece.metadata)  # chunk-level keys take precedence
        piece.metadata = combined
    return pieces
[docs]
def chunk_by_separator(
    text: str,
    separator: str = "\n\n",
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split text by a separator (e.g., paragraphs, sections).

    Each part produced by ``text.split(separator)`` is whitespace-stripped;
    parts that are empty after stripping are skipped. ``chunk_start`` and
    ``chunk_end`` are the offsets of the unstripped part in the original text.

    Parameters
    ----------
    text : str
        Text to split.
    separator : str
        Separator string (default: double newline for paragraphs).
    doc_id : str, optional
        Document ID for the chunks. If None (or empty), generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of chunks with metadata.

    Raises
    ------
    ValueError
        If ``separator`` is empty (raised by ``str.split``).

    Examples
    --------
    >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
    """
    effective_doc_id = doc_id or generate_document_id(text)
    separator_length = len(separator)
    chunks: list[Chunk] = []

    # Offsets are tracked arithmetically: joining the split parts with the
    # separator reproduces the text, so every part begins exactly one
    # separator after the previous part ends. Searching with str.find could
    # instead match the part's text inside a separator or an earlier span.
    part_start = 0
    for part in text.split(separator):
        part_end = part_start + len(part)
        content = part.strip()
        if content:
            # Only emitted chunks are numbered, so the index equals len(chunks).
            sequence_number = len(chunks)
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": sequence_number,
                    "chunk_start": part_start,
                    "chunk_end": part_end,
                }
            chunks.append(
                Chunk(
                    content=content,
                    doc_id=effective_doc_id,
                    chunk_index=sequence_number,
                    metadata=metadata,
                )
            )
        # Advance past this part and the separator that followed it, even when
        # the part was skipped, so later offsets stay exact.
        part_start = part_end + separator_length
    return chunks
[docs]
def chunk_rst_sections(
    text: str,
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split RST document by section headers with rich metadata.

    A header is detected as a non-empty line followed by an underline line
    made of one repeated adornment character (``=``, ``-``, ``~``, etc.).
    The header must be preceded by a newline, so a title on the very first
    line of the text is not a split point and stays in the first chunk. Text
    before the first detected header becomes its own chunk. If no header is
    found, the whole text is returned as a single chunk.

    Parameters
    ----------
    text : str
        RST document text.
    doc_id : str, optional
        Document ID for the chunks. If None (or empty), generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of section chunks with metadata. Empty if the text is blank.
    """
    effective_doc_id = doc_id or generate_document_id(text)

    # Title line followed by an underline of ONE repeated adornment character.
    # The backreference rejects mixed-character lines such as grid-table
    # borders ("+-----+"), which would otherwise be taken for underlines.
    header_pattern = r"\n[^\n]+\n([=\-~`\'\"^_*+#])\1*\n"
    section_starts = [match.start() for match in re.finditer(header_pattern, text)]

    # Consecutive boundaries delimit the preamble (if any) and each section.
    # With no headers this collapses to the single span [0, len(text)].
    boundaries = [0, *section_starts, len(text)]

    chunks: list[Chunk] = []
    for span_start, span_end in zip(boundaries, boundaries[1:]):
        span_content = text[span_start:span_end].strip()
        if not span_content:
            # Covers an empty preamble and whitespace-only spans.
            continue
        # Only emitted chunks are numbered, so the index equals len(chunks).
        sequence_number = len(chunks)
        metadata = {}
        if include_metadata:
            metadata = {
                "document_id": effective_doc_id,
                "sequence_number": sequence_number,
                "chunk_start": span_start,
                "chunk_end": span_end,
            }
        chunks.append(
            Chunk(
                content=span_content,
                doc_id=effective_doc_id,
                chunk_index=sequence_number,
                metadata=metadata,
            )
        )
    return chunks