Source code for ragit.loaders

#
# Copyright RODMENA LIMITED 2025
# SPDX-License-Identifier: Apache-2.0
#
"""
Document loading and chunking utilities.

Provides simple functions to load documents from files and chunk text.

Includes ai4rag-inspired patterns:
- Auto-generated document IDs via SHA256 hash
- Sequence numbering for chunk ordering
- Deduplication via content hashing
"""

import hashlib
import re
from pathlib import Path

from ragit.core.experiment.experiment import Chunk, Document


def generate_document_id(content: str) -> str:
    """
    Derive a short, deterministic identifier from document text.

    The text is encoded as UTF-8, hashed with SHA256, and the hexadecimal
    digest is cut down to its leading 16 characters.

    Pattern from ai4rag langchain_chunker.py.

    Parameters
    ----------
    content : str
        Document content to hash.

    Returns
    -------
    str
        16-character hex string (first 64 bits of SHA256).

    Examples
    --------
    >>> doc_id = generate_document_id("Hello, world!")
    >>> len(doc_id)
    16
    """
    id_length = 16  # 16 hex characters == 64 bits of the digest
    hasher = hashlib.sha256()
    hasher.update(content.encode("utf-8"))
    return hasher.hexdigest()[:id_length]


def deduplicate_documents(documents: list[Document]) -> list[Document]:
    """
    Drop documents whose content hash has already been seen.

    Two documents count as duplicates when ``generate_document_id`` returns
    the same value for their ``content``. The relative order of the
    surviving documents is unchanged.

    Pattern from ai4rag chroma.py.

    Parameters
    ----------
    documents : list[Document]
        Documents to deduplicate.

    Returns
    -------
    list[Document]
        Unique documents (first occurrence kept).

    Examples
    --------
    >>> unique_docs = deduplicate_documents(docs)
    >>> print(f"Removed {len(docs) - len(unique_docs)} duplicates")
    """
    first_by_hash: dict[str, Document] = {}

    for document in documents:
        # setdefault stores only when the hash is new, so the first
        # document with a given hash wins; dicts keep insertion order.
        first_by_hash.setdefault(generate_document_id(document.content), document)

    return list(first_by_hash.values())


def load_text(path: str | Path) -> Document:
    """
    Read one text file and wrap it in a Document.

    The file is decoded as UTF-8. The document ID is the file name without
    its final suffix, and the metadata records the path and file name.

    Parameters
    ----------
    path : str or Path
        Path to the text file (.txt, .md, .rst, etc.)

    Returns
    -------
    Document
        Document with file content and metadata.

    Examples
    --------
    >>> doc = load_text("docs/tutorial.rst")
    >>> print(doc.id, len(doc.content))
    """
    source = Path(path)
    with source.open(encoding="utf-8") as handle:
        text = handle.read()

    file_info = {"source": str(source), "filename": source.name}
    return Document(id=source.stem, content=text, metadata=file_info)
def load_directory(path: str | Path, pattern: str = "*.txt", recursive: bool = False) -> list[Document]:
    """
    Load every file in a directory that matches a glob pattern.

    Matches are visited in sorted path order; entries that are not regular
    files (for example, directories whose name matches) are skipped.

    Parameters
    ----------
    path : str or Path
        Directory path.
    pattern : str
        Glob pattern for files (default: "*.txt").
    recursive : bool
        If True, search recursively (default: False).

    Returns
    -------
    list[Document]
        List of loaded documents.

    Examples
    --------
    >>> docs = load_directory("docs/", "*.rst")
    >>> docs = load_directory("docs/", "**/*.md", recursive=True)
    """
    root = Path(path)
    candidates = root.rglob(pattern) if recursive else root.glob(pattern)
    return [load_text(entry) for entry in sorted(candidates) if entry.is_file()]
def chunk_text(
    text: str,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split text into overlapping chunks with rich metadata.

    A window of ``chunk_size`` characters slides over the text, advancing by
    ``chunk_size - chunk_overlap`` characters each step. Each window is
    stripped of surrounding whitespace; windows that are empty after
    stripping are dropped and do not consume a sequence number.

    Includes ai4rag-inspired metadata:
    - document_id: SHA256 hash for deduplication and window search
    - sequence_number: Order within the document
    - chunk_start/chunk_end: Character positions of the (unstripped) window
      in the original text

    Parameters
    ----------
    text : str
        Text to chunk.
    chunk_size : int
        Maximum characters per chunk (default: 512). Must be positive.
    chunk_overlap : int
        Overlap between chunks (default: 50). Must be non-negative and
        smaller than ``chunk_size``.
    doc_id : str, optional
        Document ID for the chunks. If None (or empty), generates from
        content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of text chunks with metadata.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive, ``chunk_overlap`` is negative, or
        ``chunk_overlap`` is not less than ``chunk_size``.

    Examples
    --------
    >>> chunks = chunk_text("Long document...", chunk_size=256)
    >>> print(chunks[0].metadata)
    {'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}
    """
    # Reject sizes that would otherwise silently yield no chunks or skip
    # parts of the text between windows.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap must not be negative")
    if chunk_overlap >= chunk_size:
        raise ValueError("chunk_overlap must be less than chunk_size")

    # Generate document ID if not provided
    effective_doc_id = doc_id or generate_document_id(text)

    text_length = len(text)
    chunks: list[Chunk] = []
    start = 0

    while start < text_length:
        end = min(start + chunk_size, text_length)
        chunk_content = text[start:end].strip()

        if chunk_content:
            # Only emitted chunks are numbered, so indices stay contiguous.
            sequence_number = len(chunks)
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": sequence_number,
                    "chunk_start": start,
                    "chunk_end": end,
                }
            chunks.append(
                Chunk(
                    content=chunk_content,
                    doc_id=effective_doc_id,
                    chunk_index=sequence_number,
                    metadata=metadata,
                )
            )

        # The window that reaches the end of the text is the last one; a
        # further step would only re-emit the overlapping tail.
        if end == text_length:
            break
        start = end - chunk_overlap

    return chunks
def chunk_document(
    doc: Document,
    chunk_size: int = 512,
    chunk_overlap: int = 50,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Chunk a Document's content and attach the document's own metadata.

    The content is split with ``chunk_text`` using the document's ID. When
    metadata is requested and the document has any, each chunk's metadata
    becomes the document metadata overlaid with the chunk metadata, so
    chunk-level keys win on conflict.

    Parameters
    ----------
    doc : Document
        Document to chunk.
    chunk_size : int
        Maximum characters per chunk.
    chunk_overlap : int
        Overlap between chunks.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of chunks from the document with metadata.
    """
    pieces = chunk_text(
        text=doc.content,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        doc_id=doc.id,
        include_metadata=include_metadata,
    )

    # Nothing to merge when the document has no metadata or none was asked for.
    if not (doc.metadata and include_metadata):
        return pieces

    for piece in pieces:
        piece.metadata = {**doc.metadata, **piece.metadata}
    return pieces
def chunk_by_separator(
    text: str,
    separator: str = "\n\n",
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split text by a separator (e.g., paragraphs, sections).

    Each piece between separators is stripped of surrounding whitespace;
    pieces that are empty after stripping are dropped. ``chunk_start`` and
    ``chunk_end`` give the position of the unstripped piece in ``text``.

    Parameters
    ----------
    text : str
        Text to split.
    separator : str
        Separator string (default: double newline for paragraphs).
    doc_id : str, optional
        Document ID for the chunks. If None (or empty), generates from
        content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of chunks with metadata.

    Raises
    ------
    ValueError
        If ``separator`` is empty (raised by ``str.split``).

    Examples
    --------
    >>> chunks = chunk_by_separator(text, separator="\\n---\\n")
    """
    effective_doc_id = doc_id or generate_document_id(text)
    separator_length = len(separator)
    chunks: list[Chunk] = []

    # str.split yields the pieces in order with exactly one separator between
    # neighbours, so offsets follow by arithmetic. Searching for each piece
    # instead can match text inside the preceding separator and report the
    # wrong position.
    part_start = 0
    for part in text.split(separator):
        part_end = part_start + len(part)
        content = part.strip()

        if content:
            sequence_number = len(chunks)
            metadata = {}
            if include_metadata:
                metadata = {
                    "document_id": effective_doc_id,
                    "sequence_number": sequence_number,
                    "chunk_start": part_start,
                    "chunk_end": part_end,
                }
            chunks.append(
                Chunk(
                    content=content,
                    doc_id=effective_doc_id,
                    chunk_index=sequence_number,
                    metadata=metadata,
                )
            )

        # Skip over the separator that followed this piece.
        part_start = part_end + separator_length

    return chunks
def chunk_rst_sections(
    text: str,
    doc_id: str | None = None,
    include_metadata: bool = True,
) -> list[Chunk]:
    """
    Split an RST document into one chunk per section.

    A section starts at a line that is followed by an underline made of
    RST adornment characters. Text before the first detected header becomes
    its own chunk, and a document with no detected headers is returned as a
    single chunk. Every chunk is stripped of surrounding whitespace and
    empty chunks are dropped.

    Parameters
    ----------
    text : str
        RST document text.
    doc_id : str, optional
        Document ID for the chunks. If None, generates from content hash.
    include_metadata : bool
        Include rich metadata in chunks (default: True).

    Returns
    -------
    list[Chunk]
        List of section chunks with metadata.
    """
    effective_doc_id = doc_id or generate_document_id(text)

    # Match RST section headers (title followed by underline of =, -, ~, etc.)
    header_re = r"\n([^\n]+)\n([=\-~`\'\"^_*+#]+)\n"
    header_offsets = [hit.start() for hit in re.finditer(header_re, text)]

    # Consecutive boundaries delimit the spans to emit: the preamble before
    # the first header, then each section up to the next header or the end.
    # With no headers this is the single span covering the whole text.
    span_starts = [0, *header_offsets]
    span_ends = [*header_offsets, len(text)]

    chunks: list[Chunk] = []
    for span_start, span_end in zip(span_starts, span_ends):
        body = text[span_start:span_end].strip()
        if not body:
            continue

        position = len(chunks)
        details = {}
        if include_metadata:
            details = {
                "document_id": effective_doc_id,
                "sequence_number": position,
                "chunk_start": span_start,
                "chunk_end": span_end,
            }
        chunks.append(
            Chunk(
                content=body,
                doc_id=effective_doc_id,
                chunk_index=position,
                metadata=details,
            )
        )

    return chunks