Loaders API

The loaders module provides utilities for loading documents and splitting them into chunks.

Document Loading

load_text()

Load a single text file as a Document.

ragit.load_text(path: str | Path) → Document[source]

Load a single text file as a Document.

Parameters: path (str or Path) – Path to the text file (.txt, .md, .rst, etc.)
Returns: Document with file content and metadata.
Return type: Document

Examples

>>> doc = load_text("docs/tutorial.rst")
>>> print(doc.id, len(doc.content))

from ragit import load_text

# Load a single file
doc = load_text("README.md")

print(f"ID: {doc.id}")           # "README" (filename without extension)
print(f"Length: {len(doc.content)}")
print(f"Metadata: {doc.metadata}")
# {"source": "README.md", "filename": "README.md"}

load_directory()

Load all matching files from a directory.

ragit.load_directory(path: str | Path, pattern: str = '*.txt', recursive: bool = False) → list[Document][source]

Load all matching files from a directory as Documents.

Parameters:

path (str or Path) – Directory path.
pattern (str) – Glob pattern for files (default: '*.txt').
recursive (bool) – If True, search recursively (default: False).

Returns:

List of loaded documents.

Return type:

list[Document]

Examples

>>> docs = load_directory("docs/", "*.rst")
>>> docs = load_directory("docs/", "**/*.md", recursive=True)

from ragit import load_directory

# Load all .txt files
docs = load_directory("docs/", pattern="*.txt")
print(f"Loaded {len(docs)} documents")

# Load all .rst files
docs = load_directory("docs/", pattern="*.rst")

# Load recursively
docs = load_directory("project/", pattern="**/*.md", recursive=True)

# Process loaded documents
for doc in docs:
    print(f"  {doc.id}: {len(doc.content)} chars")

Text Chunking

chunk_text()

Split text into overlapping chunks.

ragit.chunk_text(text: str, chunk_size: int = 512, chunk_overlap: int = 50, doc_id: str | None = None, include_metadata: bool = True) → list[Chunk][source]

Split text into overlapping chunks with rich metadata.

Includes ai4rag-inspired metadata:

- document_id: SHA256 hash for deduplication and window search
- sequence_number: Order within the document
- chunk_start/chunk_end: Character positions in original text

Parameters:

text (str) – Text to chunk.
chunk_size (int) – Maximum characters per chunk (default: 512).
chunk_overlap (int) – Overlap between chunks (default: 50).
doc_id (str, optional) – Document ID for the chunks. If None, generates from content hash.
include_metadata (bool) – Include rich metadata in chunks (default: True).

Returns:

List of text chunks with metadata.

Return type:

list[Chunk]

Examples

>>> chunks = chunk_text("Long document...", chunk_size=256)
>>> print(chunks[0].metadata)
{'document_id': 'a1b2c3...', 'sequence_number': 0, 'chunk_start': 0, 'chunk_end': 256}

from ragit import chunk_text

text = """
This is a long document that needs to be split into smaller
chunks for efficient retrieval. Each chunk should contain
enough context to be meaningful on its own, while also
overlapping with adjacent chunks to preserve continuity.
"""

# Basic chunking
chunks = chunk_text(text, chunk_size=100, chunk_overlap=20)
print(f"Created {len(chunks)} chunks")

for chunk in chunks:
    print(f"Chunk {chunk.chunk_index}: {len(chunk.content)} chars")
    print(f"  Content: {chunk.content[:50]}...")

# Custom document ID
chunks = chunk_text(
    text,
    chunk_size=256,
    chunk_overlap=50,
    doc_id="my_document"
)

chunk_document()

Split a Document into overlapping chunks.

ragit.chunk_document(doc: Document, chunk_size: int = 512, chunk_overlap: int = 50, include_metadata: bool = True) → list[Chunk][source]

Split a Document into overlapping chunks with rich metadata.

Parameters:

doc (Document) – Document to chunk.
chunk_size (int) – Maximum characters per chunk (default: 512).
chunk_overlap (int) – Overlap between chunks (default: 50).
include_metadata (bool) – Include rich metadata in chunks (default: True).

Returns:

List of chunks from the document with metadata.

Return type:

list[Chunk]

from ragit import load_text, chunk_document

# Load and chunk a document
doc = load_text("guide.txt")
chunks = chunk_document(doc, chunk_size=512, chunk_overlap=50)

print(f"Document '{doc.id}' split into {len(chunks)} chunks")

for chunk in chunks:
    print(f"  Chunk {chunk.chunk_index}: {chunk.content[:50]}...")

chunk_by_separator()

Split text by a separator (e.g., paragraphs, sections).

ragit.chunk_by_separator(text: str, separator: str = '\n\n', doc_id: str | None = None, include_metadata: bool = True) → list[Chunk][source]

Split text by a separator (e.g., paragraphs, sections).

Parameters:

text (str) – Text to split.
separator (str) – Separator string (default: double newline for paragraphs).
doc_id (str, optional) – Document ID for the chunks. If None, generates from content hash.
include_metadata (bool) – Include rich metadata in chunks (default: True).

Returns:

List of chunks with metadata.

Return type:

list[Chunk]

Examples

>>> chunks = chunk_by_separator(text, separator="\n---\n")

from ragit import chunk_by_separator

text = """
First paragraph with some content.

Second paragraph with different content.

Third paragraph with more information.
"""

# Split by double newline (paragraphs)
chunks = chunk_by_separator(text, separator="\n\n", doc_id="article")

for chunk in chunks:
    print(f"Paragraph {chunk.chunk_index}: {chunk.content}")

# Split by custom separator
markdown = """
# Section 1
Content for section 1.

---

# Section 2
Content for section 2.
"""

sections = chunk_by_separator(markdown, separator="\n---\n", doc_id="readme")

chunk_rst_sections()

Split RST documents by section headers.

ragit.chunk_rst_sections(text: str, doc_id: str | None = None, include_metadata: bool = True) → list[Chunk][source]

Split RST document by section headers with rich metadata.

Parameters:

text (str) – RST document text.
doc_id (str, optional) – Document ID for the chunks. If None, generates from content hash.
include_metadata (bool) – Include rich metadata in chunks (default: True).

Returns:

List of section chunks with metadata.

Return type:

list[Chunk]

from ragit import chunk_rst_sections

rst_content = """
Introduction
============

This is the introduction section.

Installation
------------

How to install the software.

Usage
-----

How to use the software.
"""

chunks = chunk_rst_sections(rst_content, doc_id="manual")

for chunk in chunks:
    print(f"Section {chunk.chunk_index}:")
    print(f"  {chunk.content[:100]}...")

Data Classes

Chunk

class ragit.core.experiment.experiment.Chunk(content: str, doc_id: str, chunk_index: int, embedding: tuple[float, ...] | list[float] | None=None, metadata: dict[str, ~typing.Any]=<factory>)[source]

A document chunk with optional rich metadata.

Metadata can include:

- document_id: SHA256 hash for deduplication and window search
- sequence_number: Order within the document
- chunk_start/chunk_end: Character positions in original text

content: str

doc_id: str

chunk_index: int

embedding: tuple[float, ...] | list[float] | None = None

metadata: dict[str, Any]

__init__(content: str, doc_id: str, chunk_index: int, embedding: tuple[float, ...] | list[float] | None=None, metadata: dict[str, ~typing.Any]=<factory>) → None

from ragit.core.experiment.experiment import Chunk

chunk = Chunk(
    content="The actual text content of the chunk",
    doc_id="document_name",
    chunk_index=0,
    embedding=None  # Set after embedding
)

print(chunk.content)
print(chunk.doc_id)
print(chunk.chunk_index)

Document

class ragit.Document(id: str, content: str, metadata: dict[str, ~typing.Any]=<factory>)[source]

A document in the knowledge base.

id: str

content: str

metadata: dict[str, Any]

__init__(id: str, content: str, metadata: dict[str, ~typing.Any]=<factory>) → None

from ragit import Document

doc = Document(
    id="readme",
    content="Full document content here...",
    metadata={
        "source": "README.md",
        "version": "1.0",
        "author": "John Doe"
    }
)

Complete Examples

Loading and Chunking a Project

from ragit import load_directory, chunk_document

# Load all Python files
docs = load_directory("src/", pattern="**/*.py", recursive=True)
print(f"Loaded {len(docs)} Python files")

# Chunk all documents
all_chunks = []
for doc in docs:
    chunks = chunk_document(doc, chunk_size=512, chunk_overlap=50)
    all_chunks.extend(chunks)
    print(f"  {doc.id}: {len(chunks)} chunks")

print(f"\nTotal chunks: {len(all_chunks)}")

Processing Different File Types

from ragit import load_text, chunk_document, chunk_rst_sections

def load_and_chunk(path: str) -> list:
    """Load the file at ``path`` and chunk it with a strategy picked by extension.

    Paths ending in ``.rst`` are split along section headers with
    ``chunk_rst_sections``; every other path is passed to ``chunk_document``
    with ``chunk_size=512`` and ``chunk_overlap=50``.
    """
    doc = load_text(path)

    # Anything that is not RST gets fixed-size overlapping chunks.
    if not path.endswith(".rst"):
        return chunk_document(doc, chunk_size=512, chunk_overlap=50)

    # RST has explicit section headers, so split along them instead.
    return chunk_rst_sections(doc.content, doc_id=doc.id)

# Usage
rst_chunks = load_and_chunk("docs/guide.rst")
md_chunks = load_and_chunk("README.md")
txt_chunks = load_and_chunk("notes.txt")

Building a Document Index

from ragit import load_directory, chunk_document
from ragit.providers import OllamaProvider

# Load documents
docs = load_directory("knowledge_base/", pattern="*.txt")

# Chunk all documents
all_chunks = []
for doc in docs:
    chunks = chunk_document(doc, chunk_size=512, chunk_overlap=50)
    all_chunks.extend(chunks)

# Create embeddings
provider = OllamaProvider()
texts = [chunk.content for chunk in all_chunks]
embeddings = provider.embed_batch(texts, model="mxbai-embed-large")

# Build index (chunks with embeddings)
indexed_chunks = [
    {
        "content": chunk.content,
        "doc_id": chunk.doc_id,
        "chunk_index": chunk.chunk_index,
        "embedding": emb.embedding
    }
    for chunk, emb in zip(all_chunks, embeddings, strict=True)
]

print(f"Indexed {len(indexed_chunks)} chunks")