Skip to content

Documents and Elements#

ragbits.document_search.documents.document.Document #

Bases: BaseModel

An object representing a document which is downloaded and stored locally.

local_path instance-attribute #

local_path: Path

metadata instance-attribute #

metadata: DocumentMeta

from_document_meta classmethod #

from_document_meta(document_meta: DocumentMeta, local_path: Path) -> Document

Create a document from document metadata. The class of the returned object depends on the document type.

PARAMETER DESCRIPTION
document_meta

The document metadata.

TYPE: DocumentMeta

local_path

The local path to the document.

TYPE: Path

RETURNS DESCRIPTION
Document

The document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
    """
    Build the document object that matches the type recorded in the metadata.

    Args:
        document_meta: Metadata describing the document.
        local_path: Path of the document on the local filesystem.

    Returns:
        A TextDocument for Markdown and plain-text documents, otherwise an
        instance of the class this method was called on.
    """
    is_plain_text = document_meta.document_type in (DocumentType.MD, DocumentType.TXT)
    document_cls = TextDocument if is_plain_text else cls
    return document_cls(local_path=local_path, metadata=document_meta)

ragbits.document_search.documents.document.DocumentType #

Bases: str, Enum

Types of documents that can be stored.

MD class-attribute instance-attribute #

MD = 'md'

TXT class-attribute instance-attribute #

TXT = 'txt'

PDF class-attribute instance-attribute #

PDF = 'pdf'

CSV class-attribute instance-attribute #

CSV = 'csv'

DOC class-attribute instance-attribute #

DOC = 'doc'

DOCX class-attribute instance-attribute #

DOCX = 'docx'

HTML class-attribute instance-attribute #

HTML = 'html'

EPUB class-attribute instance-attribute #

EPUB = 'epub'

XLSX class-attribute instance-attribute #

XLSX = 'xlsx'

XLS class-attribute instance-attribute #

XLS = 'xls'

ORG class-attribute instance-attribute #

ORG = 'org'

ODT class-attribute instance-attribute #

ODT = 'odt'

PPT class-attribute instance-attribute #

PPT = 'ppt'

PPTX class-attribute instance-attribute #

PPTX = 'pptx'

RST class-attribute instance-attribute #

RST = 'rst'

RTF class-attribute instance-attribute #

RTF = 'rtf'

TSV class-attribute instance-attribute #

TSV = 'tsv'

XML class-attribute instance-attribute #

XML = 'xml'

JPG class-attribute instance-attribute #

JPG = 'jpg'

PNG class-attribute instance-attribute #

PNG = 'png'

UNKNOWN class-attribute instance-attribute #

UNKNOWN = 'unknown'

ragbits.document_search.documents.element.Element #

Bases: BaseModel, ABC

An object representing an element in a document.

element_type instance-attribute #

element_type: str

document_meta instance-attribute #

document_meta: DocumentMeta

location class-attribute instance-attribute #

location: ElementLocation | None = None

id property #

id: str

Retrieve the ID of the element, primarily used to represent the element's data.

RETURNS DESCRIPTION
str

string representing element

TYPE: str

key property #

key: str | None

Get the representation of the element for embedding.

RETURNS DESCRIPTION
str | None

The representation for embedding.

text_representation abstractmethod property #

text_representation: str | None

Get the text representation of the element.

RETURNS DESCRIPTION
str | None

The text representation.

get_id_components #

get_id_components() -> dict[str, str]

Creates a dictionary of key-value pairs of ID components.

RETURNS DESCRIPTION
dict

a dictionary

TYPE: dict[str, str]

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
def get_id_components(self) -> dict[str, str]:
    """
    Collect the values that together identify this element.

    Returns:
        dict: the components under the keys "meta", "type", "key", "text"
        and "location"; the last three are converted with ``str``.
    """
    meta_id = self.document_meta.id
    kind = self.element_type
    key_text = str(self.key)
    body_text = str(self.text_representation)
    location_text = str(self.location)
    return dict(meta=meta_id, type=kind, key=key_text, text=body_text, location=location_text)

from_vector_db_entry classmethod #

from_vector_db_entry(db_entry: VectorStoreEntry) -> Element

Create an element from a vector database entry.

PARAMETER DESCRIPTION
db_entry

The vector database entry.

TYPE: VectorStoreEntry

RETURNS DESCRIPTION
Element

The element.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
@classmethod
def from_vector_db_entry(cls, db_entry: VectorStoreEntry) -> "Element":
    """
    Create an element from a vector database entry.

    The entry's metadata is read but never modified, so the caller's entry
    keeps its "embedding_type" value after the conversion.

    Args:
        db_entry: The vector database entry.

    Returns:
        The element, built by the class registered for the entry's element type.

    Raises:
        KeyError: If the metadata has no "element_type" key. A type missing
            from the registry presumably raises it as well (assumes the
            registry is a plain mapping - TODO confirm).
    """
    # "embedding_type" is not passed to the element constructor. Filter it
    # into a new dict instead of deleting it from db_entry.metadata, which
    # would silently alter the caller's entry.
    element_fields = {name: value for name, value in db_entry.metadata.items() if name != "embedding_type"}
    element_cls = Element._elements_registry[element_fields["element_type"]]
    return element_cls(**element_fields)

to_vector_db_entry #

to_vector_db_entry(vector: list[float], embedding_type: EmbeddingType) -> VectorStoreEntry

Create a vector database entry from the element.

PARAMETER DESCRIPTION
vector

The vector.

TYPE: list[float]

embedding_type

The type of the embedding.

TYPE: EmbeddingType

RETURNS DESCRIPTION
VectorStoreEntry

The vector database entry.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
def to_vector_db_entry(self, vector: list[float], embedding_type: EmbeddingType) -> VectorStoreEntry:
    """
    Build the vector database entry that stores this element.

    Args:
        vector: The vector.
        embedding_type: The type of the embedding.

    Returns:
        The vector database entry.
    """
    embedding_label = str(embedding_type)

    # The entry ID is a name-based UUID over the element ID and the embedding
    # type, so the same pair always produces the same ID.
    entry_uuid = uuid.uuid5(uuid.NAMESPACE_OID, ";".join((self.id, embedding_label)))

    payload = self.model_dump(exclude={"id", "key"})
    payload["embedding_type"] = embedding_label
    # Store the source ID explicitly inside the dumped document metadata.
    payload["document_meta"]["source"]["id"] = self.document_meta.source.id

    return VectorStoreEntry(id=str(entry_uuid), key=str(self.key), vector=vector, metadata=payload)

ragbits.document_search.documents.sources.Source #

Bases: BaseModel, ABC

An object representing a source.

id abstractmethod property #

id: str

Get the source ID.

RETURNS DESCRIPTION
str

The source ID.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources.py
@classmethod
def class_identifier(cls) -> str:
    """
    Derive the source type identifier from the class name.

    Returns:
        The class name converted with ``to_snake``.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.
    """
    # NOTE(review): pydantic's computed_field appears to use this docstring as
    # the field's default schema description - confirm before rewording it.
    # Returns whatever class_identifier() yields for this instance's class.
    return self.class_identifier()

fetch abstractmethod async #

fetch() -> Path

Load the source.

RETURNS DESCRIPTION
Path

The path to the source.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources.py
@abstractmethod
async def fetch(self) -> Path:
    """
    Load the source.

    This is an abstract coroutine with no implementation here; concrete
    sources must override it.

    Returns:
        The path to the source.
    """