Skip to content

Document Search#

ragbits.document_search.DocumentSearch #

DocumentSearch(embedder: Embeddings, vector_store: VectorStore, query_rephraser: QueryRephraser | None = None, reranker: Reranker | None = None, document_processor_router: DocumentProcessorRouter | None = None, processing_strategy: ProcessingExecutionStrategy | None = None)

The main entry point to the DocumentSearch functionality.

It provides methods for both ingestion and retrieval.

Retrieval:

1. Uses QueryRephraser to rephrase the query.
2. Uses VectorStore to retrieve the most relevant chunks.
3. Uses Reranker to rerank the chunks.
Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
def __init__(
    self,
    embedder: Embeddings,
    vector_store: VectorStore,
    query_rephraser: QueryRephraser | None = None,
    reranker: Reranker | None = None,
    document_processor_router: DocumentProcessorRouter | None = None,
    processing_strategy: ProcessingExecutionStrategy | None = None,
) -> None:
    """
    Stores the components that ingestion and retrieval are built from.

    Args:
        embedder: The embedder to keep on the instance.
        vector_store: The vector store to keep on the instance.
        query_rephraser: The query rephraser to use. A NoopQueryRephraser is created when none is given.
        reranker: The reranker to use. A NoopReranker is created when none is given.
        document_processor_router: The router to use. When none is given, one is created with
            DocumentProcessorRouter.from_config().
        processing_strategy: The processing strategy to use. A SequentialProcessing is created when none is given.
    """
    # Mandatory components are kept exactly as passed in.
    self.embedder = embedder
    self.vector_store = vector_store

    # Optional components: any falsy value (including None) is replaced by a freshly built default.
    self.query_rephraser = query_rephraser if query_rephraser else NoopQueryRephraser()
    self.reranker = reranker if reranker else NoopReranker()
    self.document_processor_router = (
        document_processor_router if document_processor_router else DocumentProcessorRouter.from_config()
    )
    self.processing_strategy = processing_strategy if processing_strategy else SequentialProcessing()

embedder instance-attribute #

embedder: Embeddings = embedder

vector_store instance-attribute #

vector_store: VectorStore = vector_store

query_rephraser instance-attribute #

query_rephraser: QueryRephraser = query_rephraser or NoopQueryRephraser()

reranker instance-attribute #

reranker: Reranker = reranker or NoopReranker()

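document_processor_router instance-attribute #

document_processor_router: DocumentProcessorRouter = document_processor_router or DocumentProcessorRouter.from_config()

processing_strategy instance-attribute #

processing_strategy: ProcessingExecutionStrategy = processing_strategy or SequentialProcessing()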

from_config classmethod #

from_config(config: dict) -> DocumentSearch

Creates and returns an instance of the DocumentSearch class from the given configuration.

PARAMETER DESCRIPTION
config

A dictionary holding the settings used to initialize the DocumentSearch instance.

TYPE: dict

RETURNS DESCRIPTION
DocumentSearch

An initialized instance of the DocumentSearch class.

TYPE: DocumentSearch

RAISES DESCRIPTION
ValidationError

If the configuration doesn't follow the expected format.

InvalidConfigError

If one of the specified classes can't be found or is not the correct type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@classmethod
def from_config(cls, config: dict) -> "DocumentSearch":
    """
    Builds a DocumentSearch instance out of a configuration dictionary.

    The dictionary is first validated with DocumentSearchConfig; each component is then built from
    its own section of the validated model.

    Args:
        config: The dictionary holding the settings used to initialize the DocumentSearch instance.

    Returns:
        DocumentSearch: An initialized instance of the DocumentSearch class.

    Raises:
        ValidationError: If the configuration doesn't follow the expected format.
        InvalidConfigError: If one of the specified classes can't be found or is not the correct type.
    """
    validated = DocumentSearchConfig.model_validate(config)

    # Components are built in a fixed order so that the first invalid section is the one reported.
    text_embedder: Embeddings = Embeddings.subclass_from_config(validated.embedder)
    rephraser = QueryRephraser.subclass_from_config(validated.rephraser)
    result_reranker: Reranker = Reranker.subclass_from_config(validated.reranker)
    store: VectorStore = VectorStore.subclass_from_config(validated.vector_store)
    strategy = ProcessingExecutionStrategy.subclass_from_config(validated.processing_strategy)

    # The providers section is converted to a providers config before the router is built from it.
    router = DocumentProcessorRouter.from_config(
        DocumentProcessorRouter.from_dict_to_providers_config(validated.providers)
    )

    return cls(
        text_embedder,
        store,
        rephraser,
        result_reranker,
        router,
        strategy,
    )

search async #

search(query: str, config: SearchConfig | None = None) -> Sequence[Element]

Search for the most relevant chunks for a query.

PARAMETER DESCRIPTION
query

The query to search for.

TYPE: str

config

The search configuration.

TYPE: SearchConfig | None DEFAULT: None

RETURNS DESCRIPTION
Sequence[Element]

A list of chunks.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@traceable
async def search(self, query: str, config: SearchConfig | None = None) -> Sequence[Element]:
    """
    Find the chunks that are most relevant to a query.

    The query is rephrased, every rephrased variant is embedded and looked up in the vector store,
    and the combined results are passed to the reranker together with the original query.

    Args:
        query: The query to search for.
        config: The search configuration. A default SearchConfig is used when none is given.

    Returns:
        A list of chunks.
    """
    search_config = config if config else SearchConfig()

    candidates = []
    for variant in await self.query_rephraser.rephrase(query):
        # Each variant is embedded on its own, so the result holds exactly one vector.
        embedded = await self.embedder.embed_text([variant])
        query_vector = embedded[0]
        store_options = VectorStoreOptions(**search_config.vector_store_kwargs)
        matches = await self.vector_store.retrieve(vector=query_vector, options=store_options)
        for match in matches:
            candidates.append(Element.from_vector_db_entry(match))

    # Reranking is done against the original query, not the rephrased variants.
    return await self.reranker.rerank(
        elements=candidates,
        query=query,
        options=RerankerOptions(**search_config.reranker_kwargs),
    )

ingest async #

ingest(documents: Sequence[DocumentMeta | Document | Source], document_processor: BaseProvider | None = None) -> None

Ingest multiple documents.

PARAMETER DESCRIPTION
documents

The documents or metadata of the documents to ingest.

TYPE: Sequence[DocumentMeta | Document | Source]

document_processor

The document processor to use. If not provided, the document processor will be determined based on the document metadata.

TYPE: BaseProvider | None DEFAULT: None

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@traceable
async def ingest(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    document_processor: BaseProvider | None = None,
) -> None:
    """
    Process a batch of documents and store the resulting elements.

    Args:
        documents: The documents or metadata of the documents to ingest.
        document_processor: The document processor to use. If not provided, the document processor will be
            determined based on the document metadata.
    """
    strategy = self.processing_strategy
    router = self.document_processor_router
    parsed_elements = await strategy.process_documents(documents, router, document_processor)

    # NOTE(review): the helper is not shown here; by its name it clears stored entries that share
    # a source with the new elements, so it has to run before the insert.
    await self._remove_entries_with_same_sources(parsed_elements)
    await self.insert_elements(parsed_elements)

insert_elements async #

insert_elements(elements: list[Element]) -> None

Insert Elements into the vector store.

PARAMETER DESCRIPTION
elements

The list of Elements to insert.

TYPE: list[Element]

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
async def insert_elements(self, elements: list[Element]) -> None:
    """
    Insert Elements into the vector store.

    Every element with a non-empty key is embedded as text. Image elements are additionally embedded
    as images when the embedder reports image support. When images are not embedded as images, a
    warning is issued for each image element that has no key, because it ends up with no entry at all.

    Args:
        elements: The list of Elements to insert.
    """
    text_elements = [element for element in elements if element.key]
    image_elements = [element for element in elements if isinstance(element, ImageElement)]

    entries = []
    if text_elements:
        # Only call the embedder when there is something to embed; an empty batch needs no round-trip.
        text_vectors = await self.embedder.embed_text([element.key for element in text_elements])  # type: ignore
        entries.extend(
            element.to_vector_db_entry(vector, EmbeddingType.TEXT)
            for element, vector in zip(text_elements, text_vectors, strict=False)
        )

    if image_elements and self.embedder.image_support():
        image_vectors = await self.embedder.embed_image([element.image_bytes for element in image_elements])
        entries.extend(
            element.to_vector_db_entry(vector, EmbeddingType.IMAGE)
            for element, vector in zip(image_elements, image_vectors, strict=False)
        )
    else:
        # No image embeddings were produced, so an image without a key has no entry of any kind.
        # Checking the key directly avoids a list-in-list membership scan over the elements.
        for image_element in image_elements:
            if not image_element.key:
                warnings.warn(f"Image: {image_element.id} could not be embedded")

    await self.vector_store.store(entries)