Skip to content

Document Search#

ragbits.document_search.DocumentSearch #

DocumentSearch(embedder: Embeddings, vector_store: VectorStore, query_rephraser: QueryRephraser | None = None, reranker: Reranker | None = None, document_processor_router: DocumentProcessorRouter | None = None, processing_strategy: ProcessingExecutionStrategy | None = None)

The main entry point to the DocumentSearch functionality.

It provides methods for both ingestion and retrieval.

Retrieval:

1. Uses QueryRephraser to rephrase the query.
2. Uses VectorStore to retrieve the most relevant chunks.
3. Uses Reranker to rerank the chunks.
Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
def __init__(
    self,
    embedder: Embeddings,
    vector_store: VectorStore,
    query_rephraser: QueryRephraser | None = None,
    reranker: Reranker | None = None,
    document_processor_router: DocumentProcessorRouter | None = None,
    processing_strategy: ProcessingExecutionStrategy | None = None,
) -> None:
    """
    Wire up the components used for ingestion and retrieval.

    Args:
        embedder: Stored on the instance unchanged.
        vector_store: Stored on the instance unchanged.
        query_rephraser: Used as given; a NoopQueryRephraser is created when omitted (or falsy).
        reranker: Used as given; a NoopReranker is created when omitted (or falsy).
        document_processor_router: Used as given; DocumentProcessorRouter.from_config() is used
            when omitted (or falsy).
        processing_strategy: Used as given; a SequentialProcessing is created when omitted (or falsy).
    """
    # Mandatory collaborators are kept exactly as passed in.
    self.embedder, self.vector_store = embedder, vector_store

    # Optional collaborators: a default is only constructed when the argument is falsy.
    self.query_rephraser = query_rephraser if query_rephraser else NoopQueryRephraser()
    self.reranker = reranker if reranker else NoopReranker()
    self.document_processor_router = (
        document_processor_router if document_processor_router else DocumentProcessorRouter.from_config()
    )
    self.processing_strategy = processing_strategy if processing_strategy else SequentialProcessing()

embedder instance-attribute #

embedder: Embeddings = embedder

vector_store instance-attribute #

vector_store: VectorStore = vector_store

query_rephraser instance-attribute #

query_rephraser: QueryRephraser = query_rephraser or NoopQueryRephraser()

reranker instance-attribute #

reranker: Reranker = reranker or NoopReranker()

document_processor_router instance-attribute #

processing_strategy instance-attribute #

from_config classmethod #

from_config(config: dict) -> DocumentSearch

Creates and returns an instance of the DocumentSearch class from the given configuration.

PARAMETER DESCRIPTION
config

A dictionary containing the configuration for initializing the DocumentSearch instance.

TYPE: dict

RETURNS DESCRIPTION
DocumentSearch

An initialized instance of the DocumentSearch class.

TYPE: DocumentSearch

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@classmethod
def from_config(cls, config: dict) -> "DocumentSearch":
    """
    Build a DocumentSearch instance out of a configuration dictionary.

    The "embedder" and "vector_store" entries are looked up directly and are therefore
    required. The "rephraser", "reranker", "processing_strategy" and "providers" entries
    are optional; a missing entry is forwarded to its factory as None (or as an empty
    dict in the case of "providers").

    Args:
        config: A dictionary containing the configuration for initializing the DocumentSearch instance.

    Returns:
        DocumentSearch: An initialized instance of the DocumentSearch class.

    Raises:
        KeyError: If a dict config lacks the "embedder" or "vector_store" key.
    """
    # Factories are invoked in a fixed order so a broken config fails at the same point every time.
    built_embedder = get_embeddings(config["embedder"])
    built_rephraser = get_rephraser(config.get("rephraser"))
    built_reranker = get_reranker(config.get("reranker"))
    built_store = get_vector_store(config["vector_store"])
    built_strategy = get_processing_strategy(config.get("processing_strategy"))

    # The raw "providers" mapping is converted to a providers config before the router is built.
    raw_providers: dict = config.get("providers", {})
    router = DocumentProcessorRouter.from_config(
        DocumentProcessorRouter.from_dict_to_providers_config(raw_providers)
    )

    return cls(
        built_embedder,
        built_store,
        built_rephraser,
        built_reranker,
        router,
        built_strategy,
    )

search async #

search(query: str, config: SearchConfig | None = None) -> Sequence[Element]

Search for the most relevant chunks for a query.

PARAMETER DESCRIPTION
query

The query to search for.

TYPE: str

config

The search configuration.

TYPE: SearchConfig | None DEFAULT: None

RETURNS DESCRIPTION
Sequence[Element]

A list of chunks.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@traceable
async def search(self, query: str, config: SearchConfig | None = None) -> Sequence[Element]:
    """
    Find the chunks that best match a query.

    The query is passed through the query rephraser; each resulting query is embedded
    and used to retrieve entries from the vector store. All retrieved entries are
    converted to Elements, collected in retrieval order, and handed to the reranker
    together with the original (un-rephrased) query.

    Args:
        query: The query to search for.
        config: The search configuration. A default SearchConfig is used when omitted.

    Returns:
        The chunks as returned by the reranker.
    """
    search_config = config if config else SearchConfig()

    candidates: list = []
    for variant in await self.query_rephraser.rephrase(query):
        # One embedding request per rephrased query; only the first vector is relevant.
        embedded = await self.embedder.embed_text([variant])
        query_vector = embedded[0]
        retrieval_options = VectorStoreOptions(**search_config.vector_store_kwargs)
        hits = await self.vector_store.retrieve(vector=query_vector, options=retrieval_options)
        for hit in hits:
            candidates.append(Element.from_vector_db_entry(hit))

    # Reranking is always done against the query exactly as the caller supplied it.
    reranked = await self.reranker.rerank(
        elements=candidates,
        query=query,
        options=RerankerOptions(**search_config.reranker_kwargs),
    )
    return reranked

ingest async #

ingest(documents: Sequence[DocumentMeta | Document | Source], document_processor: BaseProvider | None = None) -> None

Ingest multiple documents.

PARAMETER DESCRIPTION
documents

The documents or metadata of the documents to ingest.

TYPE: Sequence[DocumentMeta | Document | Source]

document_processor

The document processor to use. If not provided, the document processor will be determined based on the document metadata.

TYPE: BaseProvider | None DEFAULT: None

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@traceable
async def ingest(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    document_processor: BaseProvider | None = None,
) -> None:
    """
    Process a batch of documents and insert the resulting elements.

    The configured processing strategy turns the documents into elements, which are
    then passed to `insert_elements`.

    Args:
        documents: The documents or metadata of the documents to ingest.
        document_processor: The document processor to use. If not provided, the document processor will be
            determined based on the document metadata.
    """
    strategy = self.processing_strategy
    router = self.document_processor_router

    # Step 1: documents -> elements, delegated entirely to the processing strategy.
    parsed_elements = await strategy.process_documents(documents, router, document_processor)

    # Step 2: elements -> vector store.
    await self.insert_elements(parsed_elements)

insert_elements async #

insert_elements(elements: list[Element]) -> None

Insert Elements into the vector store.

PARAMETER DESCRIPTION
elements

The list of Elements to insert.

TYPE: list[Element]

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
async def insert_elements(self, elements: list[Element]) -> None:
    """
    Embed Elements and insert the resulting entries into the vector store.

    Every element is embedded as text from its `key` and converted to a vector store
    entry. Elements that are `ImageElement` instances additionally get a second entry
    built from an image embedding of their `image_bytes`, provided the embedder reports
    image support. If it does not, a warning is emitted and only the text-based entries
    of those elements are stored.

    Args:
        elements: The list of Elements to insert. An empty list is a no-op: neither the
            embedder nor the vector store is called.
    """
    if not elements:
        # Nothing to embed or store; do not send an empty batch to the embedder or the store.
        return

    vectors = await self.embedder.embed_text([element.key for element in elements])

    image_elements = [element for element in elements if isinstance(element, ImageElement)]
    # strict=False: if the embedder returns fewer vectors than elements, the surplus is dropped silently.
    entries = [element.to_vector_db_entry(vector) for element, vector in zip(elements, vectors, strict=False)]

    if image_elements and self.embedder.image_support():
        image_vectors = await self.embedder.embed_image([element.image_bytes for element in image_elements])
        entries.extend(
            [
                element.to_vector_db_entry(vector)
                for element, vector in zip(image_elements, image_vectors, strict=False)
            ]
        )
    elif image_elements:
        warnings.warn(
            f"Image elements are not supported by the embedder {self.embedder}. "
            f"Skipping {len(image_elements)} image elements."
        )

    await self.vector_store.store(entries)