Skip to content

Document Search#

ragbits.document_search.DocumentSearch #

DocumentSearch(vector_store: VectorStore, query_rephraser: QueryRephraser | None = None, reranker: Reranker | None = None, ingest_strategy: IngestStrategy | None = None, parser_router: DocumentParserRouter | None = None, enricher_router: ElementEnricherRouter | None = None)

Bases: WithConstructionConfig

A main entrypoint to the DocumentSearch functionality.

It provides methods for both ingestion and retrieval.

Retrieval:

1. Uses QueryRephraser to rephrase the query.
2. Uses VectorStore to retrieve the most relevant chunks.
3. Uses Reranker to rerank the chunks.
Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
def __init__(
    self,
    vector_store: VectorStore,
    query_rephraser: QueryRephraser | None = None,
    reranker: Reranker | None = None,
    ingest_strategy: IngestStrategy | None = None,
    parser_router: DocumentParserRouter | None = None,
    enricher_router: ElementEnricherRouter | None = None,
) -> None:
    """
    Store the components used for ingestion and retrieval.

    Every optional component that is omitted (or falsy) is replaced by a default instance.

    Args:
        vector_store: The vector store to use; stored as given.
        query_rephraser: The query rephraser; defaults to `NoopQueryRephraser()`.
        reranker: The reranker; defaults to `NoopReranker()`.
        ingest_strategy: The ingest strategy; defaults to `SequentialIngestStrategy()`.
        parser_router: The document parser router; defaults to `DocumentParserRouter()`.
        enricher_router: The element enricher router; defaults to `ElementEnricherRouter()`.
    """
    self.vector_store = vector_store
    # The fallbacks below trigger on any falsy value, not only on None.
    self.query_rephraser = query_rephraser if query_rephraser else NoopQueryRephraser()
    self.reranker = reranker if reranker else NoopReranker()
    self.ingest_strategy = ingest_strategy if ingest_strategy else SequentialIngestStrategy()
    self.parser_router = parser_router if parser_router else DocumentParserRouter()
    self.enricher_router = enricher_router if enricher_router else ElementEnricherRouter()

default_module class-attribute #

default_module: ModuleType | None = document_search

configuration_key class-attribute #

configuration_key: str = 'document_search'

vector_store instance-attribute #

vector_store: VectorStore = vector_store

query_rephraser instance-attribute #

reranker instance-attribute #

reranker: Reranker = reranker or NoopReranker()

ingest_strategy instance-attribute #

parser_router instance-attribute #

enricher_router instance-attribute #

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found, the imported object is not a class,
            or it is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class, so verify that
    # first; a path that resolves to a function or other object is then reported through
    # the documented InvalidConfigError instead of leaking a TypeError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an instance by calling a factory function located through its import path.
    The result may be an instance of a subclass, if that is what the factory produces.

    Args:
        factory_path: Location of the factory function, written as
            "module.submodule:factory_name".

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance
            of the current class. Lookup failures are reported by `import_by_path`.
    """
    # The factory is resolved relative to the class' default module and called without arguments.
    instance = import_by_path(factory_path, cls.default_module)()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

from_config classmethod #

from_config(config: dict) -> Self

Creates and returns an instance of the DocumentSearch class from the given configuration.

PARAMETER DESCRIPTION
config

A configuration object containing the configuration for initializing the DocumentSearch instance.

TYPE: dict

RETURNS DESCRIPTION
DocumentSearch

An initialized instance of the DocumentSearch class.

TYPE: Self

RAISES DESCRIPTION
ValidationError

If the configuration doesn't follow the expected format.

InvalidConfigError

If one of the specified classes can't be found or is not the correct type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Builds a DocumentSearch instance out of a plain configuration dictionary.

    The dictionary is validated against `DocumentSearchConfig`, and each component
    is then constructed from its own section of the validated model.

    Args:
        config: The configuration used to initialize the DocumentSearch instance.

    Returns:
        DocumentSearch: An initialized instance of the DocumentSearch class.

    Raises:
        ValidationError: If the configuration doesn't follow the expected format.
        InvalidConfigError: If one of the specified classes can't be found or is not the correct type.
    """
    validated = DocumentSearchConfig.model_validate(config)

    # Keyword arguments are evaluated in the order written, so components are built as:
    # rephraser, reranker, vector store, ingest strategy, parser router, enricher router.
    return cls(
        query_rephraser=QueryRephraser.subclass_from_config(validated.rephraser),
        reranker=Reranker.subclass_from_config(validated.reranker),
        vector_store=VectorStore.subclass_from_config(validated.vector_store),
        ingest_strategy=IngestStrategy.subclass_from_config(validated.ingest_strategy),
        parser_router=DocumentParserRouter.from_config(validated.parser_router),
        enricher_router=ElementEnricherRouter.from_config(validated.enricher_router),
    )

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration. Looks for the configuration under the key "document_search", and if not found, instantiates the class with the preferred configuration for each component.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@classmethod
def preferred_subclass(
    cls,
    config: CoreConfig,
    factory_path_override: str | None = None,
    yaml_path_override: Path | None = None,
) -> Self:
    """
    Tries to create an instance by looking at the project's component preferences, either
    from YAML or from a factory. The optional overrides take precedence over the preferences
    stored in `config`.

    Sources are tried in this order, and the first one available is used:
        1. `yaml_path_override`
        2. `factory_path_override`
        3. the factory registered in `config.component_preference_factories`
        4. `config.preferred_instances_config`, when `config.component_preference_config_path` is set

    For both YAML-based sources, an explicit entry under the class' configuration key is used
    when present; otherwise the whole mapping is passed to `from_config`, so each component
    is instantiated from its own preferred configuration.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        An instance created from the first available source.

    Raises:
        NoPreferredConfigError: If no override is given and `config` holds neither
            a preferred factory nor a preferred configuration path.
    """
    key = cls.configuration_key

    if yaml_path_override:
        preferences = get_config_from_yaml(yaml_path_override)

        # An explicit entry for this class wins over per-component preferences.
        explicit_entry = preferences.get(key)
        if explicit_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(explicit_entry))

        # No explicit entry: build every component from its own preferred configuration.
        return cls.from_config(preferences)

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    registered_factory = config.component_preference_factories.get(key)
    if registered_factory:
        return cls.subclass_from_factory(registered_factory)

    if config.component_preference_config_path is not None:
        # Same two-step lookup as for the YAML override, against the project-level preferences.
        explicit_entry = config.preferred_instances_config.get(key)
        if explicit_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(explicit_entry))

        return cls.from_config(config.preferred_instances_config)

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {key}")

search async #

search(query: str, config: SearchConfig | None = None) -> Sequence[Element]

Search for the most relevant chunks for a query.

PARAMETER DESCRIPTION
query

The query to search for.

TYPE: str

config

The search configuration.

TYPE: SearchConfig | None DEFAULT: None

RETURNS DESCRIPTION
Sequence[Element]

A list of chunks.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
async def search(self, query: str, config: SearchConfig | None = None) -> Sequence[Element]:
    """
    Find the chunks most relevant to a query.

    The query is rephrased first; every rephrased query is sent to the vector store in turn,
    and the per-query result groups are then handed to the reranker together with the
    original query.

    Args:
        query: The query to search for.
        config: The search configuration. A default `SearchConfig()` is used when omitted.

    Returns:
        The reranked chunks.
    """
    search_config = config or SearchConfig()
    rephrased_queries = await self.query_rephraser.rephrase(query)

    # NOTE: the `vectore_store` keyword is spelled as in the existing call and is kept unchanged,
    # since it is presumably recorded by name in the trace - confirm before renaming.
    with trace(
        queries=rephrased_queries,
        config=search_config,
        vectore_store=self.vector_store,
        reranker=self.reranker,
    ) as outputs:
        # One inner list of elements per rephrased query.
        grouped_elements = []

        for rephrased in rephrased_queries:
            hits = await self.vector_store.retrieve(
                text=rephrased,
                options=VectorStoreOptions(**search_config.vector_store_kwargs),
            )
            group = [Element.from_vector_db_entry(hit.entry) for hit in hits]
            grouped_elements.append(group)

        # Reranking is done against the original query, not the rephrased ones.
        outputs.search_results = await self.reranker.rerank(
            elements=grouped_elements,
            query=query,
            options=RerankerOptions(**search_config.reranker_kwargs),
        )
        return outputs.search_results

ingest async #

ingest(documents: str | Iterable[DocumentMeta | Document | Source]) -> IngestExecutionResult

Ingest documents into the search index.

PARAMETER DESCRIPTION
documents

Either:
- An iterable of Document, DocumentMeta, or Source objects
- A source-specific URI string (e.g., "gcs://bucket/*") to specify source location(s), for example:
    - "file:///path/to/files/*.txt"
    - "gcs://bucket/folder/*"
    - "huggingface://dataset/split/row"

TYPE: str | Iterable[DocumentMeta | Document | Source]

RETURNS DESCRIPTION
IngestExecutionResult

The ingest execution result.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py
@traceable
async def ingest(self, documents: str | Iterable[DocumentMeta | Document | Source]) -> IngestExecutionResult:
    """
    Ingest documents into the search index.

    Args:
        documents: Either:
            - An iterable of `Document`, `DocumentMeta`, or `Source` objects
            - A source-specific URI string (e.g., "gcs://bucket/*") to specify source location(s), for example:
                - "file:///path/to/files/*.txt"
                - "gcs://bucket/folder/*"
                - "huggingface://dataset/split/row"

    Returns:
        The ingest execution result.
    """
    # A string is treated as a URI and resolved first; any other input is used as given.
    if isinstance(documents, str):
        to_ingest = await SourceResolver.resolve(documents)
    else:
        to_ingest = documents

    return await self.ingest_strategy(
        documents=to_ingest,
        vector_store=self.vector_store,
        parser_router=self.parser_router,
        enricher_router=self.enricher_router,
    )