Document Search#

ragbits.document_search.DocumentSearchOptions #

Bases: Options, Generic[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]

Object representing the options for the document search.

ATTRIBUTE	DESCRIPTION
`query_rephraser_options`	The options for the query rephraser. TYPE: `QueryRephraserOptionsT \| None \| NotGiven`
`vector_store_options`	The options for the vector store. TYPE: `VectorStoreOptionsT \| None \| NotGiven`
`reranker_options`	The options for the reranker. TYPE: `RerankerOptionsT \| None \| NotGiven`

model_config `class-attribute` `instance-attribute` #

model_config = ConfigDict(extra='allow', arbitrary_types_allowed=True)

query_rephraser_options `class-attribute` `instance-attribute` #

query_rephraser_options: QueryRephraserOptionsT | None | NotGiven = NOT_GIVEN

vector_store_options `class-attribute` `instance-attribute` #

vector_store_options: VectorStoreOptionsT | None | NotGiven = NOT_GIVEN

reranker_options `class-attribute` `instance-attribute` #

reranker_options: RerankerOptionsT | None | NotGiven = NOT_GIVEN

dict #

dict() -> dict[str, Any]

Creates a dictionary representation of the Options instance. If a value is None, it will be replaced with a provider-specific not-given sentinel.

RETURNS	DESCRIPTION
`dict[str, Any]`	A dictionary representation of the Options instance.

Source code in packages/ragbits-core/src/ragbits/core/options.py

def dict(self) -> dict[str, Any]:  # type: ignore # mypy complains about overriding BaseModel.dict
    """
    Build a plain dictionary from the fields of this Options instance.

    Every field whose dumped value is None or a NotGiven instance is mapped to
    ``self._not_given``; every other value is passed through unchanged.

    Returns:
        A mapping of option names to their (possibly substituted) values.
    """
    converted = {}
    for name, raw in self.model_dump().items():
        # The None check comes first so the isinstance test only runs for real values.
        is_unset = raw is None or isinstance(raw, NotGiven)
        converted[name] = self._not_given if is_unset else raw

    return converted

ragbits.document_search.DocumentSearch #

DocumentSearch(vector_store: VectorStore[VectorStoreOptionsT], *, query_rephraser: QueryRephraser[QueryRephraserOptionsT] | None = None, reranker: Reranker[RerankerOptionsT] | None = None, default_options: DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] | None = None, ingest_strategy: IngestStrategy | None = None, parser_router: DocumentParserRouter | None = None, enricher_router: ElementEnricherRouter | None = None)

Bases: ConfigurableComponent[DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT]]

Main entrypoint to the document search functionality. It provides methods for document retrieval and ingestion.

Retrieval

Uses QueryRephraser to rephrase the query.
Uses VectorStore to retrieve the most relevant elements.
Uses Reranker to rerank the elements.

Ingestion

Uses IngestStrategy to orchestrate ingestion process.
Uses DocumentParserRouter to route the document to the appropriate DocumentParser to parse the content.
Uses ElementEnricherRouter to redirect the element to the appropriate ElementEnricher to enrich the element.

Initialize the DocumentSearch instance.

PARAMETER	DESCRIPTION
`vector_store`	The vector store to use for retrieval. TYPE: `VectorStore[VectorStoreOptionsT]`
`query_rephraser`	The query rephraser to use for retrieval. TYPE: `QueryRephraser[QueryRephraserOptionsT] \| None` DEFAULT: `None`
`reranker`	The reranker to use for retrieval. TYPE: `Reranker[RerankerOptionsT] \| None` DEFAULT: `None`
`default_options`	The default options for the search. TYPE: `DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] \| None` DEFAULT: `None`
`ingest_strategy`	The ingestion strategy to use for ingestion. TYPE: `IngestStrategy \| None` DEFAULT: `None`
`parser_router`	The document parser router to use for ingestion. TYPE: `DocumentParserRouter \| None` DEFAULT: `None`
`enricher_router`	The element enricher router to use for ingestion. TYPE: `ElementEnricherRouter \| None` DEFAULT: `None`

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py

def __init__(
    self,
    vector_store: VectorStore[VectorStoreOptionsT],
    *,
    query_rephraser: QueryRephraser[QueryRephraserOptionsT] | None = None,
    reranker: Reranker[RerankerOptionsT] | None = None,
    default_options: DocumentSearchOptions[
        QueryRephraserOptionsT,
        VectorStoreOptionsT,
        RerankerOptionsT,
    ]
    | None = None,
    ingest_strategy: IngestStrategy | None = None,
    parser_router: DocumentParserRouter | None = None,
    enricher_router: ElementEnricherRouter | None = None,
) -> None:
    """
    Set up a DocumentSearch with its retrieval and ingestion components.

    Only the vector store is mandatory. Each other component that is omitted
    (or falsy) is replaced by a default instance created here.

    Args:
        vector_store: The vector store used for retrieval and passed to the ingest strategy.
        query_rephraser: The query rephraser used for retrieval; defaults to NoopQueryRephraser().
        reranker: The reranker used for retrieval; defaults to NoopReranker().
        default_options: The default search options, forwarded to the base class initializer.
        ingest_strategy: The strategy that runs ingestion; defaults to SequentialIngestStrategy().
        parser_router: The document parser router used for ingestion; defaults to DocumentParserRouter().
        enricher_router: The element enricher router used for ingestion; defaults to ElementEnricherRouter().
    """
    super().__init__(default_options=default_options)

    # Retrieval components.
    self.vector_store = vector_store
    self.query_rephraser = query_rephraser if query_rephraser else NoopQueryRephraser()
    self.reranker = reranker if reranker else NoopReranker()

    # Ingestion components.
    self.ingest_strategy = ingest_strategy if ingest_strategy else SequentialIngestStrategy()
    self.parser_router = parser_router if parser_router else DocumentParserRouter()
    self.enricher_router = enricher_router if enricher_router else ElementEnricherRouter()

default_options `instance-attribute` #

default_options: OptionsT = default_options or options_cls()

options_cls `class-attribute` `instance-attribute` #

options_cls: type[DocumentSearchOptions] = DocumentSearchOptions

default_module `class-attribute` #

default_module: ModuleType | None = document_search

configuration_key `class-attribute` #

configuration_key: str = 'document_search'

vector_store `instance-attribute` #

vector_store = vector_store

query_rephraser `instance-attribute` #

query_rephraser = query_rephraser or NoopQueryRephraser()

reranker `instance-attribute` #

reranker = reranker or NoopReranker()

ingest_strategy `instance-attribute` #

ingest_strategy = ingest_strategy or SequentialIngestStrategy()

parser_router `instance-attribute` #

parser_router = parser_router or DocumentParserRouter()

enricher_router `instance-attribute` #

enricher_router = enricher_router or ElementEnricherRouter()

subclass_from_config `classmethod` #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER	DESCRIPTION
`config`	A model containing configuration details for the class. TYPE: `ObjectConstructionConfig`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

RAISES	DESCRIPTION
`InvalidConfigError`	The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Build an instance of this class, or of one of its subclasses, from a configuration model.

    The class named by ``config.type`` is imported (resolved against ``cls.default_module``)
    and then constructed through its own ``from_config`` using ``config.config``.

    Args:
        config: A model holding the type to instantiate and the configuration to pass to it.

    Returns:
        The instance created by the resolved class's ``from_config``.

    Raises:
        InvalidConfigError: The resolved class is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    if issubclass(subclass, cls):
        return subclass.from_config(config.config)

    raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

subclass_from_factory `classmethod` #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory. Supports both synchronous and asynchronous factory functions.

PARAMETER	DESCRIPTION
`factory_path`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided factory function.

RAISES	DESCRIPTION
`InvalidConfigError`	The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Creates the class using the provided factory function. May return a subclass of the class,
    if requested by the factory. Supports both synchronous and asynchronous factory functions.

    An asynchronous factory is run to completion before this method returns. If no event loop
    is running in the calling thread, it is run with ``asyncio.run``. If a loop is already
    running in the calling thread, the factory is run on a fresh event loop in a worker thread,
    and the calling thread blocks until it finishes.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        An instance of the class initialized with the provided factory function.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance
            of the current class.
    """
    factory = import_by_path(factory_path, cls.default_module)

    if asyncio.iscoroutinefunction(factory):
        # Only the loop lookup is guarded, so a RuntimeError raised by the factory
        # itself propagates instead of silently triggering a second factory run.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread, so one can be started here.
            obj = asyncio.run(factory())
        else:
            # A loop is already running in this thread. Waiting on a future scheduled
            # on that same loop would block it forever, so the coroutine is driven on
            # its own loop in a separate thread instead.
            from concurrent.futures import ThreadPoolExecutor

            with ThreadPoolExecutor(max_workers=1) as executor:
                obj = executor.submit(asyncio.run, factory()).result()
    else:
        obj = factory()

    if not isinstance(obj, cls):
        raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

    return obj

from_config `classmethod` #

from_config(config: dict) -> Self

Creates and returns an instance of the DocumentSearch class from the given configuration.

PARAMETER	DESCRIPTION
`config`	A configuration object containing the configuration for initializing the DocumentSearch instance. TYPE: `dict`

RETURNS	DESCRIPTION
`DocumentSearch`	An initialized instance of the DocumentSearch class. TYPE: `Self`

RAISES	DESCRIPTION
`ValidationError`	If the configuration doesn't follow the expected format.
`InvalidConfigError`	If one of the specified classes can't be found or is not the correct type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py

@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Build a DocumentSearch instance from a configuration dictionary.

    The dictionary is validated against DocumentSearchConfig, each component is
    constructed from its own section, and the components are passed to the class
    constructor.

    Args:
        config: The configuration describing the components of the DocumentSearch instance.

    Returns:
        DocumentSearch: An instance built from the configured components.

    Raises:
        ValidationError: If the configuration doesn't follow the expected format.
        InvalidConfigError: If one of the specified classes can't be found or is not the correct type.
    """
    validated = DocumentSearchConfig.model_validate(config)

    # Components are created in this order: retrieval pieces first, then ingestion pieces.
    components = {
        "query_rephraser": QueryRephraser.subclass_from_config(validated.rephraser),
        "vector_store": VectorStore.subclass_from_config(validated.vector_store),
        "reranker": Reranker.subclass_from_config(validated.reranker),
        "ingest_strategy": IngestStrategy.subclass_from_config(validated.ingest_strategy),
        "parser_router": DocumentParserRouter.from_config(validated.parser_router),
        "enricher_router": ElementEnricherRouter.from_config(validated.enricher_router),
    }

    return cls(**components)

preferred_subclass `classmethod` #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER	DESCRIPTION
`config`	The CoreConfig instance containing preferred factory and configuration details. TYPE: `CoreConfig`
`factory_path_override`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str \| None` DEFAULT: `None`
`yaml_path_override`	A string representing the path to the YAML file containing the Ragstack instance configuration. Looks for the configuration under the key "document_search", and if not found, instantiates the class with the preferred configuration for each component. TYPE: `Path \| None` DEFAULT: `None`

RAISES	DESCRIPTION
`InvalidConfigError`	If the default factory or configuration can't be found.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py

@classmethod
def preferred_subclass(
    cls,
    config: CoreConfig,
    factory_path_override: str | None = None,
    yaml_path_override: Path | None = None,
) -> Self:
    """
    Create an instance based on the project's component preferences.

    Sources are tried in this order, and the first one that applies is used:

    1. ``yaml_path_override``: the YAML file is loaded; the entry under the class's
       configuration key is used if present, otherwise the whole file is passed to ``from_config``.
    2. ``factory_path_override``: the given factory is used.
    3. The factory registered for the class's configuration key in
       ``config.component_preference_factories``.
    4. ``config.preferred_instances_config`` (only when ``config.component_preference_config_path``
       is set), handled the same way as the YAML file in step 1.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: The path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        The instance created from the first applicable source.

    Raises:
        NoPreferredConfigError: If none of the sources above applies.
    """
    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)

        # An explicit entry for this component takes priority over per-component preferences.
        explicit_config = yaml_preferences.get(cls.configuration_key)
        if explicit_config:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(explicit_config))

        return cls.from_config(yaml_preferences)

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    preferred_factory = config.component_preference_factories.get(cls.configuration_key)
    if preferred_factory:
        return cls.subclass_from_factory(preferred_factory)

    if config.component_preference_config_path is None:
        raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

    # An explicit entry for this component takes priority over per-component preferences.
    preferred_config = config.preferred_instances_config.get(cls.configuration_key)
    if preferred_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(preferred_config))

    return cls.from_config(config.preferred_instances_config)

search `async` #

search(query: str, options: DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] | None = None) -> Sequence[Element]

Search for the most relevant chunks for a query.

PARAMETER	DESCRIPTION
`query`	The query to search for. TYPE: `str`
`options`	The document search retrieval options. TYPE: `DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`Sequence[Element]`	A list of chunks.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py

async def search(
    self,
    query: str,
    options: DocumentSearchOptions[QueryRephraserOptionsT, VectorStoreOptionsT, RerankerOptionsT] | None = None,
) -> Sequence[Element]:
    """
    Find the most relevant elements for a query.

    The query is first rephrased into one or more queries. Each rephrased query is
    looked up in the vector store, one after another, and the grouped results are
    passed to the reranker together with the original query.

    Args:
        query: The query to search for.
        options: The search options; when given, they are merged over the default options.

    Returns:
        The elements returned by the reranker.
    """
    if options:
        merged_options = self.default_options | options
    else:
        merged_options = self.default_options

    # Falsy option values are normalised to None before being handed to the components.
    query_rephraser_options = merged_options.query_rephraser_options or None
    vector_store_options = merged_options.vector_store_options or None
    reranker_options = merged_options.reranker_options or None

    with trace(query=query, options=merged_options) as outputs:
        rephrased_queries = await self.query_rephraser.rephrase(query, query_rephraser_options)

        # One list of elements per rephrased query, in the order the queries were produced.
        elements = []
        for rephrased_query in rephrased_queries:
            hits = await self.vector_store.retrieve(rephrased_query, vector_store_options)
            elements.append([Element.from_vector_db_entry(hit.entry, hit.score) for hit in hits])

        # The reranker receives the original query, not the rephrased ones.
        outputs.results = await self.reranker.rerank(
            elements=elements,
            query=query,
            options=reranker_options,
        )

    return outputs.results

ingest `async` #

ingest(documents: str | Iterable[DocumentMeta | Document | Source], fail_on_error: bool = True) -> IngestExecutionResult

Ingest documents into the search index.

PARAMETER	DESCRIPTION
`documents`	A string representing a source-specific URI (e.g., "gcs://bucket/*") or an iterable of `Document`, `DocumentMeta`, or `Source` objects. Examples of URI formats include: - "file:///path/to/files/*.txt" - "gcs://bucket/folder/*" - "huggingface://dataset/split/row" TYPE: `str \| Iterable[DocumentMeta \| Document \| Source]`
`fail_on_error`	If True, raises IngestExecutionError when any errors are encountered during ingestion. If False, returns all errors encountered in the IngestExecutionResult. TYPE: `bool` DEFAULT: `True`

RETURNS	DESCRIPTION
`IngestExecutionResult`	An IngestExecutionResult containing the results of the ingestion process.

RAISES	DESCRIPTION
`IngestExecutionError`	If fail_on_error is True and any errors are encountered during ingestion.

Source code in packages/ragbits-document-search/src/ragbits/document_search/_main.py

@traceable
async def ingest(
    self,
    documents: str | Iterable[DocumentMeta | Document | Source],
    fail_on_error: bool = True,
) -> IngestExecutionResult:
    """
    Ingest documents into the search index.

    Args:
        documents: Either a source-specific URI string (e.g., "gcs://bucket/*"), which is resolved
                   through SourceResolver, or an iterable of `Document`, `DocumentMeta`, or `Source`
                   objects, which is used as given. Examples of URI formats include:
                   - "file:///path/to/files/*.txt"
                   - "gcs://bucket/folder/*"
                   - "huggingface://dataset/split/row"
        fail_on_error: If True, raises IngestExecutionError when the ingestion reports any failures.
                       If False, the failures are left in the returned IngestExecutionResult.

    Returns:
        The IngestExecutionResult produced by the ingest strategy.

    Raises:
        IngestExecutionError: If fail_on_error is True and the ingestion reports any failures.
    """
    if isinstance(documents, str):
        resolved_documents = await SourceResolver.resolve(documents)
    else:
        resolved_documents = documents

    results = await self.ingest_strategy(
        documents=resolved_documents,
        vector_store=self.vector_store,
        parser_router=self.parser_router,
        enricher_router=self.enricher_router,
    )

    # Failures are only inspected when the caller asked for them to be raised.
    if not fail_on_error or not results.failed:
        return results

    raise IngestExecutionError(results.failed)

Document Search#

ragbits.document_search.DocumentSearchOptions #

model_config class-attribute instance-attribute #

query_rephraser_options class-attribute instance-attribute #

vector_store_options class-attribute instance-attribute #

reranker_options class-attribute instance-attribute #

dict #

ragbits.document_search.DocumentSearch #

default_options instance-attribute #

options_cls class-attribute instance-attribute #

default_module class-attribute #

configuration_key class-attribute #

vector_store instance-attribute #

query_rephraser instance-attribute #

reranker instance-attribute #

ingest_strategy instance-attribute #

parser_router instance-attribute #

enricher_router instance-attribute #

subclass_from_config classmethod #

subclass_from_factory classmethod #

from_config classmethod #

preferred_subclass classmethod #

search async #

ingest async #

model_config `class-attribute` `instance-attribute` #

query_rephraser_options `class-attribute` `instance-attribute` #

vector_store_options `class-attribute` `instance-attribute` #

reranker_options `class-attribute` `instance-attribute` #

default_options `instance-attribute` #

options_cls `class-attribute` `instance-attribute` #

default_module `class-attribute` #

configuration_key `class-attribute` #

vector_store `instance-attribute` #

query_rephraser `instance-attribute` #

reranker `instance-attribute` #

ingest_strategy `instance-attribute` #

parser_router `instance-attribute` #

enricher_router `instance-attribute` #

subclass_from_config `classmethod` #

subclass_from_factory `classmethod` #

from_config `classmethod` #

preferred_subclass `classmethod` #

search `async` #

ingest `async` #