Execution Strategies#

ragbits.document_search.ingestion.processor_strategies.ProcessingExecutionStrategy #

Bases: WithConstructionConfig, ABC

Base class for processing execution strategies that define how documents are processed to become elements.

Processing execution strategies are responsible for processing documents using the appropriate processor, which means that they don't usually determine the business logic of the processing itself, but rather how the processing is executed.

configuration_key `class-attribute` #

configuration_key: str

default_module `class-attribute` `instance-attribute` #

default_module: ClassVar = processor_strategies

subclass_from_config `classmethod` #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER	DESCRIPTION
`config`	A model containing configuration details for the class. TYPE: `ObjectContructionConfig`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

RAISES	DESCRIPTION
`InvalidConfigError`	The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Build an instance of this class, or of a subclass named in the configuration.

    The object referenced by ``config.type`` is looked up with ``import_by_path`` (which also
    receives ``cls.default_module``) and is then instantiated through its ``from_config``
    method using ``config.config``.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found, the resolved object is not a class,
            or it is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError for non-class arguments (e.g. a function or a module),
    # so verify we got a class first and report the problem as an invalid configuration.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory `classmethod` #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER	DESCRIPTION
`factory_path`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided factory function.

RAISES	DESCRIPTION
`InvalidConfigError`	The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is called without arguments. Its result is handed back as long as it is an
    instance of the current class (instances of subclasses qualify as well).

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the
            current class. The lookup done by import_by_path (not shown here) is also
            documented to fail with this error when the factory can't be found.
    """
    produced = import_by_path(factory_path, cls.default_module)()
    if isinstance(produced, cls):
        return produced
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

subclass_from_defaults `classmethod` #

subclass_from_defaults(defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the default configuration file and the default factory function. Takes optional overrides for both, which take higher precedence than the defaults.

PARAMETER	DESCRIPTION
`defaults`	The CoreConfig instance containing default factory and configuration details. TYPE: `CoreConfig`
`factory_path_override`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str \| None` DEFAULT: `None`
`yaml_path_override`	The path to the YAML file containing the Ragstack instance configuration. TYPE: `Path \| None` DEFAULT: `None`

RAISES	DESCRIPTION
`NoDefaultConfigError`	If no override applies and the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_defaults(
    cls, defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance from explicit overrides, falling back to the configured defaults.

    Sources are tried in this order and the first usable one wins:

    1. the ``cls.configuration_key`` entry of the YAML file at ``yaml_path_override``,
    2. the factory at ``factory_path_override``,
    3. the default factory found in ``defaults.default_factories``,
    4. the default configuration found in ``defaults.default_instances_config``.

    A YAML override whose file has no (or an empty) entry for ``cls.configuration_key``
    is skipped and the remaining sources are consulted.

    Args:
        defaults: The CoreConfig instance containing default factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing
            the Ragstack instance configuration.

    Returns:
        An instance built from the first usable source.

    Raises:
        NoDefaultConfigError: If none of the sources provides a factory or a configuration.
    """
    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_entry))

    # The explicit factory override beats the default one; the default is only looked up
    # when no override was given.
    factory_path = factory_path_override or defaults.default_factories.get(cls.configuration_key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    fallback_config = defaults.default_instances_config.get(cls.configuration_key)
    if fallback_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(fallback_config))

    raise NoDefaultConfigError(f"Could not find default factory or configuration for {cls.configuration_key}")

from_config `classmethod` #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER	DESCRIPTION
`config`	A dictionary containing configuration details for the class. TYPE: `dict`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

to_document_meta `async` `staticmethod` #

to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta

Convert a document, document meta or source to a document meta object.

PARAMETER	DESCRIPTION
`document`	The document to convert. TYPE: `DocumentMeta \| Document \| Source`

RETURNS	DESCRIPTION
`DocumentMeta`	The document meta object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

@staticmethod
async def to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta:
    """
    Normalize any of the accepted document inputs to a document meta object.

    Args:
        document: The document to convert.

    Returns:
        The document meta object.
    """
    if isinstance(document, Source):
        # A bare source has to be turned into metadata first.
        return await DocumentMeta.from_source(document)
    if isinstance(document, DocumentMeta):
        # Already in the target form.
        return document
    # Anything else is treated as a Document and its metadata attribute is used.
    return document.metadata

process_document `async` #

process_document(document: DocumentMeta | Document | Source, processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process a single document and return the elements.

PARAMETER	DESCRIPTION
`document`	The document to process. TYPE: `DocumentMeta \| Document \| Source`
`processor_router`	The document processor router to use. TYPE: `DocumentProcessorRouter`
`processor_overwrite`	Forces the use of a specific processor, instead of the one provided by the router. TYPE: `BaseProvider \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list[Element]`	A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

async def process_document(
    self,
    document: DocumentMeta | Document | Source,
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn one document into its elements.

    The input is first normalized with to_document_meta; the resulting metadata is then
    handed to the selected processor.

    Args:
        document: The document to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    meta = await self.to_document_meta(document)
    if processor_overwrite:
        provider = processor_overwrite
    else:
        # No forced processor: let the router pick one for this document.
        provider = processor_router.get_provider(meta)
    elements = await provider.process(meta)
    return elements

process_documents `abstractmethod` `async` #

process_documents(documents: Sequence[DocumentMeta | Document | Source], processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process documents using the given processor and return the resulting elements.

PARAMETER	DESCRIPTION
`documents`	The documents to process. TYPE: `Sequence[DocumentMeta \| Document \| Source]`
`processor_router`	The document processor router to use. TYPE: `DocumentProcessorRouter`
`processor_overwrite`	Forces the use of a specific processor, instead of the one provided by the router. TYPE: `BaseProvider \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list[Element]`	A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

@abstractmethod
async def process_documents(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Process documents using the given processor and return the resulting elements.

    This method is abstract: every concrete strategy has to provide its own implementation.

    Args:
        documents: The documents to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """

ragbits.document_search.ingestion.processor_strategies.SequentialProcessing #

Bases: ProcessingExecutionStrategy

A processing execution strategy that processes documents in sequence, one at a time.

default_module `class-attribute` `instance-attribute` #

default_module: ClassVar = processor_strategies

configuration_key `class-attribute` #

configuration_key: str

subclass_from_config `classmethod` #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER	DESCRIPTION
`config`	A model containing configuration details for the class. TYPE: `ObjectContructionConfig`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

RAISES	DESCRIPTION
`InvalidConfigError`	The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Build an instance of this class, or of a subclass named in the configuration.

    The object referenced by ``config.type`` is looked up with ``import_by_path`` (which also
    receives ``cls.default_module``) and is then instantiated through its ``from_config``
    method using ``config.config``.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found, the resolved object is not a class,
            or it is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError for non-class arguments (e.g. a function or a module),
    # so verify we got a class first and report the problem as an invalid configuration.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory `classmethod` #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER	DESCRIPTION
`factory_path`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided factory function.

RAISES	DESCRIPTION
`InvalidConfigError`	The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is called without arguments. Its result is handed back as long as it is an
    instance of the current class (instances of subclasses qualify as well).

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the
            current class. The lookup done by import_by_path (not shown here) is also
            documented to fail with this error when the factory can't be found.
    """
    produced = import_by_path(factory_path, cls.default_module)()
    if isinstance(produced, cls):
        return produced
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

subclass_from_defaults `classmethod` #

subclass_from_defaults(defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the default configuration file and the default factory function. Takes optional overrides for both, which take higher precedence than the defaults.

PARAMETER	DESCRIPTION
`defaults`	The CoreConfig instance containing default factory and configuration details. TYPE: `CoreConfig`
`factory_path_override`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str \| None` DEFAULT: `None`
`yaml_path_override`	The path to the YAML file containing the Ragstack instance configuration. TYPE: `Path \| None` DEFAULT: `None`

RAISES	DESCRIPTION
`NoDefaultConfigError`	If no override applies and the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_defaults(
    cls, defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance from explicit overrides, falling back to the configured defaults.

    Sources are tried in this order and the first usable one wins:

    1. the ``cls.configuration_key`` entry of the YAML file at ``yaml_path_override``,
    2. the factory at ``factory_path_override``,
    3. the default factory found in ``defaults.default_factories``,
    4. the default configuration found in ``defaults.default_instances_config``.

    A YAML override whose file has no (or an empty) entry for ``cls.configuration_key``
    is skipped and the remaining sources are consulted.

    Args:
        defaults: The CoreConfig instance containing default factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing
            the Ragstack instance configuration.

    Returns:
        An instance built from the first usable source.

    Raises:
        NoDefaultConfigError: If none of the sources provides a factory or a configuration.
    """
    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_entry))

    # The explicit factory override beats the default one; the default is only looked up
    # when no override was given.
    factory_path = factory_path_override or defaults.default_factories.get(cls.configuration_key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    fallback_config = defaults.default_instances_config.get(cls.configuration_key)
    if fallback_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(fallback_config))

    raise NoDefaultConfigError(f"Could not find default factory or configuration for {cls.configuration_key}")

from_config `classmethod` #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER	DESCRIPTION
`config`	A dictionary containing configuration details for the class. TYPE: `dict`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

to_document_meta `async` `staticmethod` #

to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta

Convert a document, document meta or source to a document meta object.

PARAMETER	DESCRIPTION
`document`	The document to convert. TYPE: `DocumentMeta \| Document \| Source`

RETURNS	DESCRIPTION
`DocumentMeta`	The document meta object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

@staticmethod
async def to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta:
    """
    Normalize any of the accepted document inputs to a document meta object.

    Args:
        document: The document to convert.

    Returns:
        The document meta object.
    """
    if isinstance(document, Source):
        # A bare source has to be turned into metadata first.
        return await DocumentMeta.from_source(document)
    if isinstance(document, DocumentMeta):
        # Already in the target form.
        return document
    # Anything else is treated as a Document and its metadata attribute is used.
    return document.metadata

process_document `async` #

process_document(document: DocumentMeta | Document | Source, processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process a single document and return the elements.

PARAMETER	DESCRIPTION
`document`	The document to process. TYPE: `DocumentMeta \| Document \| Source`
`processor_router`	The document processor router to use. TYPE: `DocumentProcessorRouter`
`processor_overwrite`	Forces the use of a specific processor, instead of the one provided by the router. TYPE: `BaseProvider \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list[Element]`	A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

async def process_document(
    self,
    document: DocumentMeta | Document | Source,
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn one document into its elements.

    The input is first normalized with to_document_meta; the resulting metadata is then
    handed to the selected processor.

    Args:
        document: The document to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    meta = await self.to_document_meta(document)
    if processor_overwrite:
        provider = processor_overwrite
    else:
        # No forced processor: let the router pick one for this document.
        provider = processor_router.get_provider(meta)
    elements = await provider.process(meta)
    return elements

process_documents `async` #

process_documents(documents: Sequence[DocumentMeta | Document | Source], processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process documents using the given processor and return the resulting elements.

PARAMETER	DESCRIPTION
`documents`	The documents to process. TYPE: `Sequence[DocumentMeta \| Document \| Source]`
`processor_router`	The document processor router to use. TYPE: `DocumentProcessorRouter`
`processor_overwrite`	Forces the use of a specific processor, instead of the one provided by the router. TYPE: `BaseProvider \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list[Element]`	A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py

async def process_documents(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Process the documents one after another and collect all resulting elements.

    Each document is awaited to completion before the next one is started, so the
    returned elements follow the order of the input documents.

    Args:
        documents: The documents to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    # The awaits inside the comprehension run strictly in sequence.
    per_document = [
        await self.process_document(document, processor_router, processor_overwrite) for document in documents
    ]
    return [element for chunk in per_document for element in chunk]

ragbits.document_search.ingestion.processor_strategies.BatchedAsyncProcessing #

BatchedAsyncProcessing(batch_size: int = 10)

Bases: ProcessingExecutionStrategy

A processing execution strategy that processes documents asynchronously in batches.

Initialize the BatchedAsyncProcessing instance.

PARAMETER	DESCRIPTION
`batch_size`	The size of the batch to process documents in. TYPE: `int` DEFAULT: `10`

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py

def __init__(self, batch_size: int = 10):
    """
    Initialize the BatchedAsyncProcessing instance.

    Args:
        batch_size: The size of the batch to process documents in. Must be at least 1.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    # batch_size seeds the asyncio.Semaphore created in process_documents. A semaphore
    # starting at 0 can never be acquired and a negative value is only rejected once
    # processing starts, so fail early with a clear message instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    self.batch_size = batch_size

default_module `class-attribute` `instance-attribute` #

default_module: ClassVar = processor_strategies

configuration_key `class-attribute` #

configuration_key: str

batch_size `instance-attribute` #

batch_size = batch_size

subclass_from_config `classmethod` #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER	DESCRIPTION
`config`	A model containing configuration details for the class. TYPE: `ObjectContructionConfig`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

RAISES	DESCRIPTION
`InvalidConfigError`	The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Build an instance of this class, or of a subclass named in the configuration.

    The object referenced by ``config.type`` is looked up with ``import_by_path`` (which also
    receives ``cls.default_module``) and is then instantiated through its ``from_config``
    method using ``config.config``.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found, the resolved object is not a class,
            or it is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError for non-class arguments (e.g. a function or a module),
    # so verify we got a class first and report the problem as an invalid configuration.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory `classmethod` #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER	DESCRIPTION
`factory_path`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided factory function.

RAISES	DESCRIPTION
`InvalidConfigError`	The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is called without arguments. Its result is handed back as long as it is an
    instance of the current class (instances of subclasses qualify as well).

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the
            current class. The lookup done by import_by_path (not shown here) is also
            documented to fail with this error when the factory can't be found.
    """
    produced = import_by_path(factory_path, cls.default_module)()
    if isinstance(produced, cls):
        return produced
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

subclass_from_defaults `classmethod` #

subclass_from_defaults(defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the default configuration file and the default factory function. Takes optional overrides for both, which take higher precedence than the defaults.

PARAMETER	DESCRIPTION
`defaults`	The CoreConfig instance containing default factory and configuration details. TYPE: `CoreConfig`
`factory_path_override`	A string representing the path to the factory function in the format of "module.submodule:factory_name". TYPE: `str \| None` DEFAULT: `None`
`yaml_path_override`	The path to the YAML file containing the Ragstack instance configuration. TYPE: `Path \| None` DEFAULT: `None`

RAISES	DESCRIPTION
`NoDefaultConfigError`	If no override applies and the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def subclass_from_defaults(
    cls, defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance from explicit overrides, falling back to the configured defaults.

    Sources are tried in this order and the first usable one wins:

    1. the ``cls.configuration_key`` entry of the YAML file at ``yaml_path_override``,
    2. the factory at ``factory_path_override``,
    3. the default factory found in ``defaults.default_factories``,
    4. the default configuration found in ``defaults.default_instances_config``.

    A YAML override whose file has no (or an empty) entry for ``cls.configuration_key``
    is skipped and the remaining sources are consulted.

    Args:
        defaults: The CoreConfig instance containing default factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing
            the Ragstack instance configuration.

    Returns:
        An instance built from the first usable source.

    Raises:
        NoDefaultConfigError: If none of the sources provides a factory or a configuration.
    """
    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_entry))

    # The explicit factory override beats the default one; the default is only looked up
    # when no override was given.
    factory_path = factory_path_override or defaults.default_factories.get(cls.configuration_key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    fallback_config = defaults.default_instances_config.get(cls.configuration_key)
    if fallback_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(fallback_config))

    raise NoDefaultConfigError(f"Could not find default factory or configuration for {cls.configuration_key}")

from_config `classmethod` #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER	DESCRIPTION
`config`	A dictionary containing configuration details for the class. TYPE: `dict`

RETURNS	DESCRIPTION
`Self`	An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py

@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

to_document_meta `async` `staticmethod` #

to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta

Convert a document, document meta or source to a document meta object.

PARAMETER	DESCRIPTION
`document`	The document to convert. TYPE: `DocumentMeta \| Document \| Source`

RETURNS	DESCRIPTION
`DocumentMeta`	The document meta object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

@staticmethod
async def to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta:
    """
    Normalize any of the accepted document inputs to a document meta object.

    Args:
        document: The document to convert.

    Returns:
        The document meta object.
    """
    if isinstance(document, Source):
        # A bare source has to be turned into metadata first.
        return await DocumentMeta.from_source(document)
    if isinstance(document, DocumentMeta):
        # Already in the target form.
        return document
    # Anything else is treated as a Document and its metadata attribute is used.
    return document.metadata

process_document `async` #

process_document(document: DocumentMeta | Document | Source, processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process a single document and return the elements.

PARAMETER	DESCRIPTION
`document`	The document to process. TYPE: `DocumentMeta \| Document \| Source`
`processor_router`	The document processor router to use. TYPE: `DocumentProcessorRouter`
`processor_overwrite`	Forces the use of a specific processor, instead of the one provided by the router. TYPE: `BaseProvider \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list[Element]`	A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py

async def process_document(
    self,
    document: DocumentMeta | Document | Source,
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn one document into its elements.

    The input is first normalized with to_document_meta; the resulting metadata is then
    handed to the selected processor.

    Args:
        document: The document to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    meta = await self.to_document_meta(document)
    if processor_overwrite:
        provider = processor_overwrite
    else:
        # No forced processor: let the router pick one for this document.
        provider = processor_router.get_provider(meta)
    elements = await provider.process(meta)
    return elements

process_documents `async` #

process_documents(documents: Sequence[DocumentMeta | Document | Source], processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process documents using the given processor and return the resulting elements.

PARAMETER	DESCRIPTION
`documents`	The documents to process. TYPE: `Sequence[DocumentMeta \| Document \| Source]`
`processor_router`	The document processor router to use. TYPE: `DocumentProcessorRouter`
`processor_overwrite`	Forces the use of a specific processor, instead of the one provided by the router. TYPE: `BaseProvider \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list[Element]`	A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py

async def process_documents(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Process all documents concurrently and return the resulting elements as one flat list.

    A single semaphore initialised with ``self.batch_size`` is shared by every per-document
    call to ``_process_with_semaphore`` (not shown here; presumably it uses the semaphore to
    cap how many documents are in flight at once).

    Args:
        documents: The documents to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    limiter = asyncio.Semaphore(self.batch_size)
    pending = [
        self._process_with_semaphore(limiter, document, processor_router, processor_overwrite)
        for document in documents
    ]
    per_document = await asyncio.gather(*pending)

    # Merge the per-document results into a single list, keeping the order gather returned.
    flattened: list[Element] = []
    for chunk in per_document:
        flattened.extend(chunk)
    return flattened

Execution Strategies#

ragbits.document_search.ingestion.processor_strategies.ProcessingExecutionStrategy #

configuration_key class-attribute #

default_module class-attribute instance-attribute #

subclass_from_config classmethod #

subclass_from_factory classmethod #

subclass_from_defaults classmethod #

from_config classmethod #

to_document_meta async staticmethod #

process_document async #

process_documents abstractmethod async #

ragbits.document_search.ingestion.processor_strategies.SequentialProcessing #

default_module class-attribute instance-attribute #

configuration_key class-attribute #

subclass_from_config classmethod #

subclass_from_factory classmethod #

subclass_from_defaults classmethod #

from_config classmethod #

to_document_meta async staticmethod #

process_document async #

process_documents async #

ragbits.document_search.ingestion.processor_strategies.BatchedAsyncProcessing #

default_module class-attribute instance-attribute #

configuration_key class-attribute #

batch_size instance-attribute #

subclass_from_config classmethod #

subclass_from_factory classmethod #

subclass_from_defaults classmethod #

from_config classmethod #

to_document_meta async staticmethod #

process_document async #

process_documents async #

configuration_key `class-attribute` #

default_module `class-attribute` `instance-attribute` #

subclass_from_config `classmethod` #

subclass_from_factory `classmethod` #

subclass_from_defaults `classmethod` #

from_config `classmethod` #

to_document_meta `async` `staticmethod` #

process_document `async` #

process_documents `abstractmethod` `async` #

default_module `class-attribute` `instance-attribute` #

configuration_key `class-attribute` #

subclass_from_config `classmethod` #

subclass_from_factory `classmethod` #

subclass_from_defaults `classmethod` #

from_config `classmethod` #

to_document_meta `async` `staticmethod` #

process_document `async` #

process_documents `async` #

default_module `class-attribute` `instance-attribute` #

configuration_key `class-attribute` #

batch_size `instance-attribute` #

subclass_from_config `classmethod` #

subclass_from_factory `classmethod` #

subclass_from_defaults `classmethod` #

from_config `classmethod` #

to_document_meta `async` `staticmethod` #

process_document `async` #

process_documents `async` #