Skip to content

Execution Strategies#

ragbits.document_search.ingestion.processor_strategies.ProcessingExecutionStrategy #

Bases: WithConstructionConfig, ABC

Base class for processing execution strategies that define how documents are processed to become elements.

Processing execution strategies are responsible for processing documents using the appropriate processor, which means that they don't usually determine the business logic of the processing itself, but rather how the processing is executed.

configuration_key class-attribute #

configuration_key: str

default_module class-attribute instance-attribute #

default_module: ClassVar = processor_strategies

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The object named by `config.type` is resolved with `import_by_path` (with `cls.default_module`
    passed as the second argument) and the instance is then built by that class' own `from_config`
    from `config.config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The resolved object is not a class or is not a subclass of the current class.
            Lookup failures are presumably reported by `import_by_path` - verify there.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (e.g. the path points
    # at a function or a constant), so non-classes are rejected explicitly and reported the same
    # way as any other invalid type.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function looked up by its path.

    The factory is resolved with `import_by_path` and called without arguments. The object it
    returns may be an instance of a subclass of the current class.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the current class.
    """
    factory_fn = import_by_path(factory_path, cls.default_module)
    instance = factory_fn()
    # Only objects that are instances of the current class (or of its subclasses) are accepted.
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

subclass_from_defaults classmethod #

subclass_from_defaults(defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the default configuration file and the default factory function. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
defaults

The CoreConfig instance containing default factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If neither a default factory nor a default configuration can be found (the code raises NoDefaultConfigError).

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_defaults(
    cls, defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Build an instance from the first configuration source that applies.

    The sources are tried in this order:
        1. the entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`,
        2. the factory at `factory_path_override`,
        3. the default factory registered in `defaults` under `cls.configuration_key`,
        4. the default instance configuration registered in `defaults` under `cls.configuration_key`.

    Args:
        defaults: The CoreConfig instance containing default factory and configuration details.
        factory_path_override: The path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: The path to a YAML file containing the instance configuration.

    Returns:
        The instance created from the first source that provides a value.

    Raises:
        NoDefaultConfigError: If none of the sources provides a factory or a configuration.
    """
    if yaml_path_override:
        yaml_config = get_config_from_yaml(yaml_path_override)
        yaml_section = yaml_config.get(cls.configuration_key)
        # A YAML file without an entry for this class falls through to the next sources.
        if yaml_section:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_section))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    registered_factory = defaults.default_factories.get(cls.configuration_key)
    if registered_factory:
        return cls.subclass_from_factory(registered_factory)

    registered_config = defaults.default_instances_config.get(cls.configuration_key)
    if registered_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(registered_config))

    raise NoDefaultConfigError(f"Could not find default factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Create an instance by passing the configuration entries as keyword arguments.

    Args:
        config: A dictionary whose items are forwarded to the class constructor as keyword arguments.

    Returns:
        The newly constructed instance of the class.
    """
    instance = cls(**config)
    return instance

to_document_meta async staticmethod #

to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta

Convert a document, document meta or source to a document meta object.

PARAMETER DESCRIPTION
document

The document to convert.

TYPE: DocumentMeta | Document | Source

RETURNS DESCRIPTION
DocumentMeta

The document meta object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
@staticmethod
async def to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta:
    """
    Normalize any of the supported inputs to a document meta object.

    Args:
        document: A source, a document meta or a document.

    Returns:
        The meta created from the source, the input itself when it already is a document meta,
        or the `metadata` attribute of the input otherwise.
    """
    # A source is the only input that requires asynchronous work to obtain its meta.
    if isinstance(document, Source):
        return await DocumentMeta.from_source(document)
    # Anything that is not a document meta is expected to expose it as `metadata`.
    return document if isinstance(document, DocumentMeta) else document.metadata

process_document async #

process_document(document: DocumentMeta | Document | Source, processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process a single document and return the elements.

PARAMETER DESCRIPTION
document

The document to process.

TYPE: DocumentMeta | Document | Source

processor_router

The document processor router to use.

TYPE: DocumentProcessorRouter

processor_overwrite

Forces the use of a specific processor, instead of the one provided by the router.

TYPE: BaseProvider | None DEFAULT: None

RETURNS DESCRIPTION
list[Element]

A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
async def process_document(
    self,
    document: DocumentMeta | Document | Source,
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn a single document into elements.

    The document is first converted with `to_document_meta`, then handed to the selected processor.

    Args:
        document: The document to process.
        processor_router: The router asked for a processor when no overwrite is given.
        processor_overwrite: When provided (and truthy), used instead of the processor chosen by the router.

    Returns:
        The elements returned by the processor.
    """
    meta = await self.to_document_meta(document)
    # The router is consulted only when no overwrite was supplied.
    if processor_overwrite:
        selected = processor_overwrite
    else:
        selected = processor_router.get_provider(meta)
    return await selected.process(meta)

process_documents abstractmethod async #

process_documents(documents: Sequence[DocumentMeta | Document | Source], processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process documents using the given processor and return the resulting elements.

PARAMETER DESCRIPTION
documents

The documents to process.

TYPE: Sequence[DocumentMeta | Document | Source]

processor_router

The document processor router to use.

TYPE: DocumentProcessorRouter

processor_overwrite

Forces the use of a specific processor, instead of the one provided by the router.

TYPE: BaseProvider | None DEFAULT: None

RETURNS DESCRIPTION
list[Element]

A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
@abstractmethod
async def process_documents(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn the given documents into elements; how the work is executed is left to the implementation.

    Args:
        documents: The documents to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """

ragbits.document_search.ingestion.processor_strategies.SequentialProcessing #

Bases: ProcessingExecutionStrategy

A processing execution strategy that processes documents in sequence, one at a time.

default_module class-attribute instance-attribute #

default_module: ClassVar = processor_strategies

configuration_key class-attribute #

configuration_key: str

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The object named by `config.type` is resolved with `import_by_path` (with `cls.default_module`
    passed as the second argument) and the instance is then built by that class' own `from_config`
    from `config.config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The resolved object is not a class or is not a subclass of the current class.
            Lookup failures are presumably reported by `import_by_path` - verify there.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (e.g. the path points
    # at a function or a constant), so non-classes are rejected explicitly and reported the same
    # way as any other invalid type.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function looked up by its path.

    The factory is resolved with `import_by_path` and called without arguments. The object it
    returns may be an instance of a subclass of the current class.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the current class.
    """
    factory_fn = import_by_path(factory_path, cls.default_module)
    instance = factory_fn()
    # Only objects that are instances of the current class (or of its subclasses) are accepted.
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

subclass_from_defaults classmethod #

subclass_from_defaults(defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the default configuration file and the default factory function. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
defaults

The CoreConfig instance containing default factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If neither a default factory nor a default configuration can be found (the code raises NoDefaultConfigError).

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_defaults(
    cls, defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Build an instance from the first configuration source that applies.

    The sources are tried in this order:
        1. the entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`,
        2. the factory at `factory_path_override`,
        3. the default factory registered in `defaults` under `cls.configuration_key`,
        4. the default instance configuration registered in `defaults` under `cls.configuration_key`.

    Args:
        defaults: The CoreConfig instance containing default factory and configuration details.
        factory_path_override: The path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: The path to a YAML file containing the instance configuration.

    Returns:
        The instance created from the first source that provides a value.

    Raises:
        NoDefaultConfigError: If none of the sources provides a factory or a configuration.
    """
    if yaml_path_override:
        yaml_config = get_config_from_yaml(yaml_path_override)
        yaml_section = yaml_config.get(cls.configuration_key)
        # A YAML file without an entry for this class falls through to the next sources.
        if yaml_section:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_section))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    registered_factory = defaults.default_factories.get(cls.configuration_key)
    if registered_factory:
        return cls.subclass_from_factory(registered_factory)

    registered_config = defaults.default_instances_config.get(cls.configuration_key)
    if registered_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(registered_config))

    raise NoDefaultConfigError(f"Could not find default factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Create an instance by passing the configuration entries as keyword arguments.

    Args:
        config: A dictionary whose items are forwarded to the class constructor as keyword arguments.

    Returns:
        The newly constructed instance of the class.
    """
    instance = cls(**config)
    return instance

to_document_meta async staticmethod #

to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta

Convert a document, document meta or source to a document meta object.

PARAMETER DESCRIPTION
document

The document to convert.

TYPE: DocumentMeta | Document | Source

RETURNS DESCRIPTION
DocumentMeta

The document meta object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
@staticmethod
async def to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta:
    """
    Normalize any of the supported inputs to a document meta object.

    Args:
        document: A source, a document meta or a document.

    Returns:
        The meta created from the source, the input itself when it already is a document meta,
        or the `metadata` attribute of the input otherwise.
    """
    # A source is the only input that requires asynchronous work to obtain its meta.
    if isinstance(document, Source):
        return await DocumentMeta.from_source(document)
    # Anything that is not a document meta is expected to expose it as `metadata`.
    return document if isinstance(document, DocumentMeta) else document.metadata

process_document async #

process_document(document: DocumentMeta | Document | Source, processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process a single document and return the elements.

PARAMETER DESCRIPTION
document

The document to process.

TYPE: DocumentMeta | Document | Source

processor_router

The document processor router to use.

TYPE: DocumentProcessorRouter

processor_overwrite

Forces the use of a specific processor, instead of the one provided by the router.

TYPE: BaseProvider | None DEFAULT: None

RETURNS DESCRIPTION
list[Element]

A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
async def process_document(
    self,
    document: DocumentMeta | Document | Source,
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn a single document into elements.

    The document is first converted with `to_document_meta`, then handed to the selected processor.

    Args:
        document: The document to process.
        processor_router: The router asked for a processor when no overwrite is given.
        processor_overwrite: When provided (and truthy), used instead of the processor chosen by the router.

    Returns:
        The elements returned by the processor.
    """
    meta = await self.to_document_meta(document)
    # The router is consulted only when no overwrite was supplied.
    if processor_overwrite:
        selected = processor_overwrite
    else:
        selected = processor_router.get_provider(meta)
    return await selected.process(meta)

process_documents async #

process_documents(documents: Sequence[DocumentMeta | Document | Source], processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process documents using the given processor and return the resulting elements.

PARAMETER DESCRIPTION
documents

The documents to process.

TYPE: Sequence[DocumentMeta | Document | Source]

processor_router

The document processor router to use.

TYPE: DocumentProcessorRouter

processor_overwrite

Forces the use of a specific processor, instead of the one provided by the router.

TYPE: BaseProvider | None DEFAULT: None

RETURNS DESCRIPTION
list[Element]

A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/sequential.py
async def process_documents(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Process the documents one after another and collect their elements in input order.

    Args:
        documents: The documents to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    # Each document is awaited before the next one starts, so processing stays sequential.
    return [
        element
        for document in documents
        for element in await self.process_document(document, processor_router, processor_overwrite)
    ]

ragbits.document_search.ingestion.processor_strategies.BatchedAsyncProcessing #

BatchedAsyncProcessing(batch_size: int = 10)

Bases: ProcessingExecutionStrategy

A processing execution strategy that processes documents asynchronously in batches.

Initialize the BatchedAsyncProcessing instance.

PARAMETER DESCRIPTION
batch_size

The size of the batch to process documents in.

TYPE: int DEFAULT: 10

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py
def __init__(self, batch_size: int = 10):
    """
    Initialize the BatchedAsyncProcessing instance.

    Args:
        batch_size: The size of the batch to process documents in. Must be at least 1.

    Raises:
        ValueError: If batch_size is lower than 1.
    """
    # batch_size becomes the initial value of the asyncio.Semaphore created in process_documents:
    # 0 would leave every document waiting forever and a negative value is rejected by asyncio
    # only once processing starts, so fail fast here with a clear message instead.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    self.batch_size = batch_size

default_module class-attribute instance-attribute #

default_module: ClassVar = processor_strategies

configuration_key class-attribute #

configuration_key: str

batch_size instance-attribute #

batch_size = batch_size

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The object named by `config.type` is resolved with `import_by_path` (with `cls.default_module`
    passed as the second argument) and the instance is then built by that class' own `from_config`
    from `config.config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The resolved object is not a class or is not a subclass of the current class.
            Lookup failures are presumably reported by `import_by_path` - verify there.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (e.g. the path points
    # at a function or a constant), so non-classes are rejected explicitly and reported the same
    # way as any other invalid type.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function looked up by its path.

    The factory is resolved with `import_by_path` and called without arguments. The object it
    returns may be an instance of a subclass of the current class.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the current class.
    """
    factory_fn = import_by_path(factory_path, cls.default_module)
    instance = factory_fn()
    # Only objects that are instances of the current class (or of its subclasses) are accepted.
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

subclass_from_defaults classmethod #

subclass_from_defaults(defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the default configuration file and the default factory function. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
defaults

The CoreConfig instance containing default factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If neither a default factory nor a default configuration can be found (the code raises NoDefaultConfigError).

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_defaults(
    cls, defaults: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Build an instance from the first configuration source that applies.

    The sources are tried in this order:
        1. the entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`,
        2. the factory at `factory_path_override`,
        3. the default factory registered in `defaults` under `cls.configuration_key`,
        4. the default instance configuration registered in `defaults` under `cls.configuration_key`.

    Args:
        defaults: The CoreConfig instance containing default factory and configuration details.
        factory_path_override: The path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: The path to a YAML file containing the instance configuration.

    Returns:
        The instance created from the first source that provides a value.

    Raises:
        NoDefaultConfigError: If none of the sources provides a factory or a configuration.
    """
    if yaml_path_override:
        yaml_config = get_config_from_yaml(yaml_path_override)
        yaml_section = yaml_config.get(cls.configuration_key)
        # A YAML file without an entry for this class falls through to the next sources.
        if yaml_section:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_section))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    registered_factory = defaults.default_factories.get(cls.configuration_key)
    if registered_factory:
        return cls.subclass_from_factory(registered_factory)

    registered_config = defaults.default_instances_config.get(cls.configuration_key)
    if registered_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(registered_config))

    raise NoDefaultConfigError(f"Could not find default factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Create an instance by passing the configuration entries as keyword arguments.

    Args:
        config: A dictionary whose items are forwarded to the class constructor as keyword arguments.

    Returns:
        The newly constructed instance of the class.
    """
    instance = cls(**config)
    return instance

to_document_meta async staticmethod #

to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta

Convert a document, document meta or source to a document meta object.

PARAMETER DESCRIPTION
document

The document to convert.

TYPE: DocumentMeta | Document | Source

RETURNS DESCRIPTION
DocumentMeta

The document meta object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
@staticmethod
async def to_document_meta(document: DocumentMeta | Document | Source) -> DocumentMeta:
    """
    Normalize any of the supported inputs to a document meta object.

    Args:
        document: A source, a document meta or a document.

    Returns:
        The meta created from the source, the input itself when it already is a document meta,
        or the `metadata` attribute of the input otherwise.
    """
    # A source is the only input that requires asynchronous work to obtain its meta.
    if isinstance(document, Source):
        return await DocumentMeta.from_source(document)
    # Anything that is not a document meta is expected to expose it as `metadata`.
    return document if isinstance(document, DocumentMeta) else document.metadata

process_document async #

process_document(document: DocumentMeta | Document | Source, processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process a single document and return the elements.

PARAMETER DESCRIPTION
document

The document to process.

TYPE: DocumentMeta | Document | Source

processor_router

The document processor router to use.

TYPE: DocumentProcessorRouter

processor_overwrite

Forces the use of a specific processor, instead of the one provided by the router.

TYPE: BaseProvider | None DEFAULT: None

RETURNS DESCRIPTION
list[Element]

A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/base.py
async def process_document(
    self,
    document: DocumentMeta | Document | Source,
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Turn a single document into elements.

    The document is first converted with `to_document_meta`, then handed to the selected processor.

    Args:
        document: The document to process.
        processor_router: The router asked for a processor when no overwrite is given.
        processor_overwrite: When provided (and truthy), used instead of the processor chosen by the router.

    Returns:
        The elements returned by the processor.
    """
    meta = await self.to_document_meta(document)
    # The router is consulted only when no overwrite was supplied.
    if processor_overwrite:
        selected = processor_overwrite
    else:
        selected = processor_router.get_provider(meta)
    return await selected.process(meta)

process_documents async #

process_documents(documents: Sequence[DocumentMeta | Document | Source], processor_router: DocumentProcessorRouter, processor_overwrite: BaseProvider | None = None) -> list[Element]

Process documents using the given processor and return the resulting elements.

PARAMETER DESCRIPTION
documents

The documents to process.

TYPE: Sequence[DocumentMeta | Document | Source]

processor_router

The document processor router to use.

TYPE: DocumentProcessorRouter

processor_overwrite

Forces the use of a specific processor, instead of the one provided by the router.

TYPE: BaseProvider | None DEFAULT: None

RETURNS DESCRIPTION
list[Element]

A list of elements.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/processor_strategies/batched.py
async def process_documents(
    self,
    documents: Sequence[DocumentMeta | Document | Source],
    processor_router: DocumentProcessorRouter,
    processor_overwrite: BaseProvider | None = None,
) -> list[Element]:
    """
    Process all documents through `asyncio.gather`, sharing one semaphore sized by `batch_size`.

    Args:
        documents: The documents to process.
        processor_router: The document processor router to use.
        processor_overwrite: Forces the use of a specific processor, instead of the one provided by the router.

    Returns:
        A list of elements.
    """
    # One semaphore, initialised with the batch size, is handed to every per-document call.
    limiter = asyncio.Semaphore(self.batch_size)
    pending = [
        self._process_with_semaphore(limiter, document, processor_router, processor_overwrite)
        for document in documents
    ]
    per_document = await asyncio.gather(*pending)

    # gather() keeps the input order, so the flattened list follows the order of `documents`.
    flattened = []
    for produced in per_document:
        flattened.extend(produced)
    return flattened