Skip to content

Document Processing#

ragbits.document_search.ingestion.document_processor.DocumentProcessorRouter #

DocumentProcessorRouter(providers: ProvidersConfig)

The DocumentProcessorRouter is responsible for routing the document to the correct provider based on the document metadata such as the document type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
def __init__(self, providers: ProvidersConfig):
    """
    Store the providers configuration the router will consult.

    Args:
        providers: The providers configuration, kept on the instance as `_providers`.
    """
    self._providers = providers

from_dict_to_providers_config staticmethod #

from_dict_to_providers_config(dict_config: dict[str, ObjectContructionConfig]) -> ProvidersConfig

Creates ProvidersConfig from dictionary that maps document types to the provider configuration.

PARAMETER DESCRIPTION
dict_config

The dictionary with configuration.

TYPE: dict[str, ObjectContructionConfig]

RETURNS DESCRIPTION
ProvidersConfig

ProvidersConfig object.

RAISES DESCRIPTION
InvalidConfigError

If a provider class can't be found or is not the correct type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
@staticmethod
def from_dict_to_providers_config(dict_config: dict[str, ObjectContructionConfig]) -> ProvidersConfig:
    """
    Build a ProvidersConfig out of a plain dictionary keyed by document type names.

    Every value is turned into a provider through BaseProvider.subclass_from_config and
    stored under the DocumentType created from its key.

    Args:
        dict_config: Mapping from a document type to the construction config of its provider.

    Returns:
        The resulting ProvidersConfig.

    Raises:
        InvalidConfigError: If a provider class can't be found or is not the correct type.
    """
    mapping = {}

    for type_name, provider_config in dict_config.items():
        # Build the provider first, then resolve the key, matching the original evaluation order.
        provider = cast(
            Callable[[], BaseProvider] | BaseProvider,
            BaseProvider.subclass_from_config(provider_config),
        )
        mapping[DocumentType(type_name)] = provider

    return mapping

from_config classmethod #

from_config(providers: ProvidersConfig | None = None) -> DocumentProcessorRouter

Create a DocumentProcessorRouter from a configuration. If the configuration is not provided, the default configuration will be used. If the configuration is provided, it will be merged with the default configuration, overriding the default values for the document types that are defined in the configuration. Example of the configuration: { DocumentType.TXT: YourCustomProviderClass(), DocumentType.PDF: UnstructuredProvider(), }

PARAMETER DESCRIPTION
providers

The dictionary with the providers configuration, mapping the document types to the provider class.

TYPE: ProvidersConfig | None DEFAULT: None

RETURNS DESCRIPTION
DocumentProcessorRouter

The DocumentProcessorRouter.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
@classmethod
def from_config(cls, providers: ProvidersConfig | None = None) -> "DocumentProcessorRouter":
    """
    Create a DocumentProcessorRouter from a configuration.

    The router always starts from a deep copy of DEFAULT_PROVIDERS_CONFIG. When `providers`
    is given, its entries are applied on top of that copy, so they replace the defaults for
    the document types they define and leave the remaining defaults untouched.
    Example of the configuration:
    {
        DocumentType.TXT: YourCustomProviderClass(),
        DocumentType.PDF: UnstructuredProvider(),
    }

    Args:
        providers: The dictionary with the providers configuration, mapping the document types
            to the provider class. None means "use the defaults only".

    Returns:
        The DocumentProcessorRouter.
    """
    merged: MutableMapping[DocumentType, Callable[[], BaseProvider] | BaseProvider] = copy.deepcopy(
        DEFAULT_PROVIDERS_CONFIG
    )
    if providers is not None:
        merged.update(providers)

    return cls(providers=merged)

get_provider #

get_provider(document_meta: DocumentMeta) -> BaseProvider

Get the provider for the document.

PARAMETER DESCRIPTION
document_meta

The document metadata.

TYPE: DocumentMeta

RETURNS DESCRIPTION
BaseProvider

The provider for processing the document.

RAISES DESCRIPTION
ValueError

If no provider is found for the document type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
def get_provider(self, document_meta: DocumentMeta) -> BaseProvider:
    """
    Look up the provider registered for the document's type.

    A registered entry that is already a BaseProvider instance is returned as is;
    any other entry is called with no arguments and the result is returned.

    Args:
        document_meta: The document metadata.

    Returns:
        The provider for processing the document.

    Raises:
        ValueError: If no provider is found for the document type.
    """
    registered = self._providers.get(document_meta.document_type)
    if registered is None:
        raise ValueError(f"No provider found for the document type {document_meta.document_type}")
    if isinstance(registered, BaseProvider):
        return registered
    return registered()

Providers#

ragbits.document_search.ingestion.providers.base.BaseProvider #

Bases: WithConstructionConfig, ABC

A base class for the document processing providers.

default_module class-attribute instance-attribute #

default_module: ClassVar = providers

configuration_key class-attribute instance-attribute #

configuration_key: ClassVar = 'provider'

SUPPORTED_DOCUMENT_TYPES instance-attribute #

SUPPORTED_DOCUMENT_TYPES: set[DocumentType]

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class, so a path that
    # resolves to e.g. a function is rejected here with the documented InvalidConfigError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is called without arguments; its result may be an instance of a subclass.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    build = import_by_path(factory_path, cls.default_module)
    instance = build()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Build an instance according to the project's component preferences.

    The sources are tried in this order and the first one that yields a value is used:
      1. the entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`,
      2. the factory at `factory_path_override`,
      3. the factory registered in `config.component_preference_factories`,
      4. the configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        The instance produced by the first source that matched.

    Raises:
        NoPreferredConfigError: If none of the sources provides a factory or configuration.
    """
    if yaml_path_override:
        yaml_section = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        if yaml_section:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_section))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(project_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Create an instance by passing the configuration entries as keyword arguments.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    instance = cls(**config)
    return instance

process abstractmethod async #

process(document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]

Process the document.

PARAMETER DESCRIPTION
document_meta

The document to process.

TYPE: DocumentMeta

RETURNS DESCRIPTION
Sequence[Element | IntermediateElement]

The list of elements extracted from the document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
@abstractmethod
async def process(self, document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]:
    """
    Process the document.

    This method is abstract: it has no implementation here and must be provided by subclasses.

    Args:
        document_meta: The document to process.

    Returns:
        The list of elements extracted from the document.
    """

validate_document_type #

validate_document_type(document_type: DocumentType) -> None

Check if the provider supports the document type.

PARAMETER DESCRIPTION
document_type

The document type.

TYPE: DocumentType

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
def validate_document_type(self, document_type: DocumentType) -> None:
    """
    Ensure the given document type is listed in SUPPORTED_DOCUMENT_TYPES.

    Args:
        document_type: The document type.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not supported.
    """
    if document_type in self.SUPPORTED_DOCUMENT_TYPES:
        return
    raise DocumentTypeNotSupportedError(provider_name=self.__class__.__name__, document_type=document_type)

ragbits.document_search.ingestion.providers.dummy.DummyProvider #

Bases: BaseProvider

This is a mock provider that returns a TextElement with the content of the document. It should be used for testing purposes only.

default_module class-attribute instance-attribute #

default_module: ClassVar = providers

configuration_key class-attribute instance-attribute #

configuration_key: ClassVar = 'provider'

SUPPORTED_DOCUMENT_TYPES class-attribute instance-attribute #

SUPPORTED_DOCUMENT_TYPES = {TXT, MD}

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class, so a path that
    # resolves to e.g. a function is rejected here with the documented InvalidConfigError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is called without arguments; its result may be an instance of a subclass.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    build = import_by_path(factory_path, cls.default_module)
    instance = build()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Build an instance according to the project's component preferences.

    The sources are tried in this order and the first one that yields a value is used:
      1. the entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`,
      2. the factory at `factory_path_override`,
      3. the factory registered in `config.component_preference_factories`,
      4. the configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        The instance produced by the first source that matched.

    Raises:
        NoPreferredConfigError: If none of the sources provides a factory or configuration.
    """
    if yaml_path_override:
        yaml_section = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        if yaml_section:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_section))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(project_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Create an instance by passing the configuration entries as keyword arguments.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    instance = cls(**config)
    return instance

validate_document_type #

validate_document_type(document_type: DocumentType) -> None

Check if the provider supports the document type.

PARAMETER DESCRIPTION
document_type

The document type.

TYPE: DocumentType

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
def validate_document_type(self, document_type: DocumentType) -> None:
    """
    Ensure the given document type is listed in SUPPORTED_DOCUMENT_TYPES.

    Args:
        document_type: The document type.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not supported.
    """
    if document_type in self.SUPPORTED_DOCUMENT_TYPES:
        return
    raise DocumentTypeNotSupportedError(provider_name=self.__class__.__name__, document_type=document_type)

process async #

process(document_meta: DocumentMeta) -> list[Element | IntermediateElement]

Process the text document.

PARAMETER DESCRIPTION
document_meta

The document to process.

TYPE: DocumentMeta

RETURNS DESCRIPTION
list[Element | IntermediateElement]

List with a single TextElement containing the content of the document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/dummy.py
async def process(self, document_meta: DocumentMeta) -> list[Element | IntermediateElement]:
    """
    Turn a text document into a single TextElement.

    The document type is validated first, then the document is fetched. A fetched
    document that is not a TextDocument produces an empty list.

    Args:
        document_meta: The document to process.

    Returns:
        List with a single TextElement containing the content of the document.
    """
    self.validate_document_type(document_meta.document_type)

    fetched = await document_meta.fetch()
    if not isinstance(fetched, TextDocument):
        return []
    return [TextElement(content=fetched.content, document_meta=document_meta)]

ragbits.document_search.ingestion.providers.unstructured.UnstructuredDefaultProvider #

UnstructuredDefaultProvider(partition_kwargs: dict | None = None, chunking_kwargs: dict | None = None, api_key: str | None = None, api_server: str | None = None, use_api: bool = False, ignore_images: bool = False)

Bases: BaseProvider

A provider that uses the Unstructured API or local SDK to process the documents.

Initialize the UnstructuredDefaultProvider.

PARAMETER DESCRIPTION
partition_kwargs

The additional arguments for the partitioning. Refer to the Unstructured API documentation for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters

TYPE: dict | None DEFAULT: None

chunking_kwargs

The additional arguments for the chunking.

TYPE: dict | None DEFAULT: None

api_key

The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment variable will be used.

TYPE: str | None DEFAULT: None

api_server

The API server URL to use for the Unstructured API. If not specified, the UNSTRUCTURED_SERVER_URL environment variable will be used.

TYPE: str | None DEFAULT: None

use_api

Whether to use the Unstructured API; otherwise the local version of the Unstructured library is used.

TYPE: bool DEFAULT: False

ignore_images

If True, images will be skipped.

TYPE: bool DEFAULT: False

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py
def __init__(
    self,
    partition_kwargs: dict | None = None,
    chunking_kwargs: dict | None = None,
    api_key: str | None = None,
    api_server: str | None = None,
    use_api: bool = False,
    ignore_images: bool = False,
) -> None:
    """Set up the UnstructuredDefaultProvider.

    Args:
        partition_kwargs: The additional arguments for the partitioning. Refer to the Unstructured API documentation
            for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters
            A falsy value (None or an empty dict) falls back to DEFAULT_PARTITION_KWARGS.
        chunking_kwargs: The additional arguments for the chunking. A falsy value falls back to
            DEFAULT_CHUNKING_KWARGS.
        api_key: The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment
            variable will be used.
        api_server: The API server URL to use for the Unstructured API. If not specified, the
            UNSTRUCTURED_SERVER_URL environment variable will be used.
        use_api: Whether to use the Unstructured API; otherwise the local version of the Unstructured library
            is used.
        ignore_images: If True, images will be skipped.
    """
    # The client starts unset; nothing is created in the constructor.
    self._client: UnstructuredClient | None = None

    # Connection-related settings.
    self.use_api = use_api
    self.api_key = api_key
    self.api_server = api_server

    # Processing options; falsy kwargs are replaced by the module defaults.
    self.partition_kwargs = partition_kwargs if partition_kwargs else DEFAULT_PARTITION_KWARGS
    self.chunking_kwargs = chunking_kwargs if chunking_kwargs else DEFAULT_CHUNKING_KWARGS
    self.ignore_images = ignore_images

default_module class-attribute instance-attribute #

default_module: ClassVar = providers

configuration_key class-attribute instance-attribute #

configuration_key: ClassVar = 'provider'

SUPPORTED_DOCUMENT_TYPES class-attribute instance-attribute #

SUPPORTED_DOCUMENT_TYPES = {TXT, MD, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, HTML, EPUB, ORG, ODT, RST, RTF, TSV, XML}

partition_kwargs instance-attribute #

partition_kwargs = partition_kwargs or DEFAULT_PARTITION_KWARGS

chunking_kwargs instance-attribute #

chunking_kwargs = chunking_kwargs or DEFAULT_CHUNKING_KWARGS

api_key instance-attribute #

api_key = api_key

api_server instance-attribute #

api_server = api_server

use_api instance-attribute #

use_api = use_api

ignore_images instance-attribute #

ignore_images = ignore_images

client property #

client: UnstructuredClient

Get the UnstructuredClient instance. If the client is not initialized, it will be created.

RETURNS DESCRIPTION
UnstructuredClient

The UnstructuredClient instance.

RAISES DESCRIPTION
ValueError

If the UNSTRUCTURED_API_KEY environment variable is not set.

ValueError

If the UNSTRUCTURED_SERVER_URL environment variable is not set.

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class, so a path that
    # resolves to e.g. a function is rejected here with the documented InvalidConfigError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is called without arguments; its result may be an instance of a subclass.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    build = import_by_path(factory_path, cls.default_module)
    instance = build()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Build an instance according to the project's component preferences.

    The sources are tried in this order and the first one that yields a value is used:
      1. the entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`,
      2. the factory at `factory_path_override`,
      3. the factory registered in `config.component_preference_factories`,
      4. the configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        The instance produced by the first source that matched.

    Raises:
        NoPreferredConfigError: If none of the sources provides a factory or configuration.
    """
    if yaml_path_override:
        yaml_section = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        if yaml_section:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_section))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(project_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Create an instance by passing the configuration entries as keyword arguments.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    instance = cls(**config)
    return instance

validate_document_type #

validate_document_type(document_type: DocumentType) -> None

Check if the provider supports the document type.

PARAMETER DESCRIPTION
document_type

The document type.

TYPE: DocumentType

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
def validate_document_type(self, document_type: DocumentType) -> None:
    """
    Ensure that this provider is able to handle the given document type.

    Args:
        document_type: The document type to check against ``SUPPORTED_DOCUMENT_TYPES``.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not among the supported ones.
    """
    # Supported type: nothing to report.
    if document_type in self.SUPPORTED_DOCUMENT_TYPES:
        return

    raise DocumentTypeNotSupportedError(provider_name=self.__class__.__name__, document_type=document_type)

process async #

process(document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]

Process the document using the Unstructured API.

PARAMETER DESCRIPTION
document_meta

The document to process.

TYPE: DocumentMeta

RETURNS DESCRIPTION
Sequence[Element | IntermediateElement]

The list of elements extracted from the document.

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py
async def process(self, document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]:
    """
    Process the document with Unstructured, either through the remote API client
    (when ``use_api`` is set) or through the local ``partition`` function.

    Args:
        document_meta: The document to process.

    Returns:
        The list of elements extracted from the document.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not supported.
    """
    # NOTE(review): the raw api_key is handed to trace() here - confirm the tracing
    # backend redacts or drops it before exporting.
    with trace(
        partition_arg=self.partition_kwargs,
        chunking_arg=self.chunking_kwargs,
        api_server=self.api_server,
        api_key=self.api_key,
        ignore_images=self.ignore_images,
    ) as outputs:
        # Reject unsupported types before fetching the document.
        self.validate_document_type(document_meta.document_type)
        document = await document_meta.fetch()

        if self.use_api:
            res = await self.client.general.partition_async(
                request={
                    "partition_parameters": {
                        "files": {
                            "content": document.local_path.read_bytes(),
                            "file_name": document.local_path.name,
                        },
                        # Listed before the user kwargs, so partition_kwargs may override it.
                        "coordinates": True,
                        **self.partition_kwargs,
                    }
                }
            )
            # The API response carries element dicts; convert them to element objects.
            elements = elements_from_dicts(res.elements)  # type: ignore
        else:
            # Local processing: partition the file contents from an in-memory buffer.
            elements = partition(
                file=BytesIO(document.local_path.read_bytes()),
                metadata_filename=document.local_path.name,
                **self.partition_kwargs,
            )

        # Store the final result on the trace outputs and return the same object.
        outputs.results = await self._chunk_and_convert(elements, document_meta, document.local_path)
        return outputs.results

ragbits.document_search.ingestion.providers.unstructured.UnstructuredImageProvider #

UnstructuredImageProvider(partition_kwargs: dict | None = None, chunking_kwargs: dict | None = None, api_key: str | None = None, api_server: str | None = None, use_api: bool = False)

Bases: UnstructuredDefaultProvider

A specialized provider that handles PNG and JPG images using Unstructured.

Initialize the UnstructuredImageProvider.

PARAMETER DESCRIPTION
partition_kwargs

The additional arguments for the partitioning. Refer to the Unstructured API documentation for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters

TYPE: dict | None DEFAULT: None

chunking_kwargs

The additional arguments for the chunking.

TYPE: dict | None DEFAULT: None

api_key

The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment variable will be used.

TYPE: str | None DEFAULT: None

api_server

The API server URL to use for the Unstructured API. If not specified, the UNSTRUCTURED_SERVER_URL environment variable will be used.

TYPE: str | None DEFAULT: None

use_api

Whether to use the Unstructured API. If False, the provider will only use the local processing.

TYPE: bool DEFAULT: False

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py
def __init__(
    self,
    partition_kwargs: dict | None = None,
    chunking_kwargs: dict | None = None,
    api_key: str | None = None,
    api_server: str | None = None,
    use_api: bool = False,
) -> None:
    """Initialize the UnstructuredImageProvider.

    All arguments are forwarded positionally to the parent class initializer.

    Args:
        partition_kwargs: The additional arguments for the partitioning. Refer to the Unstructured API documentation
            for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters
        chunking_kwargs: The additional arguments for the chunking.
        api_key: The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment
            variable will be used.
        api_server: The API server URL to use for the Unstructured API. If not specified, the
            UNSTRUCTURED_SERVER_URL environment variable will be used.
        use_api: Whether to use the Unstructured API. If False, the provider will only use the local processing.
    """
    super().__init__(partition_kwargs, chunking_kwargs, api_key, api_server, use_api)

default_module class-attribute instance-attribute #

default_module: ClassVar = providers

configuration_key class-attribute instance-attribute #

configuration_key: ClassVar = 'provider'

partition_kwargs instance-attribute #

partition_kwargs = partition_kwargs or DEFAULT_PARTITION_KWARGS

chunking_kwargs instance-attribute #

chunking_kwargs = chunking_kwargs or DEFAULT_CHUNKING_KWARGS

api_key instance-attribute #

api_key = api_key

api_server instance-attribute #

api_server = api_server

use_api instance-attribute #

use_api = use_api

ignore_images instance-attribute #

ignore_images = ignore_images

client property #

client: UnstructuredClient

Get the UnstructuredClient instance. If the client is not initialized, it will be created.

RETURNS DESCRIPTION
UnstructuredClient

The UnstructuredClient instance.

RAISES DESCRIPTION
ValueError

If the UNSTRUCTURED_API_KEY environment variable is not set.

ValueError

If the UNSTRUCTURED_SERVER_URL environment variable is not set.

SUPPORTED_DOCUMENT_TYPES class-attribute instance-attribute #

SUPPORTED_DOCUMENT_TYPES = {JPG, PNG}

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Instantiate the class named in the configuration, which may be this class or one of its subclasses.

    Args:
        config: Construction config; ``config.type`` names the class to import (resolved relative to
            ``cls.default_module``) and ``config.config`` is passed to that class's ``from_config``.

    Returns:
        An instance created by the resolved class's ``from_config``.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    resolved = import_by_path(config.type, cls.default_module)
    if issubclass(resolved, cls):
        return resolved.from_config(config.config)

    raise InvalidConfigError(f"{resolved} is not a subclass of {cls}")

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by importing a factory function and calling it without arguments.

    Args:
        factory_path: Path to the factory function, in the format of
            "module.submodule:factory_name" (resolved relative to ``cls.default_module``).

    Returns:
        The object produced by the factory, which may be an instance of a subclass.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    instance = import_by_path(factory_path, cls.default_module)()
    if isinstance(instance, cls):
        return instance

    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance according to the project's component preferences.

    Sources are tried in this order, and the first one that yields a value wins:
    the YAML override (if it has an entry under ``cls.configuration_key``), the factory
    override, the preferred factory from ``config``, and finally the preferred instance
    configuration from ``config``.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file containing the instance configuration.

    Returns:
        The instance built from the first matching source.

    Raises:
        NoPreferredConfigError: If none of the sources provides a factory or configuration.
    """
    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        # A YAML file without an entry for this component falls through to the other sources.
        if yaml_entry:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_instance_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_instance_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(project_instance_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Build an instance of the class from a plain configuration dictionary.

    Args:
        config: Dictionary whose items are unpacked as keyword arguments to the class constructor.

    Returns:
        The newly constructed instance.
    """
    instance = cls(**config)
    return instance

process async #

process(document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]

Process the document using the Unstructured API.

PARAMETER DESCRIPTION
document_meta

The document to process.

TYPE: DocumentMeta

RETURNS DESCRIPTION
Sequence[Element | IntermediateElement]

The list of elements extracted from the document.

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py
async def process(self, document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]:
    """
    Process the document with Unstructured, either through the remote API client
    (when ``use_api`` is set) or through the local ``partition`` function.

    Args:
        document_meta: The document to process.

    Returns:
        The list of elements extracted from the document.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not supported.
    """
    # NOTE(review): the raw api_key is handed to trace() here - confirm the tracing
    # backend redacts or drops it before exporting.
    with trace(
        partition_arg=self.partition_kwargs,
        chunking_arg=self.chunking_kwargs,
        api_server=self.api_server,
        api_key=self.api_key,
        ignore_images=self.ignore_images,
    ) as outputs:
        # Reject unsupported types before fetching the document.
        self.validate_document_type(document_meta.document_type)
        document = await document_meta.fetch()

        if self.use_api:
            res = await self.client.general.partition_async(
                request={
                    "partition_parameters": {
                        "files": {
                            "content": document.local_path.read_bytes(),
                            "file_name": document.local_path.name,
                        },
                        # Listed before the user kwargs, so partition_kwargs may override it.
                        "coordinates": True,
                        **self.partition_kwargs,
                    }
                }
            )
            # The API response carries element dicts; convert them to element objects.
            elements = elements_from_dicts(res.elements)  # type: ignore
        else:
            # Local processing: partition the file contents from an in-memory buffer.
            elements = partition(
                file=BytesIO(document.local_path.read_bytes()),
                metadata_filename=document.local_path.name,
                **self.partition_kwargs,
            )

        # Store the final result on the trace outputs and return the same object.
        outputs.results = await self._chunk_and_convert(elements, document_meta, document.local_path)
        return outputs.results

validate_document_type #

validate_document_type(document_type: DocumentType) -> None

Check if the provider supports the document type.

PARAMETER DESCRIPTION
document_type

The document type.

TYPE: DocumentType

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
def validate_document_type(self, document_type: DocumentType) -> None:
    """
    Ensure that this provider is able to handle the given document type.

    Args:
        document_type: The document type to check against ``SUPPORTED_DOCUMENT_TYPES``.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not among the supported ones.
    """
    # Supported type: nothing to report.
    if document_type in self.SUPPORTED_DOCUMENT_TYPES:
        return

    raise DocumentTypeNotSupportedError(provider_name=self.__class__.__name__, document_type=document_type)

ragbits.document_search.ingestion.providers.unstructured.UnstructuredPdfProvider #

UnstructuredPdfProvider(partition_kwargs: dict | None = None, chunking_kwargs: dict | None = None, api_key: str | None = None, api_server: str | None = None, use_api: bool = False)

Bases: UnstructuredImageProvider

A specialized provider that handles PDFs using Unstructured.

Initialize the UnstructuredPdfProvider.

PARAMETER DESCRIPTION
partition_kwargs

The additional arguments for the partitioning. Refer to the Unstructured API documentation for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters

TYPE: dict | None DEFAULT: None

chunking_kwargs

The additional arguments for the chunking.

TYPE: dict | None DEFAULT: None

api_key

The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment variable will be used.

TYPE: str | None DEFAULT: None

api_server

The API server URL to use for the Unstructured API. If not specified, the UNSTRUCTURED_SERVER_URL environment variable will be used.

TYPE: str | None DEFAULT: None

use_api

Whether to use the Unstructured API. If False, the provider will only use the local processing.

TYPE: bool DEFAULT: False

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/images.py
def __init__(
    self,
    partition_kwargs: dict | None = None,
    chunking_kwargs: dict | None = None,
    api_key: str | None = None,
    api_server: str | None = None,
    use_api: bool = False,
) -> None:
    """Initialize the provider.

    All arguments are forwarded positionally to the parent class initializer.

    Args:
        partition_kwargs: The additional arguments for the partitioning. Refer to the Unstructured API documentation
            for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters
        chunking_kwargs: The additional arguments for the chunking.
        api_key: The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment
            variable will be used.
        api_server: The API server URL to use for the Unstructured API. If not specified, the
            UNSTRUCTURED_SERVER_URL environment variable will be used.
        use_api: Whether to use the Unstructured API. If False, the provider will only use the local processing.
    """
    super().__init__(partition_kwargs, chunking_kwargs, api_key, api_server, use_api)

default_module class-attribute instance-attribute #

default_module: ClassVar = providers

configuration_key class-attribute instance-attribute #

configuration_key: ClassVar = 'provider'

partition_kwargs instance-attribute #

partition_kwargs = partition_kwargs or DEFAULT_PARTITION_KWARGS

chunking_kwargs instance-attribute #

chunking_kwargs = chunking_kwargs or DEFAULT_CHUNKING_KWARGS

api_key instance-attribute #

api_key = api_key

api_server instance-attribute #

api_server = api_server

use_api instance-attribute #

use_api = use_api

ignore_images instance-attribute #

ignore_images = ignore_images

client property #

client: UnstructuredClient

Get the UnstructuredClient instance. If the client is not initialized, it will be created.

RETURNS DESCRIPTION
UnstructuredClient

The UnstructuredClient instance.

RAISES DESCRIPTION
ValueError

If the UNSTRUCTURED_API_KEY environment variable is not set.

ValueError

If the UNSTRUCTURED_SERVER_URL environment variable is not set.

SUPPORTED_DOCUMENT_TYPES class-attribute instance-attribute #

SUPPORTED_DOCUMENT_TYPES = {PDF}

subclass_from_config classmethod #

subclass_from_config(config: ObjectContructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectContructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectContructionConfig) -> Self:
    """
    Instantiate the class named in the configuration, which may be this class or one of its subclasses.

    Args:
        config: Construction config; ``config.type`` names the class to import (resolved relative to
            ``cls.default_module``) and ``config.config`` is passed to that class's ``from_config``.

    Returns:
        An instance created by the resolved class's ``from_config``.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    resolved = import_by_path(config.type, cls.default_module)
    if issubclass(resolved, cls):
        return resolved.from_config(config.config)

    raise InvalidConfigError(f"{resolved} is not a subclass of {cls}")

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by importing a factory function and calling it without arguments.

    Args:
        factory_path: Path to the factory function, in the format of
            "module.submodule:factory_name" (resolved relative to ``cls.default_module``).

    Returns:
        The object produced by the factory, which may be an instance of a subclass.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    instance = import_by_path(factory_path, cls.default_module)()
    if isinstance(instance, cls):
        return instance

    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance according to the project's component preferences.

    Sources are tried in this order, and the first one that yields a value wins:
    the YAML override (if it has an entry under ``cls.configuration_key``), the factory
    override, the preferred factory from ``config``, and finally the preferred instance
    configuration from ``config``.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file containing the instance configuration.

    Returns:
        The instance built from the first matching source.

    Raises:
        NoPreferredConfigError: If none of the sources provides a factory or configuration.
    """
    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(cls.configuration_key)
        # A YAML file without an entry for this component falls through to the other sources.
        if yaml_entry:
            return cls.subclass_from_config(ObjectContructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_instance_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_instance_config:
        return cls.subclass_from_config(ObjectContructionConfig.model_validate(project_instance_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Build an instance of the class from a plain configuration dictionary.

    Args:
        config: Dictionary whose items are unpacked as keyword arguments to the class constructor.

    Returns:
        The newly constructed instance.
    """
    instance = cls(**config)
    return instance

process async #

process(document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]

Process the document using the Unstructured API.

PARAMETER DESCRIPTION
document_meta

The document to process.

TYPE: DocumentMeta

RETURNS DESCRIPTION
Sequence[Element | IntermediateElement]

The list of elements extracted from the document.

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/unstructured/default.py
async def process(self, document_meta: DocumentMeta) -> Sequence[Element | IntermediateElement]:
    """
    Process the document with Unstructured, either through the remote API client
    (when ``use_api`` is set) or through the local ``partition`` function.

    Args:
        document_meta: The document to process.

    Returns:
        The list of elements extracted from the document.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not supported.
    """
    # NOTE(review): the raw api_key is handed to trace() here - confirm the tracing
    # backend redacts or drops it before exporting.
    with trace(
        partition_arg=self.partition_kwargs,
        chunking_arg=self.chunking_kwargs,
        api_server=self.api_server,
        api_key=self.api_key,
        ignore_images=self.ignore_images,
    ) as outputs:
        # Reject unsupported types before fetching the document.
        self.validate_document_type(document_meta.document_type)
        document = await document_meta.fetch()

        if self.use_api:
            res = await self.client.general.partition_async(
                request={
                    "partition_parameters": {
                        "files": {
                            "content": document.local_path.read_bytes(),
                            "file_name": document.local_path.name,
                        },
                        # Listed before the user kwargs, so partition_kwargs may override it.
                        "coordinates": True,
                        **self.partition_kwargs,
                    }
                }
            )
            # The API response carries element dicts; convert them to element objects.
            elements = elements_from_dicts(res.elements)  # type: ignore
        else:
            # Local processing: partition the file contents from an in-memory buffer.
            elements = partition(
                file=BytesIO(document.local_path.read_bytes()),
                metadata_filename=document.local_path.name,
                **self.partition_kwargs,
            )

        # Store the final result on the trace outputs and return the same object.
        outputs.results = await self._chunk_and_convert(elements, document_meta, document.local_path)
        return outputs.results

validate_document_type #

validate_document_type(document_type: DocumentType) -> None

Check if the provider supports the document type.

PARAMETER DESCRIPTION
document_type

The document type.

TYPE: DocumentType

RAISES DESCRIPTION
DocumentTypeNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/providers/base.py
def validate_document_type(self, document_type: DocumentType) -> None:
    """
    Ensure that this provider is able to handle the given document type.

    Args:
        document_type: The document type to check against ``SUPPORTED_DOCUMENT_TYPES``.

    Raises:
        DocumentTypeNotSupportedError: If the document type is not among the supported ones.
    """
    # Supported type: nothing to report.
    if document_type in self.SUPPORTED_DOCUMENT_TYPES:
        return

    raise DocumentTypeNotSupportedError(provider_name=self.__class__.__name__, document_type=document_type)