Skip to content

Document Processing#

ragbits.document_search.ingestion.document_processor.DocumentProcessorRouter #

DocumentProcessorRouter(providers: dict[DocumentType, BaseProvider])

The DocumentProcessorRouter is responsible for routing the document to the correct provider based on the document metadata such as the document type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
def __init__(self, providers: dict[DocumentType, BaseProvider]):
    """
    Create a router backed by the given provider mapping.

    Args:
        providers: Mapping from document type to the provider that handles it. The mapping is stored
            as-is (no copy is made) and later consulted by get_provider.
    """
    self._providers = providers

from_dict_to_providers_config staticmethod #

from_dict_to_providers_config(dict_config: dict) -> ProvidersConfig

Creates ProvidersConfig from dictionary config. Example of the dictionary config: { "txt": { "type": "UnstructuredProvider" } }

PARAMETER DESCRIPTION
dict_config

The dictionary with configuration.

TYPE: dict

RETURNS DESCRIPTION
ProvidersConfig

ProvidersConfig object.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
@staticmethod
def from_dict_to_providers_config(dict_config: dict) -> ProvidersConfig:
    """
    Build a ProvidersConfig out of a plain dictionary.

    Every key is converted with DocumentType(...) and every value is handed to
    get_provider(...) to obtain the provider registered under that key.

    Example of the dictionary config:
    {
        "txt": {
            "type": "UnstructuredProvider"
        }
    }

    Args:
        dict_config: The dictionary with configuration, keyed by document type.

    Returns:
        ProvidersConfig object.
    """
    resolved = {}

    for type_name, provider_settings in dict_config.items():
        # The provider is resolved first; only then is the key converted to a DocumentType.
        provider = get_provider(provider_settings)
        resolved[DocumentType(type_name)] = provider

    return resolved

from_config classmethod #

from_config(providers_config: ProvidersConfig | None = None) -> DocumentProcessorRouter

Create a DocumentProcessorRouter from a configuration. If the configuration is not provided, the default configuration will be used. If the configuration is provided, it will be merged with the default configuration, overriding the default values for the document types that are defined in the configuration. Example of the configuration: { DocumentType.TXT: YourCustomProviderClass(), DocumentType.PDF: UnstructuredProvider(), }

PARAMETER DESCRIPTION
providers_config

The dictionary with the providers configuration, mapping the document types to the provider class.

TYPE: ProvidersConfig | None DEFAULT: None

RETURNS DESCRIPTION
DocumentProcessorRouter

The DocumentProcessorRouter.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
@classmethod
def from_config(cls, providers_config: ProvidersConfig | None = None) -> "DocumentProcessorRouter":
    """
    Build a DocumentProcessorRouter, optionally overriding the default providers.

    The router always starts from a deep copy of DEFAULT_PROVIDERS_CONFIG, so the module-level defaults
    are never mutated. When a configuration is passed, its entries are merged on top of that copy and
    replace the defaults for the document types they define.
    Example of the configuration:
    {
        DocumentType.TXT: YourCustomProviderClass(),
        DocumentType.PDF: UnstructuredProvider(),
    }

    Args:
        providers_config: The dictionary with the providers configuration, mapping the document types to
         the providers. When None, only the defaults are used.

    Returns:
        The DocumentProcessorRouter.
    """
    merged = copy.deepcopy(DEFAULT_PROVIDERS_CONFIG)
    if providers_config is not None:
        # Caller-supplied entries win over the defaults for the same document type.
        merged.update(providers_config)

    return cls(providers=merged)

get_provider #

get_provider(document_meta: DocumentMeta) -> BaseProvider

Get the provider for the document.

PARAMETER DESCRIPTION
document_meta

The document metadata.

TYPE: DocumentMeta

RETURNS DESCRIPTION
BaseProvider

The provider for processing the document.

RAISES DESCRIPTION
ValueError

If no provider is found for the document type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/document_processor.py
def get_provider(self, document_meta: DocumentMeta) -> BaseProvider:
    """
    Look up the provider registered for the document's type.

    Args:
        document_meta: The document metadata; its document_type is used as the lookup key.

    Returns:
        The provider for processing the document.

    Raises:
        ValueError: If no provider is found for the document type.
    """
    provider = self._providers.get(document_meta.document_type)
    if provider is not None:
        return provider

    # A missing key and an entry explicitly set to None are both treated as "no provider".
    raise ValueError(f"No provider found for the document type {document_meta.document_type}")