Skip to content

Document Parsers#

ragbits.document_search.ingestion.parsers.router.DocumentParserRouter #

DocumentParserRouter(parsers: Mapping[DocumentType, DocumentParser] | None = None)

Bases: WithConstructionConfig

The class responsible for routing the document to the correct parser based on the document type.

Initialize the DocumentParserRouter instance.

PARAMETER DESCRIPTION
parsers

The mapping of document types to their parsers. Use it to override the default Unstructured parsers.

TYPE: Mapping[DocumentType, DocumentParser] | None DEFAULT: None

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py
def __init__(self, parsers: Mapping[DocumentType, DocumentParser] | None = None) -> None:
    """
    Initialize the DocumentParserRouter instance.

    Args:
        parsers: The mapping of document types to their parsers. Entries given here take
            precedence over the entries of `_DEFAULT_PARSERS` for the same document type.
    """
    # Always build a fresh dict so the router never aliases the module-level
    # `_DEFAULT_PARSERS` mapping: a later change to `self._parsers` must not leak into
    # the defaults shared by every other router instance.
    self._parsers = {**_DEFAULT_PARSERS, **(parsers or {})}

default_module class-attribute #

default_module: ModuleType | None = None

configuration_key class-attribute #

configuration_key: str = 'parser_router'

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The target class is resolved from `config.type` with `import_by_path` (using
    `cls.default_module`), and `config.config` is handed to its `from_config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # `issubclass` raises TypeError when its first argument is not a class (for example
    # when the path resolves to a function), so verify that first and report the problem
    # as a configuration error instead of leaking the TypeError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling the factory function found under the given path. The result
    may be an instance of a subclass of this class, depending on what the factory returns.

    Args:
        factory_path: Path to the factory function, written as "module.submodule:factory_name".
            It is resolved with `import_by_path` using `cls.default_module`.

    Returns:
        The object produced by calling the factory without arguments.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance according to the project's component preferences.

    Sources are consulted in this order and the first one that applies wins:

    1. `yaml_path_override`, when the YAML file has an entry under `cls.configuration_key`.
    2. `factory_path_override`.
    3. The factory registered in `config.component_preference_factories`.
    4. The configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the preferred instances configuration.

    Returns:
        The instance created by `subclass_from_config` or `subclass_from_factory`.

    Raises:
        NoPreferredConfigError: If none of the sources above provides a factory or
            a configuration for `cls.configuration_key`.
    """
    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(cls.configuration_key)
        if yaml_entry:
            construction_config = ObjectConstructionConfig.model_validate(yaml_entry)
            return cls.subclass_from_config(construction_config)

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        construction_config = ObjectConstructionConfig.model_validate(project_config)
        return cls.subclass_from_config(construction_config)

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict[str, ObjectConstructionConfig]) -> Self

Initialize the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict[str, ObjectConstructionConfig]

RETURNS DESCRIPTION
Self

The DocumentParserRouter.

RAISES DESCRIPTION
InvalidConfigError

If any of the provided parsers cannot be initialized.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py
@classmethod
def from_config(cls, config: dict[str, ObjectConstructionConfig]) -> Self:
    """
    Builds the router from a configuration dictionary.

    Each key is converted to a `DocumentType` and each value is turned into a parser with
    `DocumentParser.subclass_from_config`; the resulting mapping is passed to the parent
    `from_config` as the `parsers` argument.

    Args:
        config: A dictionary mapping document type names to parser construction configs.

    Returns:
        The DocumentParserRouter.

    Raises:
        InvalidConfigError: If any of the provided parsers cannot be initialized.
    """
    parsers = {}
    for type_name, parser_config in config.items():
        # Build the key before the parser, matching the evaluation order of a dict comprehension.
        document_type = DocumentType(type_name)
        parsers[document_type] = DocumentParser.subclass_from_config(parser_config)
    return super().from_config({"parsers": parsers})

get #

get(document_type: DocumentType) -> DocumentParser

Get the parser for the document.

PARAMETER DESCRIPTION
document_type

The document type.

TYPE: DocumentType

RETURNS DESCRIPTION
DocumentParser

The parser for processing the document.

RAISES DESCRIPTION
ParserNotFoundError

If no parser is found for the document type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/router.py
def get(self, document_type: DocumentType) -> DocumentParser:
    """
    Look up the parser registered for the given document type.

    Args:
        document_type: The document type.

    Returns:
        The parser for processing the document.

    Raises:
        ParserNotFoundError: If no parser is found for the document type, or the registered
            value is not a DocumentParser instance.
    """
    candidate = self._parsers.get(document_type)
    if not isinstance(candidate, DocumentParser):
        raise ParserNotFoundError(document_type)
    return candidate

ragbits.document_search.ingestion.parsers.base.DocumentParser #

Bases: WithConstructionConfig, ABC

Base class for document parsers, responsible for converting the document into a list of elements.

default_module class-attribute #

default_module: ModuleType | None = parsers

configuration_key class-attribute #

configuration_key: str = 'parser'

supported_document_types class-attribute instance-attribute #

supported_document_types: set[DocumentType] = set()

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The target class is resolved from `config.type` with `import_by_path` (using
    `cls.default_module`), and `config.config` is handed to its `from_config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # `issubclass` raises TypeError when its first argument is not a class (for example
    # when the path resolves to a function), so verify that first and report the problem
    # as a configuration error instead of leaking the TypeError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling the factory function found under the given path. The result
    may be an instance of a subclass of this class, depending on what the factory returns.

    Args:
        factory_path: Path to the factory function, written as "module.submodule:factory_name".
            It is resolved with `import_by_path` using `cls.default_module`.

    Returns:
        The object produced by calling the factory without arguments.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance according to the project's component preferences.

    Sources are consulted in this order and the first one that applies wins:

    1. `yaml_path_override`, when the YAML file has an entry under `cls.configuration_key`.
    2. `factory_path_override`.
    3. The factory registered in `config.component_preference_factories`.
    4. The configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the preferred instances configuration.

    Returns:
        The instance created by `subclass_from_config` or `subclass_from_factory`.

    Raises:
        NoPreferredConfigError: If none of the sources above provides a factory or
            a configuration for `cls.configuration_key`.
    """
    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(cls.configuration_key)
        if yaml_entry:
            construction_config = ObjectConstructionConfig.model_validate(yaml_entry)
            return cls.subclass_from_config(construction_config)

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        construction_config = ObjectConstructionConfig.model_validate(project_config)
        return cls.subclass_from_config(construction_config)

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

parse abstractmethod async #

parse(document: Document) -> list[Element]

Parse the document.

PARAMETER DESCRIPTION
document

The document to parse.

TYPE: Document

RETURNS DESCRIPTION
list[Element]

The list of elements extracted from the document.

RAISES DESCRIPTION
ParserError

If the parsing of the document failed.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
@abstractmethod
async def parse(self, document: Document) -> list[Element]:
    """
    Parse the document.

    This is an abstract coroutine without a default implementation.

    Args:
        document: The document to parse.

    Returns:
        The list of elements extracted from the document.

    Raises:
        ParserError: If the parsing of the document failed.
    """

validate_document_type classmethod #

validate_document_type(document_type: DocumentType) -> None

Check if the parser supports the document type.

PARAMETER DESCRIPTION
document_type

The document type to validate against the parser.

TYPE: DocumentType

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
@classmethod
def validate_document_type(cls, document_type: DocumentType) -> None:
    """
    Verify that the given document type belongs to `cls.supported_document_types`.

    Args:
        document_type: The document type to validate against the parser.

    Raises:
        ParserDocumentNotSupportedError: If the document type is not supported.
    """
    if document_type in cls.supported_document_types:
        return
    raise ParserDocumentNotSupportedError(parser_name=cls.__name__, document_type=document_type)

ragbits.document_search.ingestion.parsers.base.TextDocumentParser #

Bases: DocumentParser

Simple parser that maps a text to the text element.

default_module class-attribute #

default_module: ModuleType | None = parsers

configuration_key class-attribute #

configuration_key: str = 'parser'

supported_document_types class-attribute instance-attribute #

supported_document_types = {TXT, MD}

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The target class is resolved from `config.type` with `import_by_path` (using
    `cls.default_module`), and `config.config` is handed to its `from_config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # `issubclass` raises TypeError when its first argument is not a class (for example
    # when the path resolves to a function), so verify that first and report the problem
    # as a configuration error instead of leaking the TypeError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling the factory function found under the given path. The result
    may be an instance of a subclass of this class, depending on what the factory returns.

    Args:
        factory_path: Path to the factory function, written as "module.submodule:factory_name".
            It is resolved with `import_by_path` using `cls.default_module`.

    Returns:
        The object produced by calling the factory without arguments.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance according to the project's component preferences.

    Sources are consulted in this order and the first one that applies wins:

    1. `yaml_path_override`, when the YAML file has an entry under `cls.configuration_key`.
    2. `factory_path_override`.
    3. The factory registered in `config.component_preference_factories`.
    4. The configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the preferred instances configuration.

    Returns:
        The instance created by `subclass_from_config` or `subclass_from_factory`.

    Raises:
        NoPreferredConfigError: If none of the sources above provides a factory or
            a configuration for `cls.configuration_key`.
    """
    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(cls.configuration_key)
        if yaml_entry:
            construction_config = ObjectConstructionConfig.model_validate(yaml_entry)
            return cls.subclass_from_config(construction_config)

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        construction_config = ObjectConstructionConfig.model_validate(project_config)
        return cls.subclass_from_config(construction_config)

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

validate_document_type classmethod #

validate_document_type(document_type: DocumentType) -> None

Check if the parser supports the document type.

PARAMETER DESCRIPTION
document_type

The document type to validate against the parser.

TYPE: DocumentType

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
@classmethod
def validate_document_type(cls, document_type: DocumentType) -> None:
    """
    Verify that the given document type belongs to `cls.supported_document_types`.

    Args:
        document_type: The document type to validate against the parser.

    Raises:
        ParserDocumentNotSupportedError: If the document type is not supported.
    """
    if document_type in cls.supported_document_types:
        return
    raise ParserDocumentNotSupportedError(parser_name=cls.__name__, document_type=document_type)

parse async #

parse(document: Document) -> list[Element]

Parse the document.

PARAMETER DESCRIPTION
document

The document to parse.

TYPE: Document

RETURNS DESCRIPTION
list[Element]

List with a text element with the text content.

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported by the parser.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
async def parse(self, document: Document) -> list[Element]:
    """
    Parse the document.

    Validates the document type, then reads the whole file at `document.local_path`
    and wraps its content in a single text element.

    Args:
        document: The document to parse.

    Returns:
        List with a text element with the text content.

    Raises:
        ParserDocumentNotSupportedError: If the document type is not supported by the parser.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    self.validate_document_type(document.metadata.document_type)
    # Decode explicitly as UTF-8: without an encoding `read_text` falls back to the
    # locale-dependent default, so the same file could be read differently across machines.
    # NOTE(review): assumes `local_path` is a pathlib.Path (or accepts `encoding=`) - confirm.
    text = document.local_path.read_text(encoding="utf-8")
    return [TextElement(content=text, document_meta=document.metadata)]

ragbits.document_search.ingestion.parsers.base.ImageDocumentParser #

Bases: DocumentParser

Simple parser that maps an image to the image element.

default_module class-attribute #

default_module: ModuleType | None = parsers

configuration_key class-attribute #

configuration_key: str = 'parser'

supported_document_types class-attribute instance-attribute #

supported_document_types = {JPG, PNG}

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    The target class is resolved from `config.type` with `import_by_path` (using
    `cls.default_module`), and `config.config` is handed to its `from_config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # `issubclass` raises TypeError when its first argument is not a class (for example
    # when the path resolves to a function), so verify that first and report the problem
    # as a configuration error instead of leaking the TypeError.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling the factory function found under the given path. The result
    may be an instance of a subclass of this class, depending on what the factory returns.

    Args:
        factory_path: Path to the factory function, written as "module.submodule:factory_name".
            It is resolved with `import_by_path` using `cls.default_module`.

    Returns:
        The object produced by calling the factory without arguments.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance according to the project's component preferences.

    Sources are consulted in this order and the first one that applies wins:

    1. `yaml_path_override`, when the YAML file has an entry under `cls.configuration_key`.
    2. `factory_path_override`.
    3. The factory registered in `config.component_preference_factories`.
    4. The configuration registered in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the preferred instances configuration.

    Returns:
        The instance created by `subclass_from_config` or `subclass_from_factory`.

    Raises:
        NoPreferredConfigError: If none of the sources above provides a factory or
            a configuration for `cls.configuration_key`.
    """
    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(cls.configuration_key)
        if yaml_entry:
            construction_config = ObjectConstructionConfig.model_validate(yaml_entry)
            return cls.subclass_from_config(construction_config)

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_config:
        construction_config = ObjectConstructionConfig.model_validate(project_config)
        return cls.subclass_from_config(construction_config)

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Build an instance of the class from a configuration dictionary.

    Args:
        config: Mapping whose entries are forwarded as keyword arguments to the class constructor.

    Returns:
        The instance created from the given configuration.
    """
    instance = cls(**config)
    return instance

validate_document_type classmethod #

validate_document_type(document_type: DocumentType) -> None

Check if the parser supports the document type.

PARAMETER DESCRIPTION
document_type

The document type to validate against the parser.

TYPE: DocumentType

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
@classmethod
def validate_document_type(cls, document_type: DocumentType) -> None:
    """
    Ensure that the given document type is one the parser can handle.

    Args:
        document_type: The document type to look up in the parser's supported types.

    Raises:
        ParserDocumentNotSupportedError: If the document type is not among the supported ones.
    """
    # Guard clause: a supported type needs no further action.
    if document_type in cls.supported_document_types:
        return

    raise ParserDocumentNotSupportedError(parser_name=cls.__name__, document_type=document_type)

parse async #

parse(document: Document) -> list[Element]

Parse the document.

PARAMETER DESCRIPTION
document

The document to parse.

TYPE: Document

RETURNS DESCRIPTION
list[Element]

List with an image element with the image content.

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported by the parser.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
async def parse(self, document: Document) -> list[Element]:
    """
    Turn the document into a single image element.

    Args:
        document: The document to parse.

    Returns:
        A one-item list holding an image element built from the document's raw bytes.

    Raises:
        ParserDocumentNotSupportedError: If the document type is not supported by the parser.
    """
    # Reject unsupported document types before touching the file.
    self.validate_document_type(document.metadata.document_type)

    raw_image = document.local_path.read_bytes()
    image_element = ImageElement(image_bytes=raw_image, document_meta=document.metadata)
    return [image_element]

ragbits.document_search.ingestion.parsers.unstructured.UnstructuredDocumentParser #

UnstructuredDocumentParser(partition_kwargs: dict | None = None, chunking_kwargs: dict | None = None, api_key: str | None = None, api_server: str | None = None, use_api: bool = False, ignore_images: bool = False)

Bases: DocumentParser

Parser that uses the Unstructured API or local SDK to process the documents.

Initialize the UnstructuredDocumentParser instance.

PARAMETER DESCRIPTION
partition_kwargs

The additional arguments for the partitioning. Refer to the Unstructured API documentation for the available options: https://docs.unstructured.io/api-reference/api-services/api-parameters

TYPE: dict | None DEFAULT: None

chunking_kwargs

The additional arguments for the chunking.

TYPE: dict | None DEFAULT: None

api_key

The API key to use for the Unstructured API. If not specified, the UNSTRUCTURED_API_KEY environment variable will be used.

TYPE: str | None DEFAULT: None

api_server

The API server URL to use for the Unstructured API. If not specified, the UNSTRUCTURED_SERVER_URL environment variable will be used.

TYPE: str | None DEFAULT: None

use_api

Whether to use the Unstructured API; if False, the local version of the Unstructured library is used instead.

TYPE: bool DEFAULT: False

ignore_images

If True, images will be skipped.

TYPE: bool DEFAULT: False

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/unstructured.py
def __init__(
    self,
    partition_kwargs: dict | None = None,
    chunking_kwargs: dict | None = None,
    api_key: str | None = None,
    api_server: str | None = None,
    use_api: bool = False,
    ignore_images: bool = False,
) -> None:
    """
    Set up the UnstructuredDocumentParser instance.

    Args:
        partition_kwargs: Extra arguments for the partitioning step. The available options are listed in the
            Unstructured API documentation: https://docs.unstructured.io/api-reference/api-services/api-parameters
        chunking_kwargs: Extra arguments for the chunking step.
        api_key: The API key for the Unstructured API. When omitted, the value is read from the environment
            variable named by UNSTRUCTURED_API_KEY_ENV.
        api_server: The server URL for the Unstructured API. When omitted, the value is read from the
            environment variable named by UNSTRUCTURED_SERVER_URL_ENV.
        use_api: Whether to use the Unstructured API instead of the local version of the Unstructured library.
        ignore_images: If True, images will be skipped.
    """
    # Plain flags are stored as given.
    self.use_api = use_api
    self.ignore_images = ignore_images

    # Missing (or empty) keyword-argument mappings become fresh empty dicts.
    self.partition_kwargs = partition_kwargs if partition_kwargs else {}
    self.chunking_kwargs = chunking_kwargs if chunking_kwargs else {}

    # Explicit credentials win; the environment is consulted only as a fallback.
    self.api_key = api_key if api_key else os.getenv(UNSTRUCTURED_API_KEY_ENV)
    self.api_server = api_server if api_server else os.getenv(UNSTRUCTURED_SERVER_URL_ENV)

    self._client = UnstructuredClient(api_key_auth=self.api_key, server_url=self.api_server)

default_module class-attribute #

default_module: ModuleType | None = parsers

configuration_key class-attribute #

configuration_key: str = 'parser'

supported_document_types class-attribute instance-attribute #

supported_document_types = {TXT, MD, PDF, DOCX, DOC, PPTX, PPT, XLSX, XLS, CSV, HTML, EPUB, ORG, ODT, RST, RTF, TSV, JSON, XML, JPG, PNG}

partition_kwargs instance-attribute #

partition_kwargs = partition_kwargs or {}

chunking_kwargs instance-attribute #

chunking_kwargs = chunking_kwargs or {}

api_key instance-attribute #

api_key = api_key or getenv(UNSTRUCTURED_API_KEY_ENV)

api_server instance-attribute #

api_server = api_server or getenv(UNSTRUCTURED_SERVER_URL_ENV)

use_api instance-attribute #

use_api = use_api

ignore_images instance-attribute #

ignore_images = ignore_images

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Create an instance from the provided configuration. The configuration may name
    a subclass of this class, in which case that subclass is instantiated.

    Args:
        config: A model holding the type to construct and its configuration details.

    Returns:
        An instance built by the resolved class from the provided configuration.

    Raises:
        InvalidConfigError: If the resolved class is not a subclass of the current class.
    """
    resolved_class = import_by_path(config.type, cls.default_module)
    if issubclass(resolved_class, cls):
        return resolved_class.from_config(config.config)

    raise InvalidConfigError(f"{resolved_class} is not a subclass of {cls}")

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Create an instance by calling a factory function. The factory may return
    an instance of a subclass of this class.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object produced by the factory function.

    Raises:
        InvalidConfigError: If the object returned by the factory is not an instance
            of the current class.
    """
    # Resolve the factory and call it with no arguments.
    produced = import_by_path(factory_path, cls.default_module)()
    if isinstance(produced, cls):
        return produced

    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

The path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance according to the project's component preferences.

    Sources are tried in this order, and the first one that yields a result is used:
    the YAML override (if it has an entry for this class's configuration key), the
    factory override, the preferred factory from the config, and finally the
    preferred instance configuration from the config.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing
            the Ragstack instance configuration.

    Returns:
        The instance created from the first matching source.

    Raises:
        NoPreferredConfigError: If none of the sources provides a factory or configuration
            for this class's configuration key.
    """
    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(cls.configuration_key)
        # A YAML file without an entry for this key falls through to the other sources.
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(cls.configuration_key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_instance_config = config.preferred_instances_config.get(cls.configuration_key)
    if project_instance_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(project_instance_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Construct the class from a dictionary of configuration values.

    Args:
        config: Dictionary whose items are passed to the constructor as keyword arguments.

    Returns:
        The instance produced by the constructor.
    """
    constructor_kwargs = config
    return cls(**constructor_kwargs)

validate_document_type classmethod #

validate_document_type(document_type: DocumentType) -> None

Check if the parser supports the document type.

PARAMETER DESCRIPTION
document_type

The document type to validate against the parser.

TYPE: DocumentType

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/base.py
@classmethod
def validate_document_type(cls, document_type: DocumentType) -> None:
    """
    Verify that the parser is able to handle the given document type.

    Args:
        document_type: The document type checked against the parser's supported types.

    Raises:
        ParserDocumentNotSupportedError: If the parser does not support the document type.
    """
    is_supported = document_type in cls.supported_document_types
    if is_supported:
        return

    raise ParserDocumentNotSupportedError(parser_name=cls.__name__, document_type=document_type)

parse async #

parse(document: Document) -> list[Element]

Parse the document using the Unstructured API.

PARAMETER DESCRIPTION
document

The document to parse.

TYPE: Document

RETURNS DESCRIPTION
list[Element]

The list of elements extracted from the document.

RAISES DESCRIPTION
ParserDocumentNotSupportedError

If the document type is not supported by the parser.

Source code in packages/ragbits-document-search/src/ragbits/document_search/ingestion/parsers/unstructured.py
@traceable
async def parse(self, document: Document) -> list[Element]:
    """
    Parse the document by partitioning it and then chunking the result.

    Args:
        document: The document to parse.

    Returns:
        The list of elements extracted from the document.

    Raises:
        ParserDocumentNotSupportedError: If the document type is not supported by the parser.
    """
    # Reject unsupported document types before doing any partitioning work.
    document_type = document.metadata.document_type
    self.validate_document_type(document_type)

    partitioned = await self._partition(document)
    chunked = self._chunk(partitioned, document)
    return chunked