Skip to content

Sources#

ragbits.core.sources.base.Source #

Bases: WithConstructionConfig, BaseModel, ABC

Base class for data sources.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str

id abstractmethod property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class. Its `type` is resolved
            relative to `cls.default_module` and its `config` is passed to `from_config`.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (for example when
    # config.type resolves to a function), so rule that out first and report it as a
    # configuration error like any other wrong type.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling a factory function located by its import path.
    The factory chooses the concrete type, so the result may be an instance of a subclass.

    Args:
        factory_path: Import path of the factory function, written as
            "module.submodule:factory_name". It is resolved relative to `cls.default_module`.

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the
            current class. Resolving the path is delegated to `import_by_path`, which
            presumably raises when the factory can't be found.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Builds an instance from the project's component preferences.

    Candidates are tried in the following order and the first one available is used:

    1. The entry stored under this class's configuration key in the YAML file
       given by `yaml_path_override`.
    2. The factory given by `factory_path_override`.
    3. The factory stored under the configuration key in `config.component_preference_factories`.
    4. The entry stored under the configuration key in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Import path of a factory function, written as
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with instance configuration.

    Returns:
        The instance created from the first available candidate.

    Raises:
        NoPreferredConfigError: None of the candidates listed above is available.
    """
    key = cls.configuration_key

    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    factory_path = config.component_preference_factories.get(key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    instance_config = config.preferred_instances_config.get(key)
    if instance_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(instance_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Returns the identifier of the source type: the class name converted with `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field exposing the class identifier of this source.
    """
    identifier = self.class_identifier()
    return identifier

fetch abstractmethod async #

fetch() -> Path

Load the source.

RETURNS DESCRIPTION
Path

The path to the source.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@abstractmethod
async def fetch(self) -> Path:
    """
    Load the source.

    This is an abstract declaration without an implementation; each concrete
    source class provides its own.

    Returns:
        The path to the source.
    """

list_sources abstractmethod async classmethod #

list_sources(*args: Any, **kwargs: Any) -> Iterable[Self]

List all sources from the given storage.

RETURNS DESCRIPTION
Iterable[Self]

The iterable of Source objects.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
@abstractmethod
async def list_sources(cls, *args: Any, **kwargs: Any) -> Iterable[Self]:  # noqa: ANN401
    """
    List all sources from the given storage.

    This is an abstract declaration without an implementation; each concrete
    source class provides its own.

    Args:
        *args: Positional arguments identifying the storage; their meaning is defined
            by each source implementation.
        **kwargs: Keyword arguments identifying the storage; their meaning is defined
            by each source implementation.

    Returns:
        The iterable of Source objects.
    """

from_uri abstractmethod async classmethod #

from_uri(path: str) -> Iterable[Self]

Create Source instances from a URI path.

The path can contain glob patterns (asterisks) to match multiple sources, but pattern support varies by source type. Each source implementation defines which patterns it supports.

PARAMETER DESCRIPTION
path

The path part of the URI (after protocol://). Pattern support depends on source type.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of Source objects matching the path pattern.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
@abstractmethod
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create Source instances from a URI path.

    The path can contain glob patterns (asterisks) to match multiple sources, but pattern support
    varies by source type. Each source implementation defines which patterns it supports.

    This is an abstract declaration without an implementation; each concrete
    source class provides its own.

    Args:
        path: The path part of the URI (after protocol://). Pattern support depends on source type.

    Returns:
        The iterable of Source objects matching the path pattern.
    """

ragbits.core.sources.azure.AzureBlobStorageSource #

Bases: Source

Source for data stored in the Azure Blob Storage.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 'azure'

account_name instance-attribute #

account_name: str

container_name instance-attribute #

container_name: str

blob_name instance-attribute #

blob_name: str

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class. Its `type` is resolved
            relative to `cls.default_module` and its `config` is passed to `from_config`.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (for example when
    # config.type resolves to a function), so rule that out first and report it as a
    # configuration error like any other wrong type.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling a factory function located by its import path.
    The factory chooses the concrete type, so the result may be an instance of a subclass.

    Args:
        factory_path: Import path of the factory function, written as
            "module.submodule:factory_name". It is resolved relative to `cls.default_module`.

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the
            current class. Resolving the path is delegated to `import_by_path`, which
            presumably raises when the factory can't be found.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Builds an instance from the project's component preferences.

    Candidates are tried in the following order and the first one available is used:

    1. The entry stored under this class's configuration key in the YAML file
       given by `yaml_path_override`.
    2. The factory given by `factory_path_override`.
    3. The factory stored under the configuration key in `config.component_preference_factories`.
    4. The entry stored under the configuration key in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Import path of a factory function, written as
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with instance configuration.

    Returns:
        The instance created from the first available candidate.

    Raises:
        NoPreferredConfigError: None of the candidates listed above is available.
    """
    key = cls.configuration_key

    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    factory_path = config.component_preference_factories.get(key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    instance_config = config.preferred_instances_config.get(key)
    if instance_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(instance_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Returns the identifier of the source type: the class name converted with `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field exposing the class identifier of this source.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Downloads the blob to a temporary local file and returns the file path.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

RAISES DESCRIPTION
SourceNotFoundError

If the blob source is not available.

SourceConnectionError

If the blob service connection is not available.

Source code in packages/ragbits-core/src/ragbits/core/sources/azure.py
@requires_dependencies(["azure.storage.blob", "azure.core.exceptions"], "azure")
async def fetch(self) -> Path:
    """
    Downloads the blob and saves it under the local storage directory.

    The file is written to `<local storage dir>/<account_name>/<container_name>/<blob_name>`.
    The blob is downloaded on every call, whether or not the file already exists.

    Returns:
        The local path to the downloaded file.

    Raises:
        SourceNotFoundError: If the blob source is not available.
        SourceConnectionError: If any other error occurs while downloading or saving the blob.
    """
    target_dir = get_local_storage_dir() / self.account_name / self.container_name
    target_dir.mkdir(parents=True, exist_ok=True)
    path = target_dir / self.blob_name
    with trace(account_name=self.account_name, container=self.container_name, blob=self.blob_name) as outputs:
        try:
            service = self._get_blob_service(self.account_name)
            client = service.get_blob_client(container=self.container_name, blob=self.blob_name)
            # Presumably needed because a blob name can contain "/" and thus point into
            # sub-directories that do not exist locally yet.
            Path(path).parent.mkdir(parents=True, exist_ok=True)
            content = client.download_blob().readall()
            with open(path, "wb") as file:
                file.write(content)
        except ResourceNotFoundError as e:
            raise SourceNotFoundError(f"Blob {self.blob_name} not found in container {self.container_name}") from e
        except Exception as e:
            raise SourceConnectionError() from e
        else:
            outputs.path = path
    return path

list_sources async classmethod #

list_sources(account_name: str, container: str, blob_name: str = '') -> Iterable[Self]

List all sources in the given Azure container, matching the prefix.

PARAMETER DESCRIPTION
account_name

The Azure storage account name.

TYPE: str

container

The Azure container name.

TYPE: str

blob_name

The prefix to match.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the Azure Blob Storage container.

RAISES DESCRIPTION
SourceConnectionError

If there's an error connecting to Azure

Source code in packages/ragbits-core/src/ragbits/core/sources/azure.py
@classmethod
@requires_dependencies(["azure.storage.blob"], "azure")
async def list_sources(
    cls,
    account_name: str,
    container: str,
    blob_name: str = "",
) -> Iterable[Self]:
    """
    Lists the blobs of an Azure container whose names start with the given prefix
    and wraps each of them in a source object.

    Args:
        account_name: The Azure storage account name.
        container: The Azure container name.
        blob_name: The blob name prefix to match. The default empty string applies no filtering.

    Returns:
        The iterable of sources from the Azure Blob Storage container.

    Raises:
        SourceConnectionError: If any error occurs while listing the blobs.
    """
    with trace(account_name=account_name, container=container, blob_name=blob_name) as outputs:
        try:
            container_client = cls._get_blob_service(account_name).get_container_client(container)
            matching_blobs = container_client.list_blobs(name_starts_with=blob_name)
            outputs.results = [
                cls(account_name=account_name, container_name=container, blob_name=item.name)
                for item in matching_blobs
            ]
            return outputs.results
        except Exception as e:
            raise SourceConnectionError() from e

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create AzureBlobStorageSource instances from a URI path.

The supported URI formats: - https://<account-name>.blob.core.windows.net/<container-name>/<blob-name>

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the Azure Blob Storage container.

RAISES DESCRIPTION
ValueError

If the Azure Blob Storage URI is invalid.

Source code in packages/ragbits-core/src/ragbits/core/sources/azure.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create AzureBlobStorageSource instances from a URI path.

    The supported URI formats:
    - https://<account-name>.blob.core.windows.net/<container-name>/<blob-name>

    The blob name may end with a single '*', in which case every blob whose name starts
    with the text before the '*' is returned. No other glob patterns are supported.

    Args:
        path: The URI path in the format described above.

    Returns:
        The iterable of sources from the Azure Blob Storage container.

    Raises:
        ValueError: If the Azure Blob Storage URI is invalid.
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "AzureBlobStorageSource only supports '*' at the end of path. "
            "Patterns like '**' or '?' are not supported."
        )
    parsed = urlparse(path)
    if not parsed.netloc or not parsed.path:
        raise ValueError("Invalid Azure Blob Storage URI format.")

    if parsed.scheme != "https":
        raise ValueError("Invalid scheme, expected 'https://account_name.blob.core.windows.net'.")

    # The suffix must include the leading dot: otherwise hosts such as
    # "evilblob.core.windows.net" or a bare "blob.core.windows.net" would be accepted
    # and the whole host would end up as the account name.
    account_name = parsed.netloc.removesuffix(".blob.core.windows.net")
    if not account_name or account_name == parsed.netloc:
        raise ValueError("Invalid scheme, expected 'https://account_name.blob.core.windows.net'.")

    path_parts = parsed.path.lstrip("/").split("/", 1)
    # A path like "container/" splits into two parts with an empty blob name,
    # so the parts have to be checked for emptiness as well as counted.
    if len(path_parts) != 2 or not all(path_parts):  # noqa PLR2004
        raise ValueError("URI must include both container and blob name.")

    container_name, blob_name = path_parts
    if "*" in blob_name:
        if not blob_name.endswith("*") or "*" in blob_name[:-1]:
            raise ValueError(
                f"AzureBlobStorageSource only supports '*' at the end of path. Invalid pattern: {blob_name}."
            )
        # Drop the trailing '*' and use the remainder as a name prefix.
        blob_name = blob_name[:-1]
        return await cls.list_sources(container=container_name, blob_name=blob_name, account_name=account_name)

    # Return a single-element list (consistent with other sources)
    return [cls(account_name=account_name, container_name=container_name, blob_name=blob_name)]

ragbits.core.sources.gcs.GCSSource #

Bases: Source

Source for data stored in the Google Cloud Storage.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 'gcs'

bucket instance-attribute #

bucket: str

object_name instance-attribute #

object_name: str

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class. Its `type` is resolved
            relative to `cls.default_module` and its `config` is passed to `from_config`.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (for example when
    # config.type resolves to a function), so rule that out first and report it as a
    # configuration error like any other wrong type.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not an instance of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an object by calling a factory function located by its import path.
    The factory chooses the concrete type, so the result may be an instance of a subclass.

    Args:
        factory_path: Import path of the factory function, written as
            "module.submodule:factory_name". It is resolved relative to `cls.default_module`.

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance of the
            current class. Resolving the path is delegated to `import_by_path`, which
            presumably raises when the factory can't be found.
    """
    create = import_by_path(factory_path, cls.default_module)
    instance = create()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Builds an instance from the project's component preferences.

    Candidates are tried in the following order and the first one available is used:

    1. The entry stored under this class's configuration key in the YAML file
       given by `yaml_path_override`.
    2. The factory given by `factory_path_override`.
    3. The factory stored under the configuration key in `config.component_preference_factories`.
    4. The entry stored under the configuration key in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Import path of a factory function, written as
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with instance configuration.

    Returns:
        The instance created from the first available candidate.

    Raises:
        NoPreferredConfigError: None of the candidates listed above is available.
    """
    key = cls.configuration_key

    if yaml_path_override:
        yaml_entry = get_config_from_yaml(yaml_path_override).get(key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    factory_path = config.component_preference_factories.get(key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    instance_config = config.preferred_instances_config.get(key)
    if instance_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(instance_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Returns the identifier of the source type: the class name converted with `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field exposing the class identifier of this source.
    """
    identifier = self.class_identifier()
    return identifier

set_storage classmethod #

set_storage(storage: Storage | None) -> None

Set the storage client for all instances.

Source code in packages/ragbits-core/src/ragbits/core/sources/gcs.py
@classmethod
def set_storage(cls, storage: "StorageClient | None") -> None:
    """
    Stores the given storage client on the class, so that it is shared by all instances.

    Args:
        storage: The storage client to use, or None.
    """
    cls._storage = storage

fetch async #

fetch() -> Path

Fetch the file from Google Cloud Storage and store it locally.

The file is downloaded to a local directory specified by local_dir. If the file already exists locally, it will not be downloaded again. If the file doesn't exist locally, it will be fetched from GCS. The local directory is determined by the environment variable LOCAL_STORAGE_DIR. If this environment variable is not set, a temporary directory is used.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

Source code in packages/ragbits-core/src/ragbits/core/sources/gcs.py
@traceable
@requires_dependencies(["gcloud.aio.storage"], "gcs")
async def fetch(self) -> Path:
    """
    Fetch the object from Google Cloud Storage and store it locally.

    The file is placed at `<local storage dir>/<bucket>/<object_name>`. When a file already
    exists at that location nothing is downloaded and the existing file is used. The local
    storage directory is determined by the environment variable `LOCAL_STORAGE_DIR`; if this
    environment variable is not set, a temporary directory is used.

    Returns:
        The local path to the downloaded file.
    """
    bucket_local_dir = get_local_storage_dir() / self.bucket
    bucket_local_dir.mkdir(parents=True, exist_ok=True)
    path = bucket_local_dir / self.object_name
    with trace(bucket=self.bucket, object=self.object_name) as outputs:
        already_downloaded = path.is_file()
        if not already_downloaded:
            async with await self._get_storage() as client:
                content = await client.download(self.bucket, self.object_name)
                # Presumably needed because an object name can contain "/" and thus point
                # into sub-directories that do not exist locally yet.
                Path(path).parent.mkdir(parents=True, exist_ok=True)
                with open(path, mode="wb+") as file_object:
                    file_object.write(content)
        outputs.path = path
    return path

list_sources async classmethod #

list_sources(bucket: str, prefix: str = '') -> Iterable[Self]

List all sources in the given GCS bucket, matching the prefix.

PARAMETER DESCRIPTION
bucket

The GCS bucket.

TYPE: str

prefix

The prefix to match.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the GCS bucket.

Source code in packages/ragbits-core/src/ragbits/core/sources/gcs.py
@classmethod
@requires_dependencies(["gcloud.aio.storage"], "gcs")
async def list_sources(cls, bucket: str, prefix: str = "") -> Iterable[Self]:
    """
    List all sources in the given GCS bucket, matching the prefix.

    The listing is followed across result pages: as long as a response carries a `nextPageToken`,
    the next page is requested, so large buckets are not silently truncated to the first page.
    Objects whose name ends with "/" are skipped.

    Args:
        bucket: The GCS bucket.
        prefix: The prefix to match.

    Returns:
        The iterable of sources from the GCS bucket.
    """
    with trace() as outputs:
        async with await cls._get_storage() as storage:
            params = {"prefix": prefix}
            sources = []
            while True:
                page = await storage.list_objects(bucket, params=params)
                sources.extend(
                    cls(bucket=bucket, object_name=item["name"])
                    for item in page.get("items", [])
                    if not item["name"].endswith("/")
                )
                # The last page has no continuation token.
                next_page_token = page.get("nextPageToken")
                if not next_page_token:
                    break
                params = {"prefix": prefix, "pageToken": next_page_token}
            outputs.results = sources
            return outputs.results

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create GCSSource instances from a URI path.

The supported URI formats:

- `<bucket>/<folder>/*` - matches all files in the folder
- `<bucket>/<folder>/<prefix>*` - matches all files starting with prefix

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the GCS bucket.

RAISES DESCRIPTION
ValueError

If an unsupported pattern is used

Source code in packages/ragbits-core/src/ragbits/core/sources/gcs.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create GCSSource instances from a URI path.

    The supported URI formats:
    - <bucket>/<object> - a single object
    - <bucket>/<folder>/* - matches all files in the folder
    - <bucket>/<folder>/<prefix>* - matches all files starting with prefix

    Args:
        path: The URI path in the format described above.

    Returns:
        The iterable of sources from the GCS bucket.

    Raises:
        ValueError: If an unsupported pattern is used, i.e. '**', '?', a '*' in the bucket name,
            or a '*' anywhere other than the very end of the path.
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "GCSSource only supports '*' at the end of path. Patterns like '**' or '?' are not supported."
        )

    # Everything before the first "/" is the bucket, the rest is the object name or prefix.
    bucket, _, prefix = path.partition("/")

    # A wildcard in the bucket segment can never be "at the end of path"; reject it
    # instead of creating a source with a literal '*' in its bucket name.
    if "*" in bucket:
        raise ValueError(f"GCSSource only supports '*' at the end of path. Invalid pattern: {path}")

    if "*" not in prefix:
        return [cls(bucket=bucket, object_name=prefix)]

    # The first '*' must also be the last character; this rejects both "a*b" and "a*b*".
    if prefix.index("*") != len(prefix) - 1:
        raise ValueError(f"GCSSource only supports '*' at the end of path. Invalid pattern: {prefix}")

    # Remove the trailing * for GCS prefix listing
    return await cls.list_sources(bucket=bucket, prefix=prefix[:-1])

ragbits.core.sources.git.GitSource #

Bases: Source

Source for data stored in the Git repository.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 'git'

repo_url instance-attribute #

repo_url: str

file_path instance-attribute #

file_path: str

branch class-attribute instance-attribute #

branch: str | None = None

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Resolve the class named in the configuration and build an instance of it.

    The type named by `config.type` is looked up with `import_by_path` (using `cls.default_module`)
    and must be this class or one of its subclasses. The instance is then created through the
    resolved class's `from_config`, which receives `config.config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the resolved class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    resolved = import_by_path(config.type, cls.default_module)
    if issubclass(resolved, cls):
        return resolved.from_config(config.config)

    raise InvalidConfigError(f"{resolved} is not a subclass of {cls}")

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is looked up with `import_by_path` (using `cls.default_module`) and called without
    arguments. Its result must be an instance of this class or of one of its subclasses.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory function.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    factory = import_by_path(factory_path, cls.default_module)
    produced = factory()
    if isinstance(produced, cls):
        return produced

    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance according to the project's component preferences.

    The candidates are tried in this order and the first one available is used:

    1. The entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`.
    2. The factory given by `factory_path_override`.
    3. The factory registered for `cls.configuration_key` in `config.component_preference_factories`.
    4. The configuration registered for `cls.configuration_key` in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        The instance created from the first available preference.

    Raises:
        NoPreferredConfigError: If none of the candidates above yields a factory or configuration.
    """
    key = cls.configuration_key

    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(key)
        # A YAML file without an entry for this component falls through to the next candidates.
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(key)
    if project_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(project_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier of the source type, derived from the class name via `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field exposing the value of `class_identifier()`.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Clone the Git repository and return the path to the specific file.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

RAISES DESCRIPTION
SourceNotFoundError

If the repository cannot be cloned or the file doesn't exist.

Source code in packages/ragbits-core/src/ragbits/core/sources/git.py
@requires_dependencies(["git"])
@traceable
async def fetch(self) -> Path:
    """
    Make the repository available locally and return the path to the requested file.

    The local repository directory is obtained from `_get_repo_dir` and prepared by `_ensure_repo`
    for the configured URL and branch; `file_path` is then resolved relative to that directory.

    Returns:
        The local path to the file inside the repository directory.

    Raises:
        SourceNotFoundError: If the file doesn't exist in the repository or is not a regular file.
            Documented as also being raised when the repository cannot be cloned.
    """
    checkout_dir = self._get_repo_dir(self.repo_url, self.branch)
    self._ensure_repo(self.repo_url, checkout_dir, self.branch)

    target = checkout_dir / self.file_path
    # is_file() is False for a missing path as well as for a directory.
    if target.is_file():
        return target

    raise SourceNotFoundError(f"File {self.file_path} not found in repository")

list_sources async classmethod #

list_sources(repo_url: str, file_pattern: str = '**/*', branch: str | None = None) -> Iterable[Self]

List all files in the repository matching the pattern.

PARAMETER DESCRIPTION
repo_url

URL of the git repository.

TYPE: str

file_pattern

The glob pattern to match files.

TYPE: str DEFAULT: '**/*'

branch

Optional branch name.

TYPE: str | None DEFAULT: None

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the git repository.

Source code in packages/ragbits-core/src/ragbits/core/sources/git.py
@classmethod
@traceable
async def list_sources(cls, repo_url: str, file_pattern: str = "**/*", branch: str | None = None) -> Iterable[Self]:
    """
    Create a source for every file in the repository that matches the glob pattern.

    Args:
        repo_url: URL of the git repository.
        file_pattern: The glob pattern to match files, evaluated relative to the repository directory.
        branch: Optional branch name.

    Returns:
        The iterable of sources from the git repository, with file paths relative to the repository root.
    """
    checkout_dir = cls._get_repo_dir(repo_url, branch)
    cls._ensure_repo(repo_url, checkout_dir, branch)

    # NOTE(review): the pattern is applied to the whole directory, so the default "**/*" presumably
    # also matches files under a `.git` directory if the checkout contains one - confirm this is intended.
    return [
        cls(repo_url=repo_url, file_path=str(candidate.relative_to(checkout_dir)), branch=branch)
        for candidate in checkout_dir.glob(file_pattern)
        if candidate.is_file()  # directories matched by the pattern are not sources
    ]

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create GitSource instances from a URI path.

Supported URI formats: - git://https://github.com/username/repo.git:path/to/file.txt - git://https://github.com/username/repo.git:branch:path/to/file.txt - git@github.com:username/repo.git:path/to/file.txt - git@github.com:username/repo.git:branch:path/to/file.txt

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the git repository.

Source code in packages/ragbits-core/src/ragbits/core/sources/git.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create GitSource instances from a URI path.

    Supported URI formats:
    - git://https://github.com/username/repo.git:path/to/file.txt
    - git://https://github.com/username/repo.git:branch:path/to/file.txt
    - git@github.com:username/repo.git:path/to/file.txt
    - git@github.com:username/repo.git:branch:path/to/file.txt

    A leading "git://" is dropped and the rest is split on ":". The number of resulting parts,
    compared against `_REPO_AND_FILE_PARTS` and `_MIN_PARTS_WITH_PROTOCOL`, decides how they are
    interpreted.

    Args:
        path: The URI path in the format described above.

    Returns:
        The iterable of sources from the git repository. It is empty when the number of parts
        matches none of the supported layouts.
    """
    path = path.removeprefix("git://")

    parts = path.split(":")
    part_count = len(parts)
    sources = []

    if part_count == _REPO_AND_FILE_PARTS:
        # Repo URL and file path
        sources.append(cls(repo_url=parts[0], file_path=parts[1]))
    elif part_count >= _MIN_PARTS_WITH_PROTOCOL:
        head = parts[0]
        if head.startswith("git@") or head in ["http", "https"]:
            # SSH ("git@host:user/repo.git") and HTTP(S) ("https://host/...") URLs contain a ":"
            # themselves, so the first two parts are joined back into the repository URL.
            repo_url = f"{head}:{parts[1]}"
            if part_count == _MIN_PARTS_WITH_PROTOCOL:
                branch, file_path = None, parts[2]
            else:
                branch, file_path = parts[2], parts[3]
            sources.append(cls(repo_url=repo_url, file_path=file_path, branch=branch))
        else:
            # Repo URL, branch, and file path in standard format
            sources.append(cls(repo_url=parts[0], branch=parts[1], file_path=parts[2]))

    return sources

ragbits.core.sources.hf.HuggingFaceSource #

Bases: Source

Source for data stored in the Hugging Face repository.

Supports two formats: 1. Complete dataset: When no row is specified, downloads the entire dataset. Used for QA datasets. 2. Single row: When a specific row is specified, downloads only that row. Used for document datasets (requires "content" and "source" columns).

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 'hf'

path instance-attribute #

path: str

split class-attribute instance-attribute #

split: str = 'train'

row class-attribute instance-attribute #

row: int | None = None

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Resolve the class named in the configuration and build an instance of it.

    The type named by `config.type` is looked up with `import_by_path` (using `cls.default_module`)
    and must be this class or one of its subclasses. The instance is then created through the
    resolved class's `from_config`, which receives `config.config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the resolved class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    resolved = import_by_path(config.type, cls.default_module)
    if issubclass(resolved, cls):
        return resolved.from_config(config.config)

    raise InvalidConfigError(f"{resolved} is not a subclass of {cls}")

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is looked up with `import_by_path` (using `cls.default_module`) and called without
    arguments. Its result must be an instance of this class or of one of its subclasses.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory function.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    factory = import_by_path(factory_path, cls.default_module)
    produced = factory()
    if isinstance(produced, cls):
        return produced

    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Create an instance according to the project's component preferences.

    The candidates are tried in this order and the first one available is used:

    1. The entry stored under `cls.configuration_key` in the YAML file at `yaml_path_override`.
    2. The factory given by `factory_path_override`.
    3. The factory registered for `cls.configuration_key` in `config.component_preference_factories`.
    4. The configuration registered for `cls.configuration_key` in `config.preferred_instances_config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".
        yaml_path_override: The path to the YAML file containing the instance configuration.

    Returns:
        The instance created from the first available preference.

    Raises:
        NoPreferredConfigError: If none of the candidates above yields a factory or configuration.
    """
    key = cls.configuration_key

    if yaml_path_override:
        yaml_preferences = get_config_from_yaml(yaml_path_override)
        yaml_entry = yaml_preferences.get(key)
        # A YAML file without an entry for this component falls through to the next candidates.
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    if factory_path_override:
        return cls.subclass_from_factory(factory_path_override)

    project_factory = config.component_preference_factories.get(key)
    if project_factory:
        return cls.subclass_from_factory(project_factory)

    project_config = config.preferred_instances_config.get(key)
    if project_config:
        return cls.subclass_from_config(ObjectConstructionConfig.model_validate(project_config))

    raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {key}")

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Initializes the class with the provided configuration.

    Args:
        config: A dictionary containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.
    """
    return cls(**config)

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier of the source type, derived from the class name via `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field exposing the value of `class_identifier()`.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Fetch the file from Hugging Face and store it locally.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

RAISES DESCRIPTION
SourceConnectionError

If the source connection fails.

SourceNotFoundError

If the source document is not found.

Source code in packages/ragbits-core/src/ragbits/core/sources/hf.py
@requires_dependencies(["datasets"], "hf")
async def fetch(self) -> Path:
    """
    Fetch the data from Hugging Face and store it locally.

    Two modes are supported, selected by `self.row`:

    - `row` is None: the whole split is saved as `<storage dir>/<path>/<split>.json`.
      The dataset is only loaded when that file does not exist yet.
    - `row` is set: the dataset is streamed, the requested row is read, and its "content" value
      is written to `<storage dir>/<row "source" value>` unless that file already exists.

    Returns:
        The local path to the downloaded file.

    Raises:
        SourceConnectionError: If the source connection fails.
        SourceNotFoundError: If the dataset is not found or the requested row does not exist.
    """
    with trace(path=self.path, split=self.split, row=self.row) as outputs:
        if self.row is None:
            # Whole-split mode: one JSON file per split.
            storage_dir = get_local_storage_dir()
            dataset_dir = storage_dir / self.path
            dataset_dir.mkdir(parents=True, exist_ok=True)
            split_file = dataset_dir / f"{self.split}.json"

            if not split_file.is_file():
                try:
                    full_dataset = load_dataset(self.path, split=self.split)
                except ConnectionError as exc:
                    raise SourceConnectionError() from exc
                except DatasetNotFoundError as exc:
                    raise SourceNotFoundError(source_id=self.id) from exc

                full_dataset.to_json(split_file)
            outputs.path = split_file
        else:
            # Single-row mode: stream the dataset so that only the requested row is read.
            try:
                streamed = load_dataset(self.path, split=self.split, streaming=True)
            except ConnectionError as exc:
                raise SourceConnectionError() from exc
            except DatasetNotFoundError as exc:
                raise SourceNotFoundError(source_id=self.id) from exc

            try:
                record = next(iter(streamed.skip(self.row).take(1)))
            except StopIteration as exc:
                # Nothing left after skipping `row` entries: the row does not exist.
                raise SourceNotFoundError(source_id=self.id) from exc

            storage_dir = get_local_storage_dir()
            record_dir = storage_dir / Path(record["source"]).parent
            record_dir.mkdir(parents=True, exist_ok=True)
            record_file = storage_dir / record["source"]

            if not record_file.is_file():
                with open(record_file, mode="w", encoding="utf-8") as file:
                    file.write(record["content"])
            outputs.path = record_file

    return outputs.path

list_sources async classmethod #

list_sources(path: str, split: str) -> Iterable[Self]

List all sources in the Hugging Face repository.

PARAMETER DESCRIPTION
path

Path or name of the dataset.

TYPE: str

split

Dataset split.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the Hugging Face repository.

Source code in packages/ragbits-core/src/ragbits/core/sources/hf.py
@classmethod
@traceable
async def list_sources(cls, path: str, split: str) -> Iterable[Self]:
    """
    Create one source per row of the given dataset split.

    Args:
        path: Path or name of the dataset.
        split: Dataset split. Any bracketed suffix (e.g. "[...]") is removed from the split name
            stored on the returned sources.

    Returns:
        The iterable of sources from the Hugging Face repository.
    """
    dataset = load_dataset(path, split=split)
    # NOTE(review): rows are numbered from 0 within the loaded split, while the stored split name
    # has its bracketed part stripped - presumably row indexes of a sliced split then refer to
    # the unsliced split; confirm against how `fetch` resolves rows.
    base_split = re.sub(r"\[.*?\]", "", split)
    row_count = len(dataset)
    return [cls(path=path, split=base_split, row=index) for index in range(row_count)]

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create HuggingFaceSource instances from a URI path.

The supported URI formats:

- `<dataset-path>/<split>/<row>`

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the Hugging Face repository.

RAISES DESCRIPTION
ValueError

If the path contains patterns or has invalid format.

Source code in packages/ragbits-core/src/ragbits/core/sources/hf.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create HuggingFaceSource instances from a URI path.

    The supported URI formats:
    - <dataset-path>/<split>/<row>

    The last two segments are always the split and the row, so the dataset path itself may
    contain "/" (e.g. <namespace>/<dataset>/<split>/<row>).

    Args:
        path: The URI path in the format described above.

    Returns:
       The iterable of sources from the Hugging Face repository.

    Raises:
        ValueError: If the path contains patterns or has invalid format.
    """
    if "*" in path or "?" in path:
        raise ValueError(
            "HuggingFaceSource does not support patterns. Path must be in format: dataset_path/split/row"
        )

    try:
        # Split from the right so that namespaced dataset ids ("org/dataset") stay intact.
        # Fewer than three segments fail the unpacking; a non-numeric row fails int().
        dataset_path, split, row = path.rsplit("/", 2)
        return [cls(path=dataset_path, split=split, row=int(row))]
    except ValueError as err:
        raise ValueError("Invalid HuggingFace path format. Expected: dataset_path/split/row") from err

ragbits.core.sources.local.LocalFileSource #

Bases: Source

Source for data stored on the local disk.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 'local'

path instance-attribute #

path: Path

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Resolve the class named in the configuration and build an instance of it.

    The type named by `config.type` is looked up with `import_by_path` (using `cls.default_module`)
    and must be this class or one of its subclasses. The instance is then created through the
    resolved class's `from_config`, which receives `config.config`.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the resolved class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found or is not a subclass of the current class.
    """
    resolved = import_by_path(config.type, cls.default_module)
    if issubclass(resolved, cls):
        return resolved.from_config(config.config)

    raise InvalidConfigError(f"{resolved} is not a subclass of {cls}")

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Build an instance by calling a factory function located by its import path.

    The factory is looked up with `import_by_path` (using `cls.default_module`) and called without
    arguments. Its result must be an instance of this class or of one of its subclasses.

    Args:
        factory_path: A string representing the path to the factory function
            in the format of "module.submodule:factory_name".

    Returns:
        The object returned by the factory function.

    Raises:
        InvalidConfigError: The factory can't be found or the object returned
            is not an instance of the current class.
    """
    factory = import_by_path(factory_path, cls.default_module)
    produced = factory()
    if isinstance(produced, cls):
        return produced

    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance based on the project's component preferences.

    The candidates are tried in this order and the first one found is used:
    1. the entry for this component in the YAML file at `yaml_path_override`,
    2. the factory at `factory_path_override`,
    3. the preferred factory registered in `config`,
    4. the preferred instance configuration stored in `config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function, in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the instance configuration.

    Returns:
        The instance created from the first preference found.

    Raises:
        NoPreferredConfigError: If none of the lookups yields a factory or a configuration.
    """
    key = cls.configuration_key

    if yaml_path_override:
        # A YAML file without an entry for this component is not an error: fall through.
        yaml_entry = get_config_from_yaml(yaml_path_override).get(key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    # The explicit override wins over the factory registered in the project config.
    factory_path = factory_path_override or config.component_preference_factories.get(key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    instance_config = config.preferred_instances_config.get(key)
    if not instance_config:
        raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")
    return cls.subclass_from_config(ObjectConstructionConfig.model_validate(instance_config))

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Builds an instance of the class from a configuration dictionary.

    Args:
        config: Constructor arguments for the class, keyed by parameter name.

    Returns:
        The newly created instance.
    """
    # Every dictionary entry is forwarded to the constructor as a keyword argument.
    instance = cls(**config)
    return instance

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier of this source type: the class name passed through `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field exposing the class identifier of this source.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Fetch the source.

RETURNS DESCRIPTION
Path

The local path to the file.

RAISES DESCRIPTION
SourceNotFoundError

If the source document is not found.

Source code in packages/ragbits-core/src/ragbits/core/sources/local.py
@traceable
async def fetch(self) -> Path:
    """
    Resolve the source to a file on the local disk.

    Returns:
        The path stored on this source.

    Raises:
        SourceNotFoundError: If the path does not point to an existing file.
    """
    if self.path.is_file():
        return self.path
    raise SourceNotFoundError(source_id=self.id)

list_sources async classmethod #

list_sources(path: Path, file_pattern: str = '*') -> Iterable[Self]

List all sources in the given directory, matching the file pattern.

PARAMETER DESCRIPTION
path

The path to the directory.

TYPE: Path

file_pattern

The file pattern to match.

TYPE: str DEFAULT: '*'

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the local file system.

Source code in packages/ragbits-core/src/ragbits/core/sources/local.py
@classmethod
@traceable
async def list_sources(cls, path: Path, file_pattern: str = "*") -> Iterable[Self]:
    """
    List all sources in the given directory, matching the file pattern.

    Args:
        path: The path to the directory.
        file_pattern: The glob pattern to match, relative to `path`.

    Returns:
        The iterable of sources from the local file system. Only regular files
        are returned; directories matching the pattern are skipped.
    """
    # Path.glob() also yields directories. Skip them, as from_uri does, because
    # fetch() raises SourceNotFoundError for anything that is not a file.
    return [cls(path=file_path) for file_path in path.glob(file_pattern) if file_path.is_file()]

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create LocalFileSource instances from a URI path.

The supported URI formats: - "**/*.txt" - all .txt files in any subdirectory - "*.py" - all Python files in the current directory - "**/*" - all files in any subdirectory - '?' matches exactly one character

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the local file system.

Source code in packages/ragbits-core/src/ragbits/core/sources/local.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create LocalFileSource instances from a URI path.

    The supported URI formats:
    - "**/*.txt" - all .txt files in any subdirectory
    - "*.py" - all Python files in the current directory
    - "**/*" - all files in any subdirectory
    - '?' matches exactly one character

    Args:
        path: The URI path in the format described above.

    Returns:
        The iterable of sources from the local file system.
    """
    base_path, pattern = cls._split_path_and_pattern(path=Path(path))

    # A base path that is itself a file is returned directly, whatever the pattern.
    if base_path.is_file():
        return [cls(path=base_path)]

    if pattern:
        # Directories matched by the pattern are dropped; only files become sources.
        return [cls(path=match) for match in base_path.glob(pattern) if match.is_file()]
    return []

ragbits.core.sources.s3.S3Source #

Bases: Source

Source for data stored in the AWS S3 bucket.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 's3'

bucket_name instance-attribute #

bucket_name: str

key instance-attribute #

key: str

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found, the object resolved from `config.type`
            is not a class, or it is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (e.g. the path
    # points at a function or a constant), so check that first and report a config error.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an instance by calling the factory function found at the given path.
    The factory may return an instance of a subclass.

    Args:
        factory_path: Path to the factory function, in the format of
            "module.submodule:factory_name".

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance
            of the current class.
    """
    make_instance = import_by_path(factory_path, cls.default_module)
    instance = make_instance()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance based on the project's component preferences.

    The candidates are tried in this order and the first one found is used:
    1. the entry for this component in the YAML file at `yaml_path_override`,
    2. the factory at `factory_path_override`,
    3. the preferred factory registered in `config`,
    4. the preferred instance configuration stored in `config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function, in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the instance configuration.

    Returns:
        The instance created from the first preference found.

    Raises:
        NoPreferredConfigError: If none of the lookups yields a factory or a configuration.
    """
    key = cls.configuration_key

    if yaml_path_override:
        # A YAML file without an entry for this component is not an error: fall through.
        yaml_entry = get_config_from_yaml(yaml_path_override).get(key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    # The explicit override wins over the factory registered in the project config.
    factory_path = factory_path_override or config.component_preference_factories.get(key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    instance_config = config.preferred_instances_config.get(key)
    if not instance_config:
        raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")
    return cls.subclass_from_config(ObjectConstructionConfig.model_validate(instance_config))

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Builds an instance of the class from a configuration dictionary.

    Args:
        config: Constructor arguments for the class, keyed by parameter name.

    Returns:
        The newly created instance.
    """
    # Every dictionary entry is forwarded to the constructor as a keyword argument.
    instance = cls(**config)
    return instance

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier of this source type: the class name passed through `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field exposing the class identifier of this source.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Download a file in the given bucket name and key.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

RAISES DESCRIPTION
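FileNotFoundError

If the object does not exist (S3 responds with error code 404).

PermissionError

If access to the object is denied (S3 responds with error code 403).

RuntimeError

If the S3 client is not initialized or S3 reports any other client error.

ValueError

If AWS credentials are missing or incomplete.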

Source code in packages/ragbits-core/src/ragbits/core/sources/s3.py
@requires_dependencies(["boto3"], "s3")
async def fetch(self) -> Path:
    """
    Download the object identified by the bucket name and key to local storage.

    The file is saved in a directory named after the bucket inside the local
    storage directory; slashes in the key are replaced with underscores to
    form the file name.

    Returns:
        The local path to the downloaded file.

    Raises:
        FileNotFoundError: If S3 reports error code 404 for the object.
        PermissionError: If S3 reports error code 403 for the object.
        RuntimeError: If the S3 client is not initialized, or S3 reports any other client error.
        ValueError: If AWS credentials are missing or incomplete.
    """
    if self._s3_client is None:
        # Set the client up on first use, then make sure that actually worked.
        self._set_client(self.bucket_name)
        if self._s3_client is None:
            raise RuntimeError("S3 client is not initialized.")

    bucket_dir = get_local_storage_dir() / self.bucket_name
    bucket_dir.mkdir(parents=True, exist_ok=True)
    path = bucket_dir / self.key.replace("/", "_")

    with trace(bucket=self.bucket_name, key=self.key) as outputs:
        try:
            self._s3_client.download_file(self.bucket_name, self.key, path)
        except ClientError as e:
            # Translate the S3 error code into the closest built-in exception.
            error_code = e.response["Error"]["Code"]
            if error_code == "404":
                raise FileNotFoundError(f"The object does not exist: {self.key}") from e
            if error_code == "403":
                raise PermissionError(f"Access denied. No permission to download: {self.key}") from e
            raise RuntimeError(f"S3 Client Error: {e}") from e
        except (NoCredentialsError, PartialCredentialsError) as e:
            raise ValueError("AWS credentials are missing or invalid.") from e
        outputs.path = path
    return path

list_sources async classmethod #

list_sources(bucket_name: str, prefix: str) -> Iterable[Self]

List all files under the given bucket name and with the given prefix.

PARAMETER DESCRIPTION
bucket_name

The name of the S3 bucket to use.

TYPE: str

prefix

The path to the files and prefix to look for.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the S3 bucket.

RAISES DESCRIPTION
ValueError

If AWS credentials are missing or incomplete.

RuntimeError

If the S3 client is not initialized or listing the bucket fails with a client error.

Source code in packages/ragbits-core/src/ragbits/core/sources/s3.py
@classmethod
@requires_dependencies(["boto3"], "s3")
async def list_sources(cls, bucket_name: str, prefix: str) -> Iterable[Self]:
    """
    List all objects in the given bucket whose keys start with the given prefix.

    Args:
        bucket_name: The name of the S3 bucket to use.
        prefix: The key prefix the listed objects must share.

    Returns:
        The iterable of sources from the S3 bucket, one per listed object.

    Raises:
        ValueError: If AWS credentials are missing or incomplete.
        RuntimeError: If the S3 client is not initialized, or listing fails with a client error.
    """
    cls._set_client(bucket_name)
    if cls._s3_client is None:
        raise RuntimeError("S3 client is not initialized.")

    with trace(bucket=bucket_name, key=prefix) as outputs:
        try:
            # Listings are paginated; walk every page and collect each object's key.
            pages = cls._s3_client.get_paginator("list_objects_v2").paginate(Bucket=bucket_name, Prefix=prefix)
            found_sources = [
                cls(bucket_name=bucket_name, key=entry["Key"])
                for page in pages
                for entry in page.get("Contents", [])
            ]
            outputs.sources = found_sources
            return found_sources
        except (NoCredentialsError, PartialCredentialsError) as e:
            raise ValueError("AWS credentials are missing or incomplete. Please configure them.") from e
        except ClientError as e:
            raise RuntimeError(f"Failed to list files in bucket {bucket_name}: {e}") from e

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create S3Source instances from a URI path.

The supported URI formats: - s3://<bucket-name>/<key> - https://<bucket-name>.s3.<region>.amazonaws.com/<key> - https://s3.<region>.amazonaws.com/<bucket-name>/<key>

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the S3 bucket.

RAISES DESCRIPTION
ValueError

If the path has invalid format

Source code in packages/ragbits-core/src/ragbits/core/sources/s3.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create S3Source instances from a URI path.

    The supported URI formats:
    - s3://<bucket-name>/<key>
    - https://<bucket-name>.s3.<region>.amazonaws.com/<key>
    - https://s3.<region>.amazonaws.com/<bucket-name>/<key>

    A single trailing '*' in the key turns the part before it into a prefix,
    and every object under that prefix is listed.

    Args:
        path: The URI path in the format described above.

    Returns:
        The iterable of sources from the S3 bucket.

    Raises:
        ValueError: If the path has invalid format
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "S3Source only supports '*' at the end of path. Patterns like '**' or '?' are not supported."
        )

    parsed = urlparse(path)
    if not parsed.netloc or not parsed.path:
        raise ValueError("Invalid AWS Source URI format.")
    if parsed.scheme not in {"s3", "https"}:
        raise ValueError("Invalid AWS Source URI format.")

    if parsed.scheme == "s3":
        bucket_name = parsed.netloc
        path_to_file = parsed.path.lstrip("/")
    else:
        # Only "https" can reach this branch: every other scheme was rejected above.
        if not parsed.netloc.endswith("amazonaws.com"):
            raise ValueError("Invalid AWS Source URI format.")

        # In a virtual-hosted URL the bucket name comes before the "s3" service label
        # (<bucket>.s3.<region>.amazonaws.com). Look for that label after the first
        # position, so that bucket names starting with "s3" or containing dots are
        # not mistaken for a path-style host.
        host_labels = parsed.netloc.split(".")
        service_index = next(
            (
                index
                for index, label in enumerate(host_labels)
                if index > 0 and (label == "s3" or label.startswith("s3-"))
            ),
            None,
        )
        if service_index is not None:
            bucket_name = ".".join(host_labels[:service_index])
            path_to_file = parsed.path.lstrip("/")
        elif parsed.netloc.startswith("s3"):
            # Path-style URL: the bucket is the first path segment, the key is the rest.
            parts = parsed.path.split("/")
            bucket_name = parts[1]
            path_to_file = "/".join(parts[2:])
        else:
            bucket_name = host_labels[0]
            path_to_file = parsed.path.lstrip("/")

    if "*" in path_to_file:
        if not path_to_file.endswith("*") or "*" in path_to_file[:-1]:
            raise ValueError(f"AWS Source only supports '*' at the end of path. Invalid pattern: {[path_to_file]}.")
        # Drop the trailing '*' and use the remainder as the listing prefix.
        path_to_file = path_to_file[:-1]
        return await cls.list_sources(bucket_name=bucket_name, prefix=path_to_file)

    return [cls(bucket_name=bucket_name, key=path_to_file)]

ragbits.core.sources.web.WebSource #

Bases: Source

Source for data stored in the web.

default_module class-attribute #

default_module: ModuleType | None = sources

configuration_key class-attribute #

configuration_key: str = 'source'

protocol class-attribute #

protocol: str = 'web'

url instance-attribute #

url: str

headers class-attribute instance-attribute #

headers: dict[str, str] | None = None

id property #

id: str

Get the source identifier.

subclass_from_config classmethod #

subclass_from_config(config: ObjectConstructionConfig) -> Self

Initializes the class with the provided configuration. May return a subclass of the class, if requested by the configuration.

PARAMETER DESCRIPTION
config

A model containing configuration details for the class.

TYPE: ObjectConstructionConfig

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

RAISES DESCRIPTION
InvalidConfigError

The class can't be found or is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_config(cls, config: ObjectConstructionConfig) -> Self:
    """
    Initializes the class with the provided configuration. May return a subclass of the class,
    if requested by the configuration.

    Args:
        config: A model containing configuration details for the class.

    Returns:
        An instance of the class initialized with the provided configuration.

    Raises:
        InvalidConfigError: The class can't be found, the object resolved from `config.type`
            is not a class, or it is not a subclass of the current class.
    """
    subclass = import_by_path(config.type, cls.default_module)
    # issubclass() raises TypeError when its first argument is not a class (e.g. the path
    # points at a function or a constant), so check that first and report a config error.
    if not (isinstance(subclass, type) and issubclass(subclass, cls)):
        raise InvalidConfigError(f"{subclass} is not a subclass of {cls}")

    return subclass.from_config(config.config)

subclass_from_factory classmethod #

subclass_from_factory(factory_path: str) -> Self

Creates the class using the provided factory function. May return a subclass of the class, if requested by the factory.

PARAMETER DESCRIPTION
factory_path

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided factory function.

RAISES DESCRIPTION
InvalidConfigError

The factory can't be found or the object returned is not a subclass of the current class.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def subclass_from_factory(cls, factory_path: str) -> Self:
    """
    Builds an instance by calling the factory function found at the given path.
    The factory may return an instance of a subclass.

    Args:
        factory_path: Path to the factory function, in the format of
            "module.submodule:factory_name".

    Returns:
        The object produced by the factory.

    Raises:
        InvalidConfigError: The object returned by the factory is not an instance
            of the current class.
    """
    make_instance = import_by_path(factory_path, cls.default_module)
    instance = make_instance()
    if isinstance(instance, cls):
        return instance
    raise InvalidConfigError(f"The object returned by factory {factory_path} is not an instance of {cls}")

preferred_subclass classmethod #

preferred_subclass(config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None) -> Self

Tries to create an instance by looking at the project's component preferences, either from YAML or from the factory. Takes optional overrides for both, which take higher precedence.

PARAMETER DESCRIPTION
config

The CoreConfig instance containing preferred factory and configuration details.

TYPE: CoreConfig

factory_path_override

A string representing the path to the factory function in the format of "module.submodule:factory_name".

TYPE: str | None DEFAULT: None

yaml_path_override

A string representing the path to the YAML file containing the Ragstack instance configuration.

TYPE: Path | None DEFAULT: None

RAISES DESCRIPTION
InvalidConfigError

If the default factory or configuration can't be found.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def preferred_subclass(
    cls, config: CoreConfig, factory_path_override: str | None = None, yaml_path_override: Path | None = None
) -> Self:
    """
    Creates an instance based on the project's component preferences.

    The candidates are tried in this order and the first one found is used:
    1. the entry for this component in the YAML file at `yaml_path_override`,
    2. the factory at `factory_path_override`,
    3. the preferred factory registered in `config`,
    4. the preferred instance configuration stored in `config`.

    Args:
        config: The CoreConfig instance containing preferred factory and configuration details.
        factory_path_override: Path to a factory function, in the format of
            "module.submodule:factory_name".
        yaml_path_override: Path to a YAML file with the instance configuration.

    Returns:
        The instance created from the first preference found.

    Raises:
        NoPreferredConfigError: If none of the lookups yields a factory or a configuration.
    """
    key = cls.configuration_key

    if yaml_path_override:
        # A YAML file without an entry for this component is not an error: fall through.
        yaml_entry = get_config_from_yaml(yaml_path_override).get(key)
        if yaml_entry:
            return cls.subclass_from_config(ObjectConstructionConfig.model_validate(yaml_entry))

    # The explicit override wins over the factory registered in the project config.
    factory_path = factory_path_override or config.component_preference_factories.get(key)
    if factory_path:
        return cls.subclass_from_factory(factory_path)

    instance_config = config.preferred_instances_config.get(key)
    if not instance_config:
        raise NoPreferredConfigError(f"Could not find preferred factory or configuration for {cls.configuration_key}")
    return cls.subclass_from_config(ObjectConstructionConfig.model_validate(instance_config))

from_config classmethod #

from_config(config: dict) -> Self

Initializes the class with the provided configuration.

PARAMETER DESCRIPTION
config

A dictionary containing configuration details for the class.

TYPE: dict

RETURNS DESCRIPTION
Self

An instance of the class initialized with the provided configuration.

Source code in packages/ragbits-core/src/ragbits/core/utils/config_handling.py
@classmethod
def from_config(cls, config: dict) -> Self:
    """
    Builds an instance of the class from a configuration dictionary.

    Args:
        config: Constructor arguments for the class, keyed by parameter name.

    Returns:
        The newly created instance.
    """
    # Every dictionary entry is forwarded to the constructor as a keyword argument.
    instance = cls(**config)
    return instance

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier of this source type: the class name passed through `to_snake`.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-core/src/ragbits/core/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field exposing the class identifier of this source.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Download a file available in the given url.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

RAISES DESCRIPTION
SourceDownloadError

If the download failed.

SourceNotFoundError

If the URL is invalid.

Source code in packages/ragbits-core/src/ragbits/core/sources/web.py
async def fetch(self) -> Path:
    """
    Download the file available at the source URL into local storage.

    The file is saved in a directory named after the URL's network location.
    Its name is the URL's location and directory path with every non-word
    character replaced by an underscore, followed by the last path segment.

    Returns:
        The local path to the downloaded file.

    Raises:
        SourceDownloadError: If the server answers with a non-successful status.
        SourceNotFoundError: If the request fails with an aiohttp client error,
            or the target path turns out to be a directory.
    """
    parsed_url = urlparse(self.url)
    domain_name = parsed_url.netloc

    # Split off the last path segment; only the part before it is sanitized.
    url_path, _, file_name = f"/{domain_name}{parsed_url.path}".rpartition("/")
    normalized_url_path = re.sub(r"\W", "_", url_path) + file_name

    container_local_dir = get_local_storage_dir() / domain_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    path = container_local_dir / normalized_url_path

    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(self.url, headers=self.headers) as response:
                if not response.ok:
                    raise SourceDownloadError(url=self.url, code=response.status)
                # Stream the body to disk in 1 KiB chunks.
                with open(path, "wb") as target_file:
                    async for chunk in response.content.iter_chunked(1024):
                        target_file.write(chunk)
    except (aiohttp.ClientError, IsADirectoryError) as e:
        raise SourceNotFoundError(self.id) from e

    return path

list_sources async classmethod #

list_sources(url: str) -> Iterable[Self]

List the file under the given URL.

PARAMETER DESCRIPTION
url

The URL to the file.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the web.

Source code in packages/ragbits-core/src/ragbits/core/sources/web.py
@classmethod
async def list_sources(cls, url: str) -> Iterable[Self]:
    """
    Wrap the given URL in a single-element list of sources.

    Args:
        url: The URL to the file.

    Returns:
        The iterable of sources from the web, containing exactly one source.
    """
    # A URL addresses one file, so the listing always has a single entry.
    source = cls(url=url)
    return [source]

from_uri async classmethod #

from_uri(path: str) -> Iterable[Self]

Create WebSource instances from a URI path.

The supported URI formats: - <protocol>://<domain>/<path>/<filename>.<file_extension>

PARAMETER DESCRIPTION
path

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Iterable[Self]

The iterable of sources from the web.

Source code in packages/ragbits-core/src/ragbits/core/sources/web.py
@classmethod
async def from_uri(cls, path: str) -> Iterable[Self]:
    """
    Create WebSource instances from a URI path.

    The supported URI formats:
    - <protocol>://<domain>/<path>/<filename>.<file_extension>

    Args:
        path: The URI path in the format described above.

    Returns:
        The iterable of sources from the web, containing exactly one source.
    """
    # The URI is used as the source URL as-is; it is not parsed or validated here.
    source = cls(url=path)
    return [source]