Skip to content

Documents and Elements#

ragbits.document_search.documents.document.Document #

Bases: BaseModel

An object representing a document which is downloaded and stored locally.

local_path instance-attribute #

local_path: Path

metadata instance-attribute #

metadata: DocumentMeta

from_document_meta classmethod #

from_document_meta(document_meta: DocumentMeta, local_path: Path) -> Document

Create a document from a document metadata. Based on the document type, it will return a different object.

PARAMETER DESCRIPTION
document_meta

The document metadata.

TYPE: DocumentMeta

local_path

The local path to the document.

TYPE: Path

RETURNS DESCRIPTION
Document

The document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
    """
    Build a document object for the given metadata and local file.

    Text-based document types (Markdown and plain text) are wrapped in a
    ``TextDocument``; every other type falls back to the generic class.

    Args:
        document_meta: The document metadata.
        local_path: The local path to the document.

    Returns:
        The document.
    """
    text_types = {DocumentType.MD, DocumentType.TXT}
    if document_meta.document_type in text_types:
        return TextDocument(local_path=local_path, metadata=document_meta)
    return cls(local_path=local_path, metadata=document_meta)

ragbits.document_search.documents.document.DocumentType #

Bases: str, Enum

Document types that can be parsed.

MD class-attribute instance-attribute #

MD = 'md'

TXT class-attribute instance-attribute #

TXT = 'txt'

PDF class-attribute instance-attribute #

PDF = 'pdf'

CSV class-attribute instance-attribute #

CSV = 'csv'

DOC class-attribute instance-attribute #

DOC = 'doc'

DOCX class-attribute instance-attribute #

DOCX = 'docx'

HTML class-attribute instance-attribute #

HTML = 'html'

EPUB class-attribute instance-attribute #

EPUB = 'epub'

XLSX class-attribute instance-attribute #

XLSX = 'xlsx'

XLS class-attribute instance-attribute #

XLS = 'xls'

ORG class-attribute instance-attribute #

ORG = 'org'

ODT class-attribute instance-attribute #

ODT = 'odt'

PPT class-attribute instance-attribute #

PPT = 'ppt'

PPTX class-attribute instance-attribute #

PPTX = 'pptx'

RST class-attribute instance-attribute #

RST = 'rst'

RTF class-attribute instance-attribute #

RTF = 'rtf'

TSV class-attribute instance-attribute #

TSV = 'tsv'

JSON class-attribute instance-attribute #

JSON = 'json'

XML class-attribute instance-attribute #

XML = 'xml'

JPG class-attribute instance-attribute #

JPG = 'jpg'

PNG class-attribute instance-attribute #

PNG = 'png'

UNKNOWN class-attribute instance-attribute #

UNKNOWN = 'unknown'

ragbits.document_search.documents.document.DocumentMeta #

Bases: BaseModel

An object representing a document metadata.

document_type instance-attribute #

document_type: DocumentType

source instance-attribute #

source: Source

id property #

id: str

Get the document ID.

RETURNS DESCRIPTION
str

The document ID.

fetch async #

fetch() -> Document

This method fetches the document from source (potentially remote) and creates an object to interface with it. Based on the document type, it will return a different object.

RETURNS DESCRIPTION
Document

The document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
async def fetch(self) -> "Document":
    """
    Download the document from its (possibly remote) source and wrap it in a
    document object appropriate for the document type.

    Returns:
        The document.
    """
    # Fetching resolves the source to a local file path first.
    return Document.from_document_meta(self, await self.source.fetch())

create_text_document_from_literal classmethod #

create_text_document_from_literal(content: str) -> DocumentMeta

Create a text document from a literal content.

PARAMETER DESCRIPTION
content

The content of the document.

TYPE: str

RETURNS DESCRIPTION
DocumentMeta

The document metadata.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
    """
    Persist literal text to a temporary file and describe it as a TXT document.

    Args:
        content: The content of the document.

    Returns:
        The document metadata.
    """
    # delete=False: the file must outlive this call so it can be read later.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with temp_file:
        temp_file.write(content.encode())

    return cls(
        document_type=DocumentType.TXT,
        source=LocalFileSource(path=Path(temp_file.name)),
    )

from_local_path classmethod #

from_local_path(local_path: Path) -> DocumentMeta

Create a document metadata from a local path.

PARAMETER DESCRIPTION
local_path

The local path to the document.

TYPE: Path

RETURNS DESCRIPTION
DocumentMeta

The document metadata.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def from_local_path(cls, local_path: Path) -> "DocumentMeta":
    """
    Create a document metadata from a local path.

    The document type is inferred from the file extension; the comparison is
    case-insensitive, so e.g. ``report.PDF`` maps the same as ``report.pdf``.

    Args:
        local_path: The local path to the document.

    Returns:
        The document metadata.

    Raises:
        ValueError: If the file extension does not match any known DocumentType.
    """
    # Strip the leading dot and normalize case so ".PDF" resolves like ".pdf".
    return cls(
        document_type=DocumentType(local_path.suffix[1:].lower()),
        source=LocalFileSource(path=local_path),
    )

from_source async classmethod #

from_source(source: Source) -> DocumentMeta

Create a document metadata from a source.

PARAMETER DESCRIPTION
source

The source from which the document is fetched.

TYPE: Source

RETURNS DESCRIPTION
DocumentMeta

The document metadata.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
async def from_source(cls, source: Source) -> "DocumentMeta":
    """
    Create a document metadata from a source.

    Fetches the source to obtain a local file path, then infers the document
    type from the file extension (case-insensitively).

    Args:
        source: The source from which the document is fetched.

    Returns:
        The document metadata.

    Raises:
        ValueError: If the file extension does not match any known DocumentType.
    """
    path = await source.fetch()

    # Strip the leading dot and normalize case so ".PDF" resolves like ".pdf".
    return cls(
        document_type=DocumentType(path.suffix[1:].lower()),
        source=source,
    )

ragbits.document_search.documents.element.Element #

Bases: BaseModel, ABC

An object representing an element in a document.

element_type instance-attribute #

element_type: str

document_meta instance-attribute #

document_meta: DocumentMeta

location class-attribute instance-attribute #

location: ElementLocation | None = None

id property #

id: str

Retrieve the ID of the element, primarily used to represent the element's data.

RETURNS DESCRIPTION
str

string representing element

TYPE: str

key property #

key: str | None

Get the representation of the element for embedding.

RETURNS DESCRIPTION
str | None

The representation for embedding.

text_representation abstractmethod property #

text_representation: str | None

Get the text representation of the element.

RETURNS DESCRIPTION
str | None

The text representation.

image_representation property #

image_representation: bytes | None

Get the image representation of the element.

RETURNS DESCRIPTION
bytes | None

The image representation.

get_id_components #

get_id_components() -> dict[str, str]

Creates a dictionary of key value pairs of id components

RETURNS DESCRIPTION
dict

a dictionary

TYPE: dict[str, str]

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
def get_id_components(self) -> dict[str, str]:
    """
    Build the key/value pairs that together identify this element.

    Returns:
        dict: a dictionary mapping component names ("meta", "type", "key",
        "text", "location") to their string values.
    """
    # key, text_representation and location may be None; stringify them so
    # every component is a str.
    return {
        "meta": self.document_meta.id,
        "type": self.element_type,
        "key": str(self.key),
        "text": str(self.text_representation),
        "location": str(self.location),
    }

from_vector_db_entry classmethod #

from_vector_db_entry(db_entry: VectorStoreEntry) -> Element

Create an element from a vector database entry.

PARAMETER DESCRIPTION
db_entry

The vector database entry.

TYPE: VectorStoreEntry

RETURNS DESCRIPTION
Element

The element.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
@classmethod
def from_vector_db_entry(cls, db_entry: VectorStoreEntry) -> "Element":
    """
    Create an element from a vector database entry.

    Args:
        db_entry: The vector database entry.

    Returns:
        The element.

    Raises:
        KeyError: If the entry's element_type is not registered.
    """
    # Work on a shallow copy so the caller's entry metadata is left intact
    # (previously "embedding_type" was deleted from the entry in place).
    metadata = dict(db_entry.metadata)
    element_type = metadata["element_type"]
    element_cls = Element._elements_registry[element_type]
    # embedding_type is vector-store bookkeeping, not an Element field.
    metadata.pop("embedding_type", None)
    return element_cls(**metadata)

to_vector_db_entry #

to_vector_db_entry() -> VectorStoreEntry

Create a vector database entry from the element.

RETURNS DESCRIPTION
VectorStoreEntry

The vector database entry

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
def to_vector_db_entry(self) -> VectorStoreEntry:
    """
    Create a vector database entry from the element.

    Returns:
        The vector database entry
    """
    # The entry id is derived deterministically from the element id.
    entry_id = uuid.uuid5(uuid.NAMESPACE_OID, self.id)

    # id and key are computed properties, not stored fields — exclude them.
    metadata = self.model_dump(exclude={"id", "key"})
    metadata["document_meta"]["source"]["id"] = self.document_meta.source.id

    return VectorStoreEntry(
        id=entry_id,
        text=self.key,
        image_bytes=self.image_representation,
        metadata=metadata,
    )

ragbits.document_search.documents.sources.Source #

Bases: BaseModel, ABC

An object representing a source.

protocol class-attribute #

protocol: str | None = None

id abstractmethod property #

id: str

Get the source ID.

RETURNS DESCRIPTION
str

The source ID.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

fetch abstractmethod async #

fetch() -> Path

Load the source.

RETURNS DESCRIPTION
Path

The path to the source.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@abstractmethod
async def fetch(self) -> Path:
    """
    Load the source.

    Implementations download the source content (if remote) and return a
    local filesystem path to it.

    Returns:
        The path to the source.
    """

from_uri abstractmethod async classmethod #

from_uri(path: str) -> Sequence[Source]

Create Source instances from a URI path.

The path can contain glob patterns (asterisks) to match multiple sources, but pattern support varies by source type. Each source implementation defines which patterns it supports:

  • LocalFileSource: Supports full glob patterns ('*', '**', etc.) via Path.glob
  • GCSSource: Supports simple prefix matching with '*' at the end of path
  • HuggingFaceSource: Does not support glob patterns
PARAMETER DESCRIPTION
path

The path part of the URI (after protocol://). Pattern support depends on source type.

TYPE: str

RETURNS DESCRIPTION
Sequence[Source]

A sequence of Source objects matching the path pattern

RAISES DESCRIPTION
ValueError

If the path contains unsupported pattern for this source type

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
@abstractmethod
async def from_uri(cls, path: str) -> Sequence["Source"]:
    """Create Source instances from a URI path.

    The path can contain glob patterns (asterisks) to match multiple sources, but pattern support
    varies by source type. Each source implementation defines which patterns it supports:

    - LocalFileSource: Supports full glob patterns ('*', '**', etc.) via Path.glob
    - GCSSource: Supports simple prefix matching with '*' at the end of path
    - HuggingFaceSource: Does not support glob patterns

    Note that a single (non-pattern) URI still yields a sequence, typically
    with one element.

    Args:
        path: The path part of the URI (after protocol://). Pattern support depends on source type.

    Returns:
        A sequence of Source objects matching the path pattern

    Raises:
        ValueError: If the path contains unsupported pattern for this source type
    """

ragbits.document_search.documents.sources.AzureBlobStorageSource #

Bases: Source

An object representing an Azure Blob Storage dataset source.

protocol class-attribute #

protocol: str = 'azure'

account_name instance-attribute #

account_name: str

container_name instance-attribute #

container_name: str

blob_name instance-attribute #

blob_name: str

id property #

id: str

Get the source ID, which is the full blob URL.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

fetch async #

fetch() -> Path

Downloads the blob to a temporary local file and returns the file path.

RETURNS DESCRIPTION
Path

Path to the downloaded file.

RAISES DESCRIPTION
SourceNotFoundError

If the blob source is not available.

SourceConnectionError

If the blob service connection is not available.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/azure.py
@requires_dependencies(["azure.storage.blob", "azure.core.exceptions"], "azure")
async def fetch(self) -> Path:
    """
    Downloads the blob to a temporary local file and returns the file path.

    Returns:
        Path to the downloaded file.

    Raises:
        SourceNotFoundError: If the blob source is not available.
        SourceConnectionError: If the blob service connection is not available.
    """
    # Mirror the account/container layout under the local storage directory.
    container_local_dir = get_local_storage_dir() / self.account_name / self.container_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    path = container_local_dir / self.blob_name
    with trace(account_name=self.account_name, container=self.container_name, blob=self.blob_name) as outputs:
        try:
            blob_service = await self._get_blob_service(account_name=self.account_name)
            blob_client = blob_service.get_blob_client(container=self.container_name, blob=self.blob_name)
            # blob_name may contain slashes; ensure intermediate dirs exist.
            Path(path).parent.mkdir(parents=True, exist_ok=True)
            # NOTE(review): download_blob()/readall() appear to be synchronous
            # calls inside a coroutine — confirm whether the async Azure client
            # should be used here.
            stream = blob_client.download_blob()
            content = stream.readall()
            with open(path, "wb") as file:
                file.write(content)

        except ResourceNotFoundError as e:
            raise SourceNotFoundError(f"Blob {self.blob_name} not found in container {self.container_name}") from e
        except Exception as e:
            # Any other failure (auth, network, local I/O) is surfaced as a
            # connection error, with the original cause chained.
            raise SourceConnectionError() from e
        outputs.path = path
    return path

from_uri async classmethod #

from_uri(path: str) -> Sequence[AzureBlobStorageSource]

Parses an Azure Blob Storage URI and returns an instance of AzureBlobStorageSource.

PARAMETER DESCRIPTION
path

The Azure Blob Storage URI.

TYPE: str

RETURNS DESCRIPTION
Sequence[AzureBlobStorageSource]

Sequence["AzureBlobStorageSource"]: A sequence of AzureBlobStorageSource objects parsed from the URI.

RAISES DESCRIPTION
ValueError

If the Azure Blob Storage URI is invalid.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/azure.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["AzureBlobStorageSource"]:
    """
    Parses an Azure Blob Storage URI and returns an instance of AzureBlobStorageSource.

    Expected format: ``https://<account_name>.blob.core.windows.net/<container>/<blob>``.
    The blob part may end with a single ``*`` for prefix matching.

    Args:
        path (str): The Azure Blob Storage URI.

    Returns:
        Sequence["AzureBlobStorageSource"]: Sources parsed from the URI.

    Raises:
        ValueError: If the Azure Blob Storage URI is invalid.
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "AzureBlobStorageSource only supports '*' at the end of path. "
            "Patterns like '**' or '?' are not supported."
        )
    parsed = urlparse(path)
    if not parsed.netloc or not parsed.path:
        raise ValueError("Invalid Azure Blob Storage URI format.")

    if parsed.scheme != "https":
        raise ValueError("Invalid scheme, expected 'https://account_name.blob.core.windows.net'.")

    # Require the full ".blob.core.windows.net" suffix (with the leading dot)
    # so that hosts like "evilblob.core.windows.net" are rejected instead of
    # being treated as an account name.
    host_suffix = ".blob.core.windows.net"
    account_name = parsed.netloc.removesuffix(host_suffix)
    if not parsed.netloc.endswith(host_suffix) or not account_name:
        raise ValueError("Invalid scheme, expected 'https://account_name.blob.core.windows.net'.")

    path_parts = parsed.path.lstrip("/").split("/", 1)
    if len(path_parts) != 2:  # noqa PLR2004
        raise ValueError("URI must include both container and blob name.")

    container_name, blob_name = path_parts
    if "*" in blob_name:
        # Only a single trailing '*' is supported (prefix matching).
        if not blob_name.endswith("*") or "*" in blob_name[:-1]:
            raise ValueError(
                f"AzureBlobStorageSource only supports '*' at the end of path. Invalid pattern: {blob_name}."
            )
        blob_name = blob_name[:-1]
        return await cls.list_sources(container=container_name, blob_name=blob_name, account_name=account_name)

    # Return a single-element list (consistent with other sources)
    return [cls(account_name=account_name, container_name=container_name, blob_name=blob_name)]

list_sources async classmethod #

list_sources(account_name: str, container: str, blob_name: str = '') -> list[AzureBlobStorageSource]

List all sources in the given Azure container, matching the prefix.

PARAMETER DESCRIPTION
account_name

The Azure storage account name.

TYPE: str

container

The Azure container name.

TYPE: str

blob_name

The prefix to match.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
list[AzureBlobStorageSource]

List of source objects.

RAISES DESCRIPTION
ImportError

If the required 'azure-storage-blob' package is not installed

SourceConnectionError

If there's an error connecting to Azure

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/azure.py
@classmethod
@requires_dependencies(["azure.storage.blob"], "azure")
async def list_sources(
    cls, account_name: str, container: str, blob_name: str = ""
) -> list["AzureBlobStorageSource"]:
    """List all sources in the given Azure container, matching the prefix.

    Args:
        account_name (str): The Azure storage account name.
        container: The Azure container name.
        blob_name: The prefix to match.

    Returns:
        List of source objects.

    Raises:
        ImportError: If the required 'azure-storage-blob' package is not installed
        SourceConnectionError: If there's an error connecting to Azure
    """
    with trace(account_name=account_name, container=container, blob_name=blob_name) as outputs:
        blob_service = await cls._get_blob_service(account_name=account_name)
        try:
            container_client = blob_service.get_container_client(container)
            # Server-side prefix filter: only blobs whose names start with blob_name.
            blobs = container_client.list_blobs(name_starts_with=blob_name)
            outputs.results = [
                AzureBlobStorageSource(container_name=container, blob_name=blob.name, account_name=account_name)
                for blob in blobs
            ]
            return outputs.results
        except Exception as e:
            # Any listing failure is surfaced as a connection error, with the
            # original cause chained for debugging.
            raise SourceConnectionError() from e

ragbits.document_search.documents.sources.GCSSource #

Bases: Source

An object representing a GCS file source.

bucket instance-attribute #

bucket: str

object_name instance-attribute #

object_name: str

protocol class-attribute #

protocol: str = 'gcs'

id property #

id: str

Get unique identifier of the object in the source.

RETURNS DESCRIPTION
str

Unique identifier.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

set_storage classmethod #

set_storage(storage: Storage | None) -> None

Set the storage client for all instances.

PARAMETER DESCRIPTION
storage

The gcloud-aio-storage Storage object to use as the storage client. By default, the object will be created automatically.

TYPE: Storage | None

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@classmethod
def set_storage(cls, storage: "StorageClient | None") -> None:
    """Set the storage client for all instances.

    Args:
        storage: The `gcloud-aio-storage` `Storage` object to use as the storage client.
            By default, the object will be created automatically.
    """
    # Stored on the class, so every GCSSource instance shares one client.
    cls._storage = storage

fetch async #

fetch() -> Path

Fetch the file from Google Cloud Storage and store it locally.

The file is downloaded to a local directory specified by local_dir. If the file already exists locally, it will not be downloaded again. If the file doesn't exist locally, it will be fetched from GCS. The local directory is determined by the environment variable LOCAL_STORAGE_DIR. If this environment variable is not set, a temporary directory is used.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
ImportError

If the 'gcp' extra is not installed.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@traceable
@requires_dependencies(["gcloud.aio.storage"], "gcs")
async def fetch(self) -> Path:
    """
    Fetch the file from Google Cloud Storage and store it locally.

    The file is downloaded to a local directory specified by `local_dir`. If the file already exists locally,
    it will not be downloaded again. If the file doesn't exist locally, it will be fetched from GCS.
    The local directory is determined by the environment variable `LOCAL_STORAGE_DIR`. If this environment
    variable is not set, a temporary directory is used.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        ImportError: If the 'gcp' extra is not installed.
    """
    local_dir = get_local_storage_dir()
    bucket_local_dir = local_dir / self.bucket
    bucket_local_dir.mkdir(parents=True, exist_ok=True)
    path = bucket_local_dir / self.object_name
    with trace(bucket=self.bucket, object=self.object_name) as outputs:
        # Cache hit: skip the download when a previous fetch already stored the file.
        if not path.is_file():
            storage = await self._get_storage()
            async with storage as client:
                content = await client.download(self.bucket, self.object_name)
                # object_name may contain slashes; create intermediate dirs first.
                Path(bucket_local_dir / self.object_name).parent.mkdir(parents=True, exist_ok=True)
                with open(path, mode="wb+") as file_object:
                    file_object.write(content)
        outputs.path = path
    return path

list_sources async classmethod #

list_sources(bucket: str, prefix: str = '') -> list[GCSSource]

List all sources in the given GCS bucket, matching the prefix.

PARAMETER DESCRIPTION
bucket

The GCS bucket.

TYPE: str

prefix

The prefix to match.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
list[GCSSource]

List of source objects.

RAISES DESCRIPTION
ImportError

If the required 'gcloud-aio-storage' package is not installed

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@classmethod
@requires_dependencies(["gcloud.aio.storage"], "gcs")
async def list_sources(cls, bucket: str, prefix: str = "") -> list["GCSSource"]:
    """List all sources in the given GCS bucket, matching the prefix.

    Args:
        bucket: The GCS bucket.
        prefix: The prefix to match.

    Returns:
        List of source objects.

    Raises:
        ImportError: If the required 'gcloud-aio-storage' package is not installed
    """
    with trace() as outputs:
        async with await cls._get_storage() as storage:
            result = await storage.list_objects(bucket, params={"prefix": prefix})
            items = result.get("items", [])
            # Names ending with "/" are folder placeholder objects, not files — skip them.
            outputs.results = [
                cls(bucket=bucket, object_name=item["name"]) for item in items if not item["name"].endswith("/")
            ]
            return outputs.results

from_uri async classmethod #

from_uri(path: str) -> Sequence[GCSSource]

Create GCSSource instances from a URI path.

Supports simple prefix matching with '*' at the end of path. For example: - "bucket/folder/*" - matches all files in the folder - "bucket/folder/prefix*" - matches all files starting with prefix

More complex patterns like '**' or '?' are not supported.

PARAMETER DESCRIPTION
path

The path part of the URI (after gcs://). Can end with '*' for pattern matching.

TYPE: str

RETURNS DESCRIPTION
Sequence[GCSSource]

A sequence of GCSSource objects matching the pattern

RAISES DESCRIPTION
ValueError

If an unsupported pattern is used

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["GCSSource"]:
    """Create GCSSource instances from a URI path.

    Supports simple prefix matching with '*' at the end of path.
    For example:
    - "bucket/folder/*" - matches all files in the folder
    - "bucket/folder/prefix*" - matches all files starting with prefix

    More complex patterns like '**' or '?' are not supported.

    Args:
        path: The path part of the URI (after gcs://). Can end with '*' for pattern matching.

    Returns:
        A sequence of GCSSource objects matching the pattern

    Raises:
        ValueError: If an unsupported pattern is used
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "GCSSource only supports '*' at the end of path. Patterns like '**' or '?' are not supported."
        )

    # Split into bucket and prefix
    bucket, prefix = path.split("/", 1) if "/" in path else (path, "")

    if "*" in prefix:
        # Only a single trailing '*' is supported; reject interior wildcards
        # (e.g. "a*b*") instead of silently listing with a literal '*' in the
        # prefix. This matches AzureBlobStorageSource's validation.
        if not prefix.endswith("*") or "*" in prefix[:-1]:
            raise ValueError(f"GCSSource only supports '*' at the end of path. Invalid pattern: {prefix}")
        # Remove the trailing * for GCS prefix listing
        prefix = prefix[:-1]
        return await cls.list_sources(bucket=bucket, prefix=prefix)

    return [cls(bucket=bucket, object_name=prefix)]

ragbits.document_search.documents.sources.GitSource #

Bases: Source

An object representing a file in a Git repository.

repo_url instance-attribute #

repo_url: str

file_path instance-attribute #

file_path: str

branch class-attribute instance-attribute #

branch: str | None = None

protocol class-attribute #

protocol: str = 'git'

id property #

id: str

Get the source ID, which is a unique identifier of the object.

RETURNS DESCRIPTION
str

The source ID.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

fetch async #

fetch() -> Path

Clone the Git repository and return the path to the specific file.

RETURNS DESCRIPTION
Path

The local path to the specific file in the cloned repository.

RAISES DESCRIPTION
SourceNotFoundError

If the repository cannot be cloned or the file doesn't exist.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/git.py
@requires_dependencies(["git"])
@traceable
async def fetch(self) -> Path:
    """
    Clone the Git repository and return the path to the specific file.

    Returns:
        The local path to the specific file in the cloned repository.

    Raises:
        SourceNotFoundError: If the repository cannot be cloned or the file doesn't exist.
    """
    # _ensure_repo clones the repository into repo_dir if it is not already
    # there (presumably updating an existing clone — confirm in its impl).
    repo_dir = self._get_repo_dir(self.repo_url, self.branch)
    self._ensure_repo(self.repo_url, repo_dir, self.branch)

    # Check if the file exists in the repository
    file_path = repo_dir / self.file_path
    if not file_path.exists() or not file_path.is_file():
        raise SourceNotFoundError(f"File {self.file_path} not found in repository")

    return file_path

list_sources async classmethod #

list_sources(repo_url: str, file_pattern: str = '**/*', branch: str | None = None) -> list[GitSource]

List all files in the repository matching the pattern.

PARAMETER DESCRIPTION
repo_url

URL of the git repository.

TYPE: str

file_pattern

The glob pattern to match files.

TYPE: str DEFAULT: '**/*'

branch

Optional branch name.

TYPE: str | None DEFAULT: None

RETURNS DESCRIPTION
list[GitSource]

List of GitSource objects, one for each file matching the pattern.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/git.py
@classmethod
@traceable
async def list_sources(
    cls, repo_url: str, file_pattern: str = "**/*", branch: str | None = None
) -> list["GitSource"]:
    """
    List all files in the repository matching the pattern.

    Args:
        repo_url: URL of the git repository.
        file_pattern: The glob pattern to match files.
        branch: Optional branch name.

    Returns:
        List of GitSource objects, one for each file matching the pattern.
    """
    repo_dir = cls._get_repo_dir(repo_url, branch)
    cls._ensure_repo(repo_url, repo_dir, branch)

    # One source per regular file; paths are stored relative to the repo root.
    return [
        cls(repo_url=repo_url, file_path=str(candidate.relative_to(repo_dir)), branch=branch)
        for candidate in repo_dir.glob(file_pattern)
        if candidate.is_file()
    ]

from_uri async classmethod #

from_uri(uri: str) -> Sequence[GitSource]

Create GitSource instances from a URI path.

Supported URI formats: - git://https://github.com/username/repo.git:path/to/file.txt - git://https://github.com/username/repo.git:branch:path/to/file.txt - git@github.com:username/repo.git:path/to/file.txt - git@github.com:username/repo.git:branch:path/to/file.txt

PARAMETER DESCRIPTION
uri

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Sequence[GitSource]

A sequence containing a GitSource instance.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/git.py
@classmethod
@traceable
async def from_uri(cls, uri: str) -> Sequence["GitSource"]:
    """
    Create GitSource instances from a URI path.

    Supported URI formats:
    - git://https://github.com/username/repo.git:path/to/file.txt
    - git://https://github.com/username/repo.git:branch:path/to/file.txt
    - git@github.com:username/repo.git:path/to/file.txt
    - git@github.com:username/repo.git:branch:path/to/file.txt

    Args:
        uri: The URI path in the format described above.

    Returns:
        A sequence containing a GitSource instance. An empty sequence is
        returned when the URI does not match any recognized format.
    """
    # Check if URI starts with git:// protocol
    if uri.startswith("git://"):
        uri = uri[6:]  # Remove the git:// prefix

    # Parsing works on colon-separated segments; the module-level constants
    # _REPO_AND_FILE_PARTS / _MIN_PARTS_WITH_PROTOCOL (defined elsewhere in
    # this file) encode the expected segment counts.
    parts = uri.split(":")
    sources = []

    if len(parts) == _REPO_AND_FILE_PARTS:
        # Repo URL and file path
        sources.append(cls(repo_url=parts[0], file_path=parts[1]))
    elif len(parts) >= _MIN_PARTS_WITH_PROTOCOL:
        # Handle SSH format (git@github.com:username/repo.git)
        if parts[0].startswith("git@"):
            repo_url = f"{parts[0]}:{parts[1]}"  # Reconstruct full SSH URL
            # With exactly the minimum segment count there is no branch segment.
            file_path = parts[2] if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[3]
            branch = None if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[2]
            sources.append(cls(repo_url=repo_url, file_path=file_path, branch=branch))
        # Handle HTTPS format
        elif parts[0] in ["http", "https"]:
            # The scheme was split off by ':'; re-join it with the host part.
            repo_url = f"{parts[0]}:{parts[1]}"
            file_path = parts[2] if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[3]
            branch = None if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[2]
            sources.append(cls(repo_url=repo_url, file_path=file_path, branch=branch))
        else:
            # Repo URL, branch, and file path in standard format
            sources.append(cls(repo_url=parts[0], branch=parts[1], file_path=parts[2]))

    # NOTE(review): file paths containing ':' are not handled and will be
    # mis-split by the parsing above — confirm whether that case can occur.
    return sources

ragbits.document_search.documents.sources.HuggingFaceSource #

Bases: Source

An object representing a Hugging Face dataset source.

path instance-attribute #

path: str

split class-attribute instance-attribute #

split: str = 'train'

row instance-attribute #

row: int

protocol class-attribute #

protocol: str = 'huggingface'

id property #

id: str

Get unique identifier of the object in the source.

RETURNS DESCRIPTION
str

Unique identifier.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier for this source type: the class name in snake_case.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field exposing the class identifier of this source.
    """
    return type(self).class_identifier()

fetch async #

fetch() -> Path

Fetch the file from Hugging Face and store it locally.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
ImportError

If the 'huggingface' extra is not installed.

SourceConnectionError

If the source connection fails.

SourceNotFoundError

If the source document is not found.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/hf.py
@traceable
@requires_dependencies(["datasets"], "huggingface")
async def fetch(self) -> Path:
    """
    Fetch the file from Hugging Face and store it locally.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        ImportError: If the 'huggingface' extra is not installed.
        SourceConnectionError: If the source connection fails.
        SourceNotFoundError: If the source document is not found.
    """
    with trace(path=self.path, split=self.split, row=self.row) as outputs:
        try:
            # Stream the dataset so only the requested row is pulled instead
            # of downloading the entire split.
            dataset = load_dataset(self.path, split=self.split, streaming=True)  # type: ignore
        except ConnectionError as exc:
            raise SourceConnectionError() from exc
        except DatasetNotFoundError as exc:  # type: ignore
            raise SourceNotFoundError(source_id=self.id) from exc

        try:
            # Advance the stream to `row` and take exactly one record;
            # StopIteration means the index is past the end of the split.
            data = next(iter(dataset.skip(self.row).take(1)))  # type: ignore
        except StopIteration as exc:
            raise SourceNotFoundError(source_id=self.id) from exc

        # Assumes each row carries a "source" (relative file path) and a
        # "content" (text) column -- TODO confirm against the dataset schema.
        storage_dir = get_local_storage_dir()
        source_dir = storage_dir / Path(data["source"]).parent
        source_dir.mkdir(parents=True, exist_ok=True)
        path = storage_dir / data["source"]

        # A previously written file acts as a cache; write only when absent.
        if not path.is_file():
            with open(path, mode="w", encoding="utf-8") as file:
                file.write(data["content"])
        outputs.path = path
        return path

from_uri async classmethod #

from_uri(path: str) -> Sequence[HuggingFaceSource]

Create HuggingFaceSource instances from a URI path.

Pattern matching is not supported. The path must be in the format: huggingface://dataset_path/split/row

PARAMETER DESCRIPTION
path

The path part of the URI (after huggingface://)

TYPE: str

RETURNS DESCRIPTION
Sequence[HuggingFaceSource]

A sequence containing a single HuggingFaceSource

RAISES DESCRIPTION
ValueError

If the path contains patterns or has invalid format

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/hf.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["HuggingFaceSource"]:
    """Create HuggingFaceSource instances from a URI path.

    Pattern matching is not supported. The path must be in the format:
    huggingface://dataset_path/split/row
    where ``dataset_path`` may itself contain slashes, as in the common
    "org/dataset" naming used on the Hugging Face Hub.

    Args:
        path: The path part of the URI (after huggingface://)

    Returns:
        A sequence containing a single HuggingFaceSource

    Raises:
        ValueError: If the path contains patterns or has invalid format
    """
    if "*" in path or "?" in path:
        raise ValueError(
            "HuggingFaceSource does not support patterns. Path must be in format: dataset_path/split/row"
        )

    try:
        # Split from the right so dataset paths like "org/dataset" keep their
        # slash: the last two segments are always the split and the row index.
        # (A plain split("/") rejected every org-qualified dataset name.)
        dataset_path, split, row = path.rsplit("/", 2)
        return [cls(path=dataset_path, split=split, row=int(row))]
    except ValueError as err:
        # Raised both by a too-short path (unpacking) and a non-integer row.
        raise ValueError("Invalid HuggingFace path format. Expected: dataset_path/split/row") from err

list_sources async classmethod #

list_sources(path: str, split: str) -> list[HuggingFaceSource]

List all sources in the given Hugging Face repository.

PARAMETER DESCRIPTION
path

Path or name of the dataset.

TYPE: str

split

Dataset split.

TYPE: str

RETURNS DESCRIPTION
list[HuggingFaceSource]

List of source objects.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/hf.py
@classmethod
@traceable
async def list_sources(cls, path: str, split: str) -> list["HuggingFaceSource"]:
    """
    List all sources in the given Hugging Face repository.

    Args:
        path: Path or name of the dataset.
        split: Dataset split.

    Returns:
        List of source objects.
    """
    dataset = load_dataset(path, split=split)  # type: ignore
    # Drop slice notation (e.g. "train[:100]") so each stored source
    # references the plain split name.
    plain_split = re.sub(r"\[.*?\]", "", split)
    row_count = len(dataset)  # type: ignore
    return [cls(path=path, split=plain_split, row=index) for index in range(row_count)]

ragbits.document_search.documents.sources.LocalFileSource #

Bases: Source

An object representing a local file source.

path instance-attribute #

path: Path

protocol class-attribute #

protocol: str = 'file'

id property #

id: str

Get unique identifier of the object in the source.

RETURNS DESCRIPTION
str

Unique identifier.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Identifier of the source type, derived from the class name in snake_case.
    """
    name = cls.__name__
    return to_snake(name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field that mirrors the class identifier.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Fetch the source.

RETURNS DESCRIPTION
Path

The local path to the object fetched from the source.

RAISES DESCRIPTION
SourceNotFoundError

If the source document is not found.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/local.py
@traceable
async def fetch(self) -> Path:
    """
    Fetch the source.

    Returns:
        The local path to the object fetched from the source.

    Raises:
        SourceNotFoundError: If the source document is not found.
    """
    # A local source needs no download -- just verify the file exists.
    if self.path.is_file():
        return self.path
    raise SourceNotFoundError(source_id=self.id)

list_sources classmethod #

list_sources(path: Path, file_pattern: str = '*') -> list[LocalFileSource]

List all sources in the given directory, matching the file pattern.

PARAMETER DESCRIPTION
path

The path to the directory.

TYPE: Path

file_pattern

The file pattern to match.

TYPE: str DEFAULT: '*'

RETURNS DESCRIPTION
list[LocalFileSource]

List of source objects.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/local.py
@classmethod
@traceable
def list_sources(cls, path: Path, file_pattern: str = "*") -> list["LocalFileSource"]:
    """
    List all sources in the given directory, matching the file pattern.

    Args:
        path: The path to the directory.
        file_pattern: The file pattern to match.

    Returns:
        List of source objects.
    """
    matches = path.glob(file_pattern)
    return [cls(path=match) for match in matches]

from_uri async classmethod #

from_uri(path: str) -> Sequence[LocalFileSource]

Create LocalFileSource instances from a URI path.

Supports full glob patterns via Path.glob: - "**/*.txt" - all .txt files in any subdirectory - "*.py" - all Python files in the current directory - "**/*" - all files in any subdirectory - '?' matches exactly one character

PARAMETER DESCRIPTION
path

The path part of the URI (after file://). Pattern support depends on source type.

TYPE: str

RETURNS DESCRIPTION
Sequence[LocalFileSource]

A sequence of LocalFileSource objects

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/local.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["LocalFileSource"]:
    """Create LocalFileSource instances from a URI path.

    Supports full glob patterns via Path.glob:
    - "**/*.txt" - all .txt files in any subdirectory
    - "*.py" - all Python files in the current directory
    - "**/*" - all files in any subdirectory
    - '?' matches exactly one character

    Args:
        path: The path part of the URI (after file://). Pattern support depends on source type.

    Returns:
        A sequence of LocalFileSource objects
    """
    root, glob_pattern = cls._split_path_and_pattern(path=Path(path))
    # A direct hit on a single file needs no globbing.
    if root.is_file():
        return [cls(path=root)]
    if glob_pattern:
        return [cls(path=entry) for entry in root.glob(glob_pattern) if entry.is_file()]
    return []

ragbits.document_search.documents.sources.S3Source #

Bases: Source

An object representing an AWS S3 Storage dataset source.

protocol class-attribute #

protocol: str = 's3'

bucket_name instance-attribute #

bucket_name: str

key instance-attribute #

key: str

id property #

id: str

Get the source ID, which is the full URL to the file in s3.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Snake_case identifier for this source type, taken from the class name.
    """
    source_class = cls.__name__
    return to_snake(source_class)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field derived from the class identifier.
    """
    return type(self).class_identifier()

fetch async #

fetch() -> Path

Download a file in the given bucket_name with the given key.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
ClientError

If the file doesn't exist or credentials are incomplete.

NoCredentialsError

If no credentials are available.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/s3.py
@requires_dependencies(["boto3"], "s3")
async def fetch(self) -> Path:
    """
    Download a file in the given bucket_name with the given key.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        FileNotFoundError: If the object does not exist (HTTP 404).
        PermissionError: If access to the object is denied (HTTP 403).
        RuntimeError: If the client cannot be initialized or another S3 client error occurs.
        ValueError: If AWS credentials are missing or invalid.
    """
    # Lazily initialize the shared S3 client on first use.
    if self._s3_client is None:
        self._set_client(self.bucket_name)

    if self._s3_client is None:
        raise RuntimeError("S3 client is not initialized.")

    # Mirror the bucket under the local storage dir; the key's slashes are
    # flattened to underscores so the object is stored as a single file name.
    # NOTE(review): keys "a/b" and "a_b" map to the same local file --
    # confirm collisions are acceptable here.
    local_dir = get_local_storage_dir()
    container_local_dir = local_dir / self.bucket_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    normalized_key = self.key.replace("/", "_")
    path = container_local_dir / normalized_key
    with trace(bucket=self.bucket_name, key=self.key) as outputs:
        try:
            self._s3_client.download_file(self.bucket_name, self.key, path)
        except ClientError as e:
            # Translate the generic ClientError into a specific exception
            # based on the HTTP status code returned by S3.
            if e.response["Error"]["Code"] == "404":
                raise FileNotFoundError(f"The object does not exist: {self.key}") from e
            elif e.response["Error"]["Code"] == "403":
                raise PermissionError(f"Access denied. No permission to download: {self.key}") from e
            else:
                raise RuntimeError(f"S3 Client Error: {e}") from e
        except (NoCredentialsError, PartialCredentialsError) as e:
            raise ValueError("AWS credentials are missing or invalid.") from e
        outputs.path = path
    return path

list_sources async classmethod #

list_sources(bucket_name: str, prefix: str) -> Sequence[S3Source]

List all files under the given bucket name and with the given prefix.

PARAMETER DESCRIPTION
bucket_name

The name of the S3 bucket to use.

TYPE: str

prefix

The path to the files and prefix to look for.

TYPE: str

RETURNS DESCRIPTION
Sequence

The Sequence of AWS S3 sources.

TYPE: Sequence[S3Source]

RAISES DESCRIPTION
ClientError

If the source doesn't exist.

NoCredentialsError

If no credentials are available.

PartialCredentialsError

If credentials are incomplete.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/s3.py
@classmethod
@requires_dependencies(["boto3"], "s3")
async def list_sources(cls, bucket_name: str, prefix: str) -> Sequence["S3Source"]:
    """
    List all files under the given bucket name and with the given prefix.

    Args:
        bucket_name: The name of the S3 bucket to use.
        prefix: The path to the files and prefix to look for.

    Returns:
        Sequence: The Sequence of AWS S3 sources.

    Raises:
        ClientError: If the source doesn't exist.
        NoCredentialsError: If no credentials are available.
        PartialCredentialsError: If credentials are incomplete.
    """
    cls._set_client(bucket_name)
    if cls._s3_client is None:
        raise RuntimeError("S3 client is not initialized.")
    with trace(bucket=bucket_name, key=prefix) as outputs:
        try:
            # Paginate so buckets with more than one page of objects are
            # listed completely.
            paginator = cls._s3_client.get_paginator("list_objects_v2")
            sources = [
                cls(bucket_name=bucket_name, key=entry["Key"])
                for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
                for entry in page.get("Contents", [])
            ]
            outputs.sources = sources
            return sources
        except (NoCredentialsError, PartialCredentialsError) as e:
            raise ValueError("AWS credentials are missing or incomplete. Please configure them.") from e
        except ClientError as e:
            raise RuntimeError(f"Failed to list files in bucket {bucket_name}: {e}") from e

from_uri async classmethod #

from_uri(path: str) -> Sequence[S3Source]

Create S3Source instances from a URI path. The supported path formats are: s3://&lt;bucket-name&gt;/&lt;key&gt;, https://&lt;bucket-name&gt;.s3.&lt;region&gt;.amazonaws.com/&lt;key&gt;, https://s3.&lt;region&gt;.amazonaws.com/&lt;bucket-name&gt;/&lt;key&gt;. Pattern matching is supported only with '*'.

PARAMETER DESCRIPTION
path

The URI path.

TYPE: str

RETURNS DESCRIPTION
Sequence[S3Source]

A sequence containing a S3Source instances.

RAISES DESCRIPTION
ValueError

If the path has invalid format

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/s3.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["S3Source"]:
    """
    Create S3Source instances from a URI path.
    The supported path formats are:
    s3://<bucket-name>/<key>
    https://<bucket-name>.s3.<region>.amazonaws.com/<key>
    https://s3.<region>.amazonaws.com/<bucket-name>/<key>
    Pattern matching is supported only with '*'.

    Args:
        path: The URI path.

    Returns:
        A sequence containing a S3Source instances.

    Raises:
        ValueError: If the path has invalid format

    """
    # Only a single trailing '*' can be mapped onto the S3 API (prefix
    # listing); richer glob syntax is rejected up front.
    if "**" in path or "?" in path:
        raise ValueError(
            "S3Source only supports '*' at the end of path. Patterns like '**' or '?' are not supported."
        )

    parsed = urlparse(path)
    if not parsed.netloc or not parsed.path:
        raise ValueError("Invalid AWS Source URI format.")
    if parsed.scheme not in {"s3", "https"}:
        raise ValueError("Invalid AWS Source URI format.")

    if parsed.scheme == "s3":
        # s3://<bucket>/<key>: the netloc is the bucket name.
        bucket_name = parsed.netloc
        path_to_file = parsed.path.lstrip("/")
    elif parsed.scheme == "https":
        if not parsed.netloc.endswith("amazonaws.com"):
            raise ValueError("Invalid AWS Source URI format.")
        elif parsed.netloc.startswith("s3"):
            # Path-style URL: https://s3.<region>.amazonaws.com/<bucket>/<key>
            parts = parsed.path.split("/")
            bucket_name = parts[1]
            path_to_file = "/".join(parts[2:])
        else:
            # Virtual-hosted-style URL: the bucket is the first host label.
            # NOTE(review): a bucket whose own name starts with "s3" would be
            # misparsed as path-style by the branch above -- confirm.
            bucket_name = parsed.netloc.split(".")[0]
            path_to_file = parsed.path.lstrip("/")

    else:
        # Unreachable: scheme was already restricted to {"s3", "https"} above.
        raise ValueError("Invalid AWS Source URI format.")

    if "*" in path_to_file:
        # The wildcard must be a single '*' at the very end; the remainder is
        # treated as a key prefix and delegated to list_sources.
        if not path_to_file.endswith("*") or "*" in path_to_file[:-1]:
            raise ValueError(f"AWS Source only supports '*' at the end of path. Invalid pattern: {[path_to_file]}.")
        path_to_file = path_to_file[:-1]
        return await cls.list_sources(bucket_name=bucket_name, prefix=path_to_file)

    return [cls(bucket_name=bucket_name, key=path_to_file)]

ragbits.document_search.documents.sources.WebSource #

Bases: Source

An object representing a Web dataset source.

url instance-attribute #

url: str

headers class-attribute instance-attribute #

headers: dict[str, str] | None = None

protocol class-attribute #

protocol: str = 'https'

id property #

id: str

Get the source ID, which is a unique identifier of the object.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return this source type's identifier: the snake_case form of the class name.
    """
    raw_name = cls.__name__
    return to_snake(raw_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field whose value is the class identifier.
    """
    source_identifier = self.class_identifier()
    return source_identifier

fetch async #

fetch() -> Path

Download a file available in the given url.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
WebDownloadError

If the download failed.

SourceNotFoundError

If the URL is invalid.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/web.py
@requires_dependencies(["aiohttp"])
async def fetch(self) -> Path:
    """
    Download a file available in the given url.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        WebDownloadError: If the download failed.
        SourceNotFoundError: If the URL is invalid.
    """
    # Derive a flat local file name: everything before the last "/" has its
    # non-word characters replaced with "_", then the original file name is
    # appended, e.g. "https://a.com/x/y/f.txt" -> "_a_com_x_yf.txt".
    parsed_url = urlparse(self.url)
    url_path, file_name = ("/" + parsed_url.netloc + parsed_url.path).rsplit("/", 1)
    normalized_url_path = re.sub(r"\W", "_", url_path) + file_name
    domain_name = parsed_url.netloc

    # Downloads are grouped per domain under the local storage directory.
    local_dir = get_local_storage_dir()
    container_local_dir = local_dir / domain_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    path = container_local_dir / normalized_url_path

    try:
        async with aiohttp.ClientSession() as session, session.get(self.url, headers=self.headers) as response:
            if response.ok:
                # Stream the body to disk in 1 KiB chunks to bound memory use.
                with open(path, "wb") as f:
                    async for chunk in response.content.iter_chunked(1024):
                        f.write(chunk)
            else:
                raise WebDownloadError(url=self.url, code=response.status)
    except (aiohttp.ClientError, IsADirectoryError) as e:
        # IsADirectoryError occurs when the URL ends with "/" and the derived
        # file name is empty, making `path` point at the directory itself.
        raise SourceNotFoundError(self.id) from e

    return path

list_sources async classmethod #

list_sources(url: str) -> Sequence[WebSource]

List the file under the given URL.

PARAMETER DESCRIPTION
url

The URL to the file.

TYPE: str

RETURNS DESCRIPTION
Sequence

The Sequence with Web source.

TYPE: Sequence[WebSource]

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/web.py
@classmethod
async def list_sources(cls, url: str) -> Sequence["WebSource"]:
    """
    List the file under the given URL.

    Args:
        url: The URL to the file.

    Returns:
        Sequence: The Sequence with Web source.
    """
    # A URL points at exactly one file, so the listing is a single source.
    single_source = cls(url=url)
    return [single_source]

from_uri async classmethod #

from_uri(uri: str) -> Sequence[WebSource]

Create WebSource instances from a URI path. The supported uri format is: &lt;protocol&gt;://&lt;domain&gt;/&lt;path&gt;/&lt;filename&gt;.&lt;file_extension&gt;

PARAMETER DESCRIPTION
uri

The URI path. Needs to include the protocol.

TYPE: str

RETURNS DESCRIPTION
Sequence[WebSource]

A sequence containing a WebSource instance.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/web.py
@classmethod
async def from_uri(cls, uri: str) -> Sequence["WebSource"]:
    """
    Create WebSource instances from a URI path.
    The supported uri format is:
    <protocol>://<domain>/<path>/<filename>.<file_extension>

    Args:
        uri: The URI path. Needs to include the protocol.

    Returns:
        A sequence containing a WebSource instance.
    """
    # The URI is used verbatim as the source URL; no pattern expansion.
    source = cls(url=uri)
    return [source]