Skip to content

Documents and Elements#

ragbits.document_search.documents.document.Document #

Bases: BaseModel

An object representing a document which is downloaded and stored locally.

local_path instance-attribute #

local_path: Path

metadata instance-attribute #

metadata: DocumentMeta

from_document_meta classmethod #

from_document_meta(document_meta: DocumentMeta, local_path: Path) -> Document

Create a document from a document metadata. Based on the document type, it will return a different object.

PARAMETER DESCRIPTION
document_meta

The document metadata.

TYPE: DocumentMeta

local_path

The local path to the document.

TYPE: Path

RETURNS DESCRIPTION
Document

The document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def from_document_meta(cls, document_meta: DocumentMeta, local_path: Path) -> "Document":
    """
    Build a document object for the given metadata and local file.

    Text-based document types (Markdown and plain text) are wrapped in a
    ``TextDocument``; every other type falls back to the generic class.

    Args:
        document_meta: The document metadata.
        local_path: The local path to the document.

    Returns:
        The document.
    """
    text_types = {DocumentType.MD, DocumentType.TXT}
    if document_meta.document_type in text_types:
        return TextDocument(local_path=local_path, metadata=document_meta)
    return cls(local_path=local_path, metadata=document_meta)

ragbits.document_search.documents.document.DocumentType #

Bases: str, Enum

Document types that can be parsed.

MD class-attribute instance-attribute #

MD = 'md'

TXT class-attribute instance-attribute #

TXT = 'txt'

PDF class-attribute instance-attribute #

PDF = 'pdf'

CSV class-attribute instance-attribute #

CSV = 'csv'

DOC class-attribute instance-attribute #

DOC = 'doc'

DOCX class-attribute instance-attribute #

DOCX = 'docx'

HTML class-attribute instance-attribute #

HTML = 'html'

EPUB class-attribute instance-attribute #

EPUB = 'epub'

XLSX class-attribute instance-attribute #

XLSX = 'xlsx'

XLS class-attribute instance-attribute #

XLS = 'xls'

ORG class-attribute instance-attribute #

ORG = 'org'

ODT class-attribute instance-attribute #

ODT = 'odt'

PPT class-attribute instance-attribute #

PPT = 'ppt'

PPTX class-attribute instance-attribute #

PPTX = 'pptx'

RST class-attribute instance-attribute #

RST = 'rst'

RTF class-attribute instance-attribute #

RTF = 'rtf'

TSV class-attribute instance-attribute #

TSV = 'tsv'

JSON class-attribute instance-attribute #

JSON = 'json'

XML class-attribute instance-attribute #

XML = 'xml'

JPG class-attribute instance-attribute #

JPG = 'jpg'

PNG class-attribute instance-attribute #

PNG = 'png'

UNKNOWN class-attribute instance-attribute #

UNKNOWN = 'unknown'

ragbits.document_search.documents.document.DocumentMeta #

Bases: BaseModel

An object representing a document metadata.

document_type instance-attribute #

document_type: DocumentType

source instance-attribute #

source: Source

id property #

id: str

Get the document ID.

RETURNS DESCRIPTION
str

The document ID.

fetch async #

fetch() -> Document

This method fetches the document from source (potentially remote) and creates an object to interface with it. Based on the document type, it will return a different object.

RETURNS DESCRIPTION
Document

The document.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
async def fetch(self) -> "Document":
    """
    Download the document from its (possibly remote) source and wrap it in a
    document object appropriate for the document type.

    Returns:
        The document.
    """
    # Fetching resolves the source to a local file path first.
    return Document.from_document_meta(self, await self.source.fetch())

create_text_document_from_literal classmethod #

create_text_document_from_literal(content: str) -> DocumentMeta

Create a text document from a literal content.

PARAMETER DESCRIPTION
content

The content of the document.

TYPE: str

RETURNS DESCRIPTION
DocumentMeta

The document metadata.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def create_text_document_from_literal(cls, content: str) -> "DocumentMeta":
    """
    Persist literal text to a temporary file and describe it as a TXT document.

    Args:
        content: The content of the document.

    Returns:
        The document metadata.
    """
    # delete=False: the file must outlive this call so it can be read later.
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    with temp_file:
        temp_file.write(content.encode())

    return cls(
        document_type=DocumentType.TXT,
        source=LocalFileSource(path=Path(temp_file.name)),
    )

from_local_path classmethod #

from_local_path(local_path: Path) -> DocumentMeta

Create a document metadata from a local path.

PARAMETER DESCRIPTION
local_path

The local path to the document.

TYPE: Path

RETURNS DESCRIPTION
DocumentMeta

The document metadata.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
def from_local_path(cls, local_path: Path) -> "DocumentMeta":
    """
    Create a document metadata from a local path.

    The document type is inferred from the file extension; the comparison is
    case-insensitive, so e.g. ``report.PDF`` maps the same as ``report.pdf``.

    Args:
        local_path: The local path to the document.

    Returns:
        The document metadata.

    Raises:
        ValueError: If the file extension does not match any known DocumentType.
    """
    # Strip the leading dot and normalize case so ".PDF" resolves like ".pdf".
    return cls(
        document_type=DocumentType(local_path.suffix[1:].lower()),
        source=LocalFileSource(path=local_path),
    )

from_source async classmethod #

from_source(source: Source) -> DocumentMeta

Create a document metadata from a source.

PARAMETER DESCRIPTION
source

The source from which the document is fetched.

TYPE: Source

RETURNS DESCRIPTION
DocumentMeta

The document metadata.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/document.py
@classmethod
async def from_source(cls, source: Source) -> "DocumentMeta":
    """
    Create a document metadata from a source.

    Fetches the source to obtain a local file path, then infers the document
    type from the file extension (case-insensitively).

    Args:
        source: The source from which the document is fetched.

    Returns:
        The document metadata.

    Raises:
        ValueError: If the file extension does not match any known DocumentType.
    """
    path = await source.fetch()

    # Strip the leading dot and normalize case so ".PDF" resolves like ".pdf".
    return cls(
        document_type=DocumentType(path.suffix[1:].lower()),
        source=source,
    )

ragbits.document_search.documents.element.Element #

Bases: BaseModel, ABC

An object representing an element in a document.

element_type instance-attribute #

element_type: str

document_meta instance-attribute #

document_meta: DocumentMeta

location class-attribute instance-attribute #

location: ElementLocation | None = None

id property #

id: str

Retrieve the ID of the element, primarily used to represent the element's data.

RETURNS DESCRIPTION
str

string representing element

TYPE: str

key property #

key: str | None

Get the representation of the element for embedding.

RETURNS DESCRIPTION
str | None

The representation for embedding.

text_representation abstractmethod property #

text_representation: str | None

Get the text representation of the element.

RETURNS DESCRIPTION
str | None

The text representation.

image_representation property #

image_representation: bytes | None

Get the image representation of the element.

RETURNS DESCRIPTION
bytes | None

The image representation.

get_id_components #

get_id_components() -> dict[str, str]

Creates a dictionary of key value pairs of id components

RETURNS DESCRIPTION
dict

a dictionary

TYPE: dict[str, str]

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
def get_id_components(self) -> dict[str, str]:
    """
    Build the key/value pairs that together identify this element.

    Returns:
        dict: a dictionary mapping component names ("meta", "type", "key",
        "text", "location") to their string values.
    """
    # key, text_representation and location may be None; stringify them so
    # every component is a str.
    return {
        "meta": self.document_meta.id,
        "type": self.element_type,
        "key": str(self.key),
        "text": str(self.text_representation),
        "location": str(self.location),
    }

from_vector_db_entry classmethod #

from_vector_db_entry(db_entry: VectorStoreEntry) -> Element

Create an element from a vector database entry.

PARAMETER DESCRIPTION
db_entry

The vector database entry.

TYPE: VectorStoreEntry

RETURNS DESCRIPTION
Element

The element.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
@classmethod
def from_vector_db_entry(cls, db_entry: VectorStoreEntry) -> "Element":
    """
    Create an element from a vector database entry.

    Args:
        db_entry: The vector database entry.

    Returns:
        The element.

    Raises:
        KeyError: If the entry's element_type is not registered.
    """
    # Work on a shallow copy so the caller's entry metadata is left intact
    # (previously "embedding_type" was deleted from the entry in place).
    metadata = dict(db_entry.metadata)
    element_type = metadata["element_type"]
    element_cls = Element._elements_registry[element_type]
    # embedding_type is vector-store bookkeeping, not an Element field.
    metadata.pop("embedding_type", None)
    return element_cls(**metadata)

to_vector_db_entry #

to_vector_db_entry() -> VectorStoreEntry

Create a vector database entry from the element.

RETURNS DESCRIPTION
VectorStoreEntry

The vector database entry

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/element.py
def to_vector_db_entry(self) -> VectorStoreEntry:
    """
    Create a vector database entry from the element.

    Returns:
        The vector database entry
    """
    # The entry id is derived deterministically from the element id.
    entry_id = uuid.uuid5(uuid.NAMESPACE_OID, self.id)

    # id and key are computed properties, not stored fields — exclude them.
    metadata = self.model_dump(exclude={"id", "key"})
    metadata["document_meta"]["source"]["id"] = self.document_meta.source.id

    return VectorStoreEntry(
        id=entry_id,
        text=self.key,
        image_bytes=self.image_representation,
        metadata=metadata,
    )

ragbits.document_search.documents.sources.Source #

Bases: BaseModel, ABC

An object representing a source.

protocol class-attribute #

protocol: str | None = None

id abstractmethod property #

id: str

Get the source ID.

RETURNS DESCRIPTION
str

The source ID.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

fetch abstractmethod async #

fetch() -> Path

Load the source.

RETURNS DESCRIPTION
Path

The path to the source.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@abstractmethod
async def fetch(self) -> Path:
    """
    Load the source.

    Implementations download the source content (if remote) and return a
    local filesystem path to it.

    Returns:
        The path to the source.
    """

from_uri abstractmethod async classmethod #

from_uri(path: str) -> Sequence[Source]

Create Source instances from a URI path.

The path can contain glob patterns (asterisks) to match multiple sources, but pattern support varies by source type. Each source implementation defines which patterns it supports:

  • LocalFileSource: Supports full glob patterns ('*', '**', etc.) via Path.glob
  • GCSSource: Supports simple prefix matching with '*' at the end of path
  • HuggingFaceSource: Does not support glob patterns
PARAMETER DESCRIPTION
path

The path part of the URI (after protocol://). Pattern support depends on source type.

TYPE: str

RETURNS DESCRIPTION
Sequence[Source]

A sequence of Source objects matching the path pattern

RAISES DESCRIPTION
ValueError

If the path contains unsupported pattern for this source type

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
@abstractmethod
async def from_uri(cls, path: str) -> Sequence["Source"]:
    """Create Source instances from a URI path.

    The path can contain glob patterns (asterisks) to match multiple sources, but pattern support
    varies by source type. Each source implementation defines which patterns it supports:

    - LocalFileSource: Supports full glob patterns ('*', '**', etc.) via Path.glob
    - GCSSource: Supports simple prefix matching with '*' at the end of path
    - HuggingFaceSource: Does not support glob patterns

    Note that a single (non-pattern) URI still yields a sequence, typically
    with one element.

    Args:
        path: The path part of the URI (after protocol://). Pattern support depends on source type.

    Returns:
        A sequence of Source objects matching the path pattern

    Raises:
        ValueError: If the path contains unsupported pattern for this source type
    """

ragbits.document_search.documents.sources.AzureBlobStorageSource #

Bases: Source

An object representing an Azure Blob Storage dataset source.

protocol class-attribute #

protocol: str = 'azure'

account_name instance-attribute #

account_name: str

container_name instance-attribute #

container_name: str

blob_name instance-attribute #

blob_name: str

id property #

id: str

Get the source ID, which is the full blob URL.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

fetch async #

fetch() -> Path

Downloads the blob to a temporary local file and returns the file path.

RETURNS DESCRIPTION
Path

Path to the downloaded file.

RAISES DESCRIPTION
SourceNotFoundError

If the blob source is not available.

SourceConnectionError

If the blob service connection is not available.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/azure.py
@requires_dependencies(["azure.storage.blob", "azure.core.exceptions"], "azure")
async def fetch(self) -> Path:
    """
    Downloads the blob to a temporary local file and returns the file path.

    Returns:
        Path to the downloaded file.

    Raises:
        SourceNotFoundError: If the blob source is not available.
        SourceConnectionError: If the blob service connection is not available.
    """
    # Mirror the account/container layout under the local storage directory.
    container_local_dir = get_local_storage_dir() / self.account_name / self.container_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    path = container_local_dir / self.blob_name
    with trace(account_name=self.account_name, container=self.container_name, blob=self.blob_name) as outputs:
        try:
            blob_service = await self._get_blob_service(account_name=self.account_name)
            blob_client = blob_service.get_blob_client(container=self.container_name, blob=self.blob_name)
            # blob_name may contain slashes; ensure intermediate dirs exist.
            Path(path).parent.mkdir(parents=True, exist_ok=True)
            # NOTE(review): download_blob()/readall() appear to be synchronous
            # calls inside a coroutine — confirm whether the async Azure client
            # should be used here.
            stream = blob_client.download_blob()
            content = stream.readall()
            with open(path, "wb") as file:
                file.write(content)

        except ResourceNotFoundError as e:
            raise SourceNotFoundError(f"Blob {self.blob_name} not found in container {self.container_name}") from e
        except Exception as e:
            # Any other failure (auth, network, local I/O) is surfaced as a
            # connection error, with the original cause chained.
            raise SourceConnectionError() from e
        outputs.path = path
    return path

from_uri async classmethod #

from_uri(path: str) -> Sequence[AzureBlobStorageSource]

Parses an Azure Blob Storage URI and returns an instance of AzureBlobStorageSource.

PARAMETER DESCRIPTION
path

The Azure Blob Storage URI.

TYPE: str

RETURNS DESCRIPTION
Sequence[AzureBlobStorageSource]

Sequence["AzureBlobStorageSource"]: A sequence of AzureBlobStorageSource objects parsed from the URI.

RAISES DESCRIPTION
ValueError

If the Azure Blob Storage URI is invalid.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/azure.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["AzureBlobStorageSource"]:
    """
    Parses an Azure Blob Storage URI and returns an instance of AzureBlobStorageSource.

    Expected format: ``https://<account_name>.blob.core.windows.net/<container>/<blob>``.
    The blob part may end with a single ``*`` for prefix matching.

    Args:
        path (str): The Azure Blob Storage URI.

    Returns:
        Sequence["AzureBlobStorageSource"]: Sources parsed from the URI.

    Raises:
        ValueError: If the Azure Blob Storage URI is invalid.
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "AzureBlobStorageSource only supports '*' at the end of path. "
            "Patterns like '**' or '?' are not supported."
        )
    parsed = urlparse(path)
    if not parsed.netloc or not parsed.path:
        raise ValueError("Invalid Azure Blob Storage URI format.")

    if parsed.scheme != "https":
        raise ValueError("Invalid scheme, expected 'https://account_name.blob.core.windows.net'.")

    # Require the full ".blob.core.windows.net" suffix (with the leading dot)
    # so that hosts like "evilblob.core.windows.net" are rejected instead of
    # being treated as an account name.
    host_suffix = ".blob.core.windows.net"
    account_name = parsed.netloc.removesuffix(host_suffix)
    if not parsed.netloc.endswith(host_suffix) or not account_name:
        raise ValueError("Invalid scheme, expected 'https://account_name.blob.core.windows.net'.")

    path_parts = parsed.path.lstrip("/").split("/", 1)
    if len(path_parts) != 2:  # noqa PLR2004
        raise ValueError("URI must include both container and blob name.")

    container_name, blob_name = path_parts
    if "*" in blob_name:
        # Only a single trailing '*' is supported (prefix matching).
        if not blob_name.endswith("*") or "*" in blob_name[:-1]:
            raise ValueError(
                f"AzureBlobStorageSource only supports '*' at the end of path. Invalid pattern: {blob_name}."
            )
        blob_name = blob_name[:-1]
        return await cls.list_sources(container=container_name, blob_name=blob_name, account_name=account_name)

    # Return a single-element list (consistent with other sources)
    return [cls(account_name=account_name, container_name=container_name, blob_name=blob_name)]

list_sources async classmethod #

list_sources(account_name: str, container: str, blob_name: str = '') -> list[AzureBlobStorageSource]

List all sources in the given Azure container, matching the prefix.

PARAMETER DESCRIPTION
account_name

The Azure storage account name.

TYPE: str

container

The Azure container name.

TYPE: str

blob_name

The prefix to match.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
list[AzureBlobStorageSource]

List of source objects.

RAISES DESCRIPTION
ImportError

If the required 'azure-storage-blob' package is not installed

SourceConnectionError

If there's an error connecting to Azure

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/azure.py
@classmethod
@requires_dependencies(["azure.storage.blob"], "azure")
async def list_sources(
    cls, account_name: str, container: str, blob_name: str = ""
) -> list["AzureBlobStorageSource"]:
    """List all sources in the given Azure container, matching the prefix.

    Args:
        account_name (str): The Azure storage account name.
        container: The Azure container name.
        blob_name: The prefix to match.

    Returns:
        List of source objects.

    Raises:
        ImportError: If the required 'azure-storage-blob' package is not installed
        SourceConnectionError: If there's an error connecting to Azure
    """
    with trace(account_name=account_name, container=container, blob_name=blob_name) as outputs:
        blob_service = await cls._get_blob_service(account_name=account_name)
        try:
            container_client = blob_service.get_container_client(container)
            # Server-side prefix filter: only blobs whose names start with blob_name.
            blobs = container_client.list_blobs(name_starts_with=blob_name)
            outputs.results = [
                AzureBlobStorageSource(container_name=container, blob_name=blob.name, account_name=account_name)
                for blob in blobs
            ]
            return outputs.results
        except Exception as e:
            # Any listing failure is surfaced as a connection error, with the
            # original cause chained for debugging.
            raise SourceConnectionError() from e

ragbits.document_search.documents.sources.GCSSource #

Bases: Source

An object representing a GCS file source.

bucket instance-attribute #

bucket: str

object_name instance-attribute #

object_name: str

protocol class-attribute #

protocol: str = 'gcs'

id property #

id: str

Get unique identifier of the object in the source.

RETURNS DESCRIPTION
str

Unique identifier.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

set_storage classmethod #

set_storage(storage: Storage | None) -> None

Set the storage client for all instances.

PARAMETER DESCRIPTION
storage

The gcloud-aio-storage Storage object to use as the storage client. By default, the object will be created automatically.

TYPE: Storage | None

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@classmethod
def set_storage(cls, storage: "StorageClient | None") -> None:
    """Set the storage client for all instances.

    Args:
        storage: The `gcloud-aio-storage` `Storage` object to use as the storage client.
            By default, the object will be created automatically.
    """
    # Stored on the class, so every GCSSource instance shares one client.
    cls._storage = storage

fetch async #

fetch() -> Path

Fetch the file from Google Cloud Storage and store it locally.

The file is downloaded to a local directory specified by local_dir. If the file already exists locally, it will not be downloaded again. If the file doesn't exist locally, it will be fetched from GCS. The local directory is determined by the environment variable LOCAL_STORAGE_DIR. If this environment variable is not set, a temporary directory is used.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
ImportError

If the 'gcp' extra is not installed.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@traceable
@requires_dependencies(["gcloud.aio.storage"], "gcs")
async def fetch(self) -> Path:
    """
    Fetch the file from Google Cloud Storage and store it locally.

    The file is downloaded to a local directory specified by `local_dir`. If the file already exists locally,
    it will not be downloaded again. If the file doesn't exist locally, it will be fetched from GCS.
    The local directory is determined by the environment variable `LOCAL_STORAGE_DIR`. If this environment
    variable is not set, a temporary directory is used.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        ImportError: If the 'gcp' extra is not installed.
    """
    local_dir = get_local_storage_dir()
    bucket_local_dir = local_dir / self.bucket
    bucket_local_dir.mkdir(parents=True, exist_ok=True)
    path = bucket_local_dir / self.object_name
    with trace(bucket=self.bucket, object=self.object_name) as outputs:
        # Cache hit: skip the download when a previous fetch already stored the file.
        if not path.is_file():
            storage = await self._get_storage()
            async with storage as client:
                content = await client.download(self.bucket, self.object_name)
                # object_name may contain slashes; create intermediate dirs first.
                Path(bucket_local_dir / self.object_name).parent.mkdir(parents=True, exist_ok=True)
                with open(path, mode="wb+") as file_object:
                    file_object.write(content)
        outputs.path = path
    return path

list_sources async classmethod #

list_sources(bucket: str, prefix: str = '') -> list[GCSSource]

List all sources in the given GCS bucket, matching the prefix.

PARAMETER DESCRIPTION
bucket

The GCS bucket.

TYPE: str

prefix

The prefix to match.

TYPE: str DEFAULT: ''

RETURNS DESCRIPTION
list[GCSSource]

List of source objects.

RAISES DESCRIPTION
ImportError

If the required 'gcloud-aio-storage' package is not installed

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@classmethod
@requires_dependencies(["gcloud.aio.storage"], "gcs")
async def list_sources(cls, bucket: str, prefix: str = "") -> list["GCSSource"]:
    """List all sources in the given GCS bucket, matching the prefix.

    Args:
        bucket: The GCS bucket.
        prefix: The prefix to match.

    Returns:
        List of source objects.

    Raises:
        ImportError: If the required 'gcloud-aio-storage' package is not installed
    """
    with trace() as outputs:
        async with await cls._get_storage() as storage:
            result = await storage.list_objects(bucket, params={"prefix": prefix})
            items = result.get("items", [])
            # Names ending with "/" are folder placeholder objects, not files — skip them.
            outputs.results = [
                cls(bucket=bucket, object_name=item["name"]) for item in items if not item["name"].endswith("/")
            ]
            return outputs.results

from_uri async classmethod #

from_uri(path: str) -> Sequence[GCSSource]

Create GCSSource instances from a URI path.

Supports simple prefix matching with '*' at the end of path. For example: - "bucket/folder/*" - matches all files in the folder - "bucket/folder/prefix*" - matches all files starting with prefix

More complex patterns like '**' or '?' are not supported.

PARAMETER DESCRIPTION
path

The path part of the URI (after gcs://). Can end with '*' for pattern matching.

TYPE: str

RETURNS DESCRIPTION
Sequence[GCSSource]

A sequence of GCSSource objects matching the pattern

RAISES DESCRIPTION
ValueError

If an unsupported pattern is used

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/gcs.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["GCSSource"]:
    """Create GCSSource instances from a URI path.

    Supports simple prefix matching with '*' at the end of path.
    For example:
    - "bucket/folder/*" - matches all files in the folder
    - "bucket/folder/prefix*" - matches all files starting with prefix

    More complex patterns like '**' or '?' are not supported.

    Args:
        path: The path part of the URI (after gcs://). Can end with '*' for pattern matching.

    Returns:
        A sequence of GCSSource objects matching the pattern

    Raises:
        ValueError: If an unsupported pattern is used
    """
    if "**" in path or "?" in path:
        raise ValueError(
            "GCSSource only supports '*' at the end of path. Patterns like '**' or '?' are not supported."
        )

    # Split into bucket and prefix
    bucket, prefix = path.split("/", 1) if "/" in path else (path, "")

    if "*" in prefix:
        # Only a single trailing '*' is supported; reject interior wildcards
        # (e.g. "a*b*") instead of silently listing with a literal '*' in the
        # prefix. This matches AzureBlobStorageSource's validation.
        if not prefix.endswith("*") or "*" in prefix[:-1]:
            raise ValueError(f"GCSSource only supports '*' at the end of path. Invalid pattern: {prefix}")
        # Remove the trailing * for GCS prefix listing
        prefix = prefix[:-1]
        return await cls.list_sources(bucket=bucket, prefix=prefix)

    return [cls(bucket=bucket, object_name=prefix)]

ragbits.document_search.documents.sources.GitSource #

Bases: Source

An object representing a file in a Git repository.

repo_url instance-attribute #

repo_url: str

file_path instance-attribute #

file_path: str

branch class-attribute instance-attribute #

branch: str | None = None

protocol class-attribute #

protocol: str = 'git'

id property #

id: str

Get the source ID, which is a unique identifier of the object.

RETURNS DESCRIPTION
str

The source ID.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Get an identifier for the source type.

    Returns:
        The class name converted to snake_case
        (e.g. ``LocalFileSource`` -> ``local_file_source``).
    """
    return to_snake(cls.__name__)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic field based on the class identifier.

    As a computed field it is included in the serialized model, so stored
    entries record which source class produced them.
    """
    return self.class_identifier()

fetch async #

fetch() -> Path

Clone the Git repository and return the path to the specific file.

RETURNS DESCRIPTION
Path

The local path to the specific file in the cloned repository.

RAISES DESCRIPTION
SourceNotFoundError

If the repository cannot be cloned or the file doesn't exist.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/git.py
@requires_dependencies(["git"])
@traceable
async def fetch(self) -> Path:
    """
    Clone the Git repository and return the path to the specific file.

    Returns:
        The local path to the specific file in the cloned repository.

    Raises:
        SourceNotFoundError: If the repository cannot be cloned or the file doesn't exist.
    """
    # _ensure_repo clones the repository into repo_dir if it is not already
    # there (presumably updating an existing clone — confirm in its impl).
    repo_dir = self._get_repo_dir(self.repo_url, self.branch)
    self._ensure_repo(self.repo_url, repo_dir, self.branch)

    # Check if the file exists in the repository
    file_path = repo_dir / self.file_path
    if not file_path.exists() or not file_path.is_file():
        raise SourceNotFoundError(f"File {self.file_path} not found in repository")

    return file_path

list_sources async classmethod #

list_sources(repo_url: str, file_pattern: str = '**/*', branch: str | None = None) -> list[GitSource]

List all files in the repository matching the pattern.

PARAMETER DESCRIPTION
repo_url

URL of the git repository.

TYPE: str

file_pattern

The glob pattern to match files.

TYPE: str DEFAULT: '**/*'

branch

Optional branch name.

TYPE: str | None DEFAULT: None

RETURNS DESCRIPTION
list[GitSource]

List of GitSource objects, one for each file matching the pattern.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/git.py
@classmethod
@traceable
async def list_sources(
    cls, repo_url: str, file_pattern: str = "**/*", branch: str | None = None
) -> list["GitSource"]:
    """
    List all files in the repository matching the pattern.

    Args:
        repo_url: URL of the git repository.
        file_pattern: The glob pattern to match files.
        branch: Optional branch name.

    Returns:
        List of GitSource objects, one for each file matching the pattern.
    """
    repo_dir = cls._get_repo_dir(repo_url, branch)
    cls._ensure_repo(repo_url, repo_dir, branch)

    # One source per regular file; paths are stored relative to the repo root.
    return [
        cls(repo_url=repo_url, file_path=str(candidate.relative_to(repo_dir)), branch=branch)
        for candidate in repo_dir.glob(file_pattern)
        if candidate.is_file()
    ]

from_uri async classmethod #

from_uri(uri: str) -> Sequence[GitSource]

Create GitSource instances from a URI path.

Supported URI formats: - git://https://github.com/username/repo.git:path/to/file.txt - git://https://github.com/username/repo.git:branch:path/to/file.txt - git@github.com:username/repo.git:path/to/file.txt - git@github.com:username/repo.git:branch:path/to/file.txt

PARAMETER DESCRIPTION
uri

The URI path in the format described above.

TYPE: str

RETURNS DESCRIPTION
Sequence[GitSource]

A sequence containing a GitSource instance.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/git.py
@classmethod
@traceable
async def from_uri(cls, uri: str) -> Sequence["GitSource"]:
    """
    Create GitSource instances from a URI path.

    Supported URI formats:
    - git://https://github.com/username/repo.git:path/to/file.txt
    - git://https://github.com/username/repo.git:branch:path/to/file.txt
    - git@github.com:username/repo.git:path/to/file.txt
    - git@github.com:username/repo.git:branch:path/to/file.txt

    Args:
        uri: The URI path in the format described above.

    Returns:
        A sequence containing a GitSource instance. An empty sequence is
        returned when the URI does not match any recognized format.
    """
    # Check if URI starts with git:// protocol
    if uri.startswith("git://"):
        uri = uri[6:]  # Remove the git:// prefix

    # Parsing works on colon-separated segments; the module-level constants
    # _REPO_AND_FILE_PARTS / _MIN_PARTS_WITH_PROTOCOL (defined elsewhere in
    # this file) encode the expected segment counts.
    parts = uri.split(":")
    sources = []

    if len(parts) == _REPO_AND_FILE_PARTS:
        # Repo URL and file path
        sources.append(cls(repo_url=parts[0], file_path=parts[1]))
    elif len(parts) >= _MIN_PARTS_WITH_PROTOCOL:
        # Handle SSH format (git@github.com:username/repo.git)
        if parts[0].startswith("git@"):
            repo_url = f"{parts[0]}:{parts[1]}"  # Reconstruct full SSH URL
            # With exactly the minimum segment count there is no branch segment.
            file_path = parts[2] if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[3]
            branch = None if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[2]
            sources.append(cls(repo_url=repo_url, file_path=file_path, branch=branch))
        # Handle HTTPS format
        elif parts[0] in ["http", "https"]:
            # The scheme was split off by ':'; re-join it with the host part.
            repo_url = f"{parts[0]}:{parts[1]}"
            file_path = parts[2] if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[3]
            branch = None if len(parts) == _MIN_PARTS_WITH_PROTOCOL else parts[2]
            sources.append(cls(repo_url=repo_url, file_path=file_path, branch=branch))
        else:
            # Repo URL, branch, and file path in standard format
            sources.append(cls(repo_url=parts[0], branch=parts[1], file_path=parts[2]))

    # NOTE(review): file paths containing ':' are not handled and will be
    # mis-split by the parsing above — confirm whether that case can occur.
    return sources

ragbits.document_search.documents.sources.HuggingFaceSource #

Bases: Source

An object representing a Hugging Face dataset source.

path instance-attribute #

path: str

split class-attribute instance-attribute #

split: str = 'train'

row instance-attribute #

row: int

protocol class-attribute #

protocol: str = 'huggingface'

id property #

id: str

Get unique identifier of the object in the source.

RETURNS DESCRIPTION
str

Unique identifier.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return the identifier for this source type: the class name in snake_case.
    """
    class_name = cls.__name__
    return to_snake(class_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field exposing the class identifier of this source.
    """
    return type(self).class_identifier()

fetch async #

fetch() -> Path

Fetch the file from Hugging Face and store it locally.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
ImportError

If the 'huggingface' extra is not installed.

SourceConnectionError

If the source connection fails.

SourceNotFoundError

If the source document is not found.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/hf.py
@traceable
@requires_dependencies(["datasets"], "huggingface")
async def fetch(self) -> Path:
    """
    Fetch the file from Hugging Face and store it locally.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        ImportError: If the 'huggingface' extra is not installed.
        SourceConnectionError: If the source connection fails.
        SourceNotFoundError: If the source document is not found.
    """
    with trace(path=self.path, split=self.split, row=self.row) as outputs:
        try:
            # Stream the dataset so only the requested row is pulled instead
            # of downloading the entire split.
            dataset = load_dataset(self.path, split=self.split, streaming=True)  # type: ignore
        except ConnectionError as exc:
            raise SourceConnectionError() from exc
        except DatasetNotFoundError as exc:  # type: ignore
            raise SourceNotFoundError(source_id=self.id) from exc

        try:
            # Advance the stream to `row` and take exactly one record;
            # StopIteration means the index is past the end of the split.
            data = next(iter(dataset.skip(self.row).take(1)))  # type: ignore
        except StopIteration as exc:
            raise SourceNotFoundError(source_id=self.id) from exc

        # Assumes each row carries a "source" (relative file path) and a
        # "content" (text) column -- TODO confirm against the dataset schema.
        storage_dir = get_local_storage_dir()
        source_dir = storage_dir / Path(data["source"]).parent
        source_dir.mkdir(parents=True, exist_ok=True)
        path = storage_dir / data["source"]

        # A previously written file acts as a cache; write only when absent.
        if not path.is_file():
            with open(path, mode="w", encoding="utf-8") as file:
                file.write(data["content"])
        outputs.path = path
        return path

from_uri async classmethod #

from_uri(path: str) -> Sequence[HuggingFaceSource]

Create HuggingFaceSource instances from a URI path.

Pattern matching is not supported. The path must be in the format: huggingface://dataset_path/split/row

PARAMETER DESCRIPTION
path

The path part of the URI (after huggingface://)

TYPE: str

RETURNS DESCRIPTION
Sequence[HuggingFaceSource]

A sequence containing a single HuggingFaceSource

RAISES DESCRIPTION
ValueError

If the path contains patterns or has invalid format

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/hf.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["HuggingFaceSource"]:
    """Create HuggingFaceSource instances from a URI path.

    Pattern matching is not supported. The path must be in the format:
    huggingface://dataset_path/split/row
    where ``dataset_path`` may itself contain slashes, as in the common
    "org/dataset" naming used on the Hugging Face Hub.

    Args:
        path: The path part of the URI (after huggingface://)

    Returns:
        A sequence containing a single HuggingFaceSource

    Raises:
        ValueError: If the path contains patterns or has invalid format
    """
    if "*" in path or "?" in path:
        raise ValueError(
            "HuggingFaceSource does not support patterns. Path must be in format: dataset_path/split/row"
        )

    try:
        # Split from the right so dataset paths like "org/dataset" keep their
        # slash: the last two segments are always the split and the row index.
        # (A plain split("/") rejected every org-qualified dataset name.)
        dataset_path, split, row = path.rsplit("/", 2)
        return [cls(path=dataset_path, split=split, row=int(row))]
    except ValueError as err:
        # Raised both by a too-short path (unpacking) and a non-integer row.
        raise ValueError("Invalid HuggingFace path format. Expected: dataset_path/split/row") from err

list_sources async classmethod #

list_sources(path: str, split: str) -> list[HuggingFaceSource]

List all sources in the given Hugging Face repository.

PARAMETER DESCRIPTION
path

Path or name of the dataset.

TYPE: str

split

Dataset split.

TYPE: str

RETURNS DESCRIPTION
list[HuggingFaceSource]

List of source objects.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/hf.py
@classmethod
@traceable
async def list_sources(cls, path: str, split: str) -> list["HuggingFaceSource"]:
    """
    List all sources in the given Hugging Face repository.

    Args:
        path: Path or name of the dataset.
        split: Dataset split.

    Returns:
        List of source objects.
    """
    dataset = load_dataset(path, split=split)  # type: ignore
    # Drop slice notation (e.g. "train[:100]") so each stored source
    # references the plain split name.
    plain_split = re.sub(r"\[.*?\]", "", split)
    row_count = len(dataset)  # type: ignore
    return [cls(path=path, split=plain_split, row=index) for index in range(row_count)]

ragbits.document_search.documents.sources.LocalFileSource #

Bases: Source

An object representing a local file source.

path instance-attribute #

path: Path

protocol class-attribute #

protocol: str = 'file'

id property #

id: str

Get unique identifier of the object in the source.

RETURNS DESCRIPTION
str

Unique identifier.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Identifier of the source type, derived from the class name in snake_case.
    """
    name = cls.__name__
    return to_snake(name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field that mirrors the class identifier.
    """
    identifier = self.class_identifier()
    return identifier

fetch async #

fetch() -> Path

Fetch the source.

RETURNS DESCRIPTION
Path

The local path to the object fetched from the source.

RAISES DESCRIPTION
SourceNotFoundError

If the source document is not found.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/local.py
@traceable
async def fetch(self) -> Path:
    """
    Fetch the source.

    Returns:
        The local path to the object fetched from the source.

    Raises:
        SourceNotFoundError: If the source document is not found.
    """
    # A local source needs no download -- just verify the file exists.
    if self.path.is_file():
        return self.path
    raise SourceNotFoundError(source_id=self.id)

list_sources classmethod #

list_sources(path: Path, file_pattern: str = '*') -> list[LocalFileSource]

List all sources in the given directory, matching the file pattern.

PARAMETER DESCRIPTION
path

The path to the directory.

TYPE: Path

file_pattern

The file pattern to match.

TYPE: str DEFAULT: '*'

RETURNS DESCRIPTION
list[LocalFileSource]

List of source objects.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/local.py
@classmethod
@traceable
def list_sources(cls, path: Path, file_pattern: str = "*") -> list["LocalFileSource"]:
    """
    List all sources in the given directory, matching the file pattern.

    Args:
        path: The path to the directory.
        file_pattern: The file pattern to match.

    Returns:
        List of source objects.
    """
    matches = path.glob(file_pattern)
    return [cls(path=match) for match in matches]

from_uri async classmethod #

from_uri(path: str) -> Sequence[LocalFileSource]

Create LocalFileSource instances from a URI path.

Supports full glob patterns via Path.glob: - "**/*.txt" - all .txt files in any subdirectory - "*.py" - all Python files in the current directory - "**/*" - all files in any subdirectory - '?' matches exactly one character

PARAMETER DESCRIPTION
path

The path part of the URI (after file://). Pattern support depends on source type.

TYPE: str

RETURNS DESCRIPTION
Sequence[LocalFileSource]

A sequence of LocalFileSource objects

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/local.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["LocalFileSource"]:
    """Create LocalFileSource instances from a URI path.

    Supports full glob patterns via Path.glob:
    - "**/*.txt" - all .txt files in any subdirectory
    - "*.py" - all Python files in the current directory
    - "**/*" - all files in any subdirectory
    - '?' matches exactly one character

    Args:
        path: The path part of the URI (after file://). Pattern support depends on source type.

    Returns:
        A sequence of LocalFileSource objects
    """
    root, glob_pattern = cls._split_path_and_pattern(path=Path(path))
    # A direct hit on a single file needs no globbing.
    if root.is_file():
        return [cls(path=root)]
    if glob_pattern:
        return [cls(path=entry) for entry in root.glob(glob_pattern) if entry.is_file()]
    return []

ragbits.document_search.documents.sources.S3Source #

Bases: Source

An object representing an AWS S3 Storage dataset source.

protocol class-attribute #

protocol: str = 's3'

bucket_name instance-attribute #

bucket_name: str

key instance-attribute #

key: str

id property #

id: str

Get the source ID, which is the full URL to the file in s3.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Snake_case identifier for this source type, taken from the class name.
    """
    source_class = cls.__name__
    return to_snake(source_class)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Computed Pydantic field derived from the class identifier.
    """
    return type(self).class_identifier()

fetch async #

fetch() -> Path

Download a file in the given bucket_name with the given key.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
ClientError

If the file doesn't exist or credentials are incomplete.

NoCredentialsError

If no credentials are available.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/s3.py
@requires_dependencies(["boto3"], "s3")
async def fetch(self) -> Path:
    """
    Download a file in the given bucket_name with the given key.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        FileNotFoundError: If the object does not exist (HTTP 404).
        PermissionError: If access to the object is denied (HTTP 403).
        RuntimeError: If the client cannot be initialized or another S3 client error occurs.
        ValueError: If AWS credentials are missing or invalid.
    """
    # Lazily initialize the shared S3 client on first use.
    if self._s3_client is None:
        self._set_client(self.bucket_name)

    if self._s3_client is None:
        raise RuntimeError("S3 client is not initialized.")

    # Mirror the bucket under the local storage dir; the key's slashes are
    # flattened to underscores so the object is stored as a single file name.
    # NOTE(review): keys "a/b" and "a_b" map to the same local file --
    # confirm collisions are acceptable here.
    local_dir = get_local_storage_dir()
    container_local_dir = local_dir / self.bucket_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    normalized_key = self.key.replace("/", "_")
    path = container_local_dir / normalized_key
    with trace(bucket=self.bucket_name, key=self.key) as outputs:
        try:
            self._s3_client.download_file(self.bucket_name, self.key, path)
        except ClientError as e:
            # Translate the generic ClientError into a specific exception
            # based on the HTTP status code returned by S3.
            if e.response["Error"]["Code"] == "404":
                raise FileNotFoundError(f"The object does not exist: {self.key}") from e
            elif e.response["Error"]["Code"] == "403":
                raise PermissionError(f"Access denied. No permission to download: {self.key}") from e
            else:
                raise RuntimeError(f"S3 Client Error: {e}") from e
        except (NoCredentialsError, PartialCredentialsError) as e:
            raise ValueError("AWS credentials are missing or invalid.") from e
        outputs.path = path
    return path

list_sources async classmethod #

list_sources(bucket_name: str, prefix: str) -> Sequence[S3Source]

List all files under the given bucket name and with the given prefix.

PARAMETER DESCRIPTION
bucket_name

The name of the S3 bucket to use.

TYPE: str

prefix

The path to the files and prefix to look for.

TYPE: str

RETURNS DESCRIPTION
Sequence

The Sequence of AWS S3 sources.

TYPE: Sequence[S3Source]

RAISES DESCRIPTION
ClientError

If the source doesn't exist.

NoCredentialsError

If no credentials are available.

PartialCredentialsError

If credentials are incomplete.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/s3.py
@classmethod
@requires_dependencies(["boto3"], "s3")
async def list_sources(cls, bucket_name: str, prefix: str) -> Sequence["S3Source"]:
    """
    List all files under the given bucket name and with the given prefix.

    Args:
        bucket_name: The name of the S3 bucket to use.
        prefix: The path to the files and prefix to look for.

    Returns:
        Sequence: The Sequence of AWS S3 sources.

    Raises:
        ClientError: If the source doesn't exist.
        NoCredentialsError: If no credentials are available.
        PartialCredentialsError: If credentials are incomplete.
    """
    cls._set_client(bucket_name)
    if cls._s3_client is None:
        raise RuntimeError("S3 client is not initialized.")
    with trace(bucket=bucket_name, key=prefix) as outputs:
        try:
            # Paginate so buckets with more than one page of objects are
            # listed completely.
            paginator = cls._s3_client.get_paginator("list_objects_v2")
            sources = [
                cls(bucket_name=bucket_name, key=entry["Key"])
                for page in paginator.paginate(Bucket=bucket_name, Prefix=prefix)
                for entry in page.get("Contents", [])
            ]
            outputs.sources = sources
            return sources
        except (NoCredentialsError, PartialCredentialsError) as e:
            raise ValueError("AWS credentials are missing or incomplete. Please configure them.") from e
        except ClientError as e:
            raise RuntimeError(f"Failed to list files in bucket {bucket_name}: {e}") from e

from_uri async classmethod #

from_uri(path: str) -> Sequence[S3Source]

Create S3Source instances from a URI path. The supported path formats are: s3://&lt;bucket-name&gt;/&lt;key&gt;, https://&lt;bucket-name&gt;.s3.&lt;region&gt;.amazonaws.com/&lt;key&gt;, https://s3.&lt;region&gt;.amazonaws.com/&lt;bucket-name&gt;/&lt;key&gt;. Pattern matching is supported only with '*'.

PARAMETER DESCRIPTION
path

The URI path.

TYPE: str

RETURNS DESCRIPTION
Sequence[S3Source]

A sequence containing a S3Source instances.

RAISES DESCRIPTION
ValueError

If the path has invalid format

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/s3.py
@classmethod
@traceable
async def from_uri(cls, path: str) -> Sequence["S3Source"]:
    """
    Create S3Source instances from a URI path.
    The supported path formats are:
    s3://<bucket-name>/<key>
    https://<bucket-name>.s3.<region>.amazonaws.com/<key>
    https://s3.<region>.amazonaws.com/<bucket-name>/<key>
    Pattern matching is supported only with '*'.

    Args:
        path: The URI path.

    Returns:
        A sequence containing a S3Source instances.

    Raises:
        ValueError: If the path has invalid format

    """
    # Only a single trailing '*' can be mapped onto the S3 API (prefix
    # listing); richer glob syntax is rejected up front.
    if "**" in path or "?" in path:
        raise ValueError(
            "S3Source only supports '*' at the end of path. Patterns like '**' or '?' are not supported."
        )

    parsed = urlparse(path)
    if not parsed.netloc or not parsed.path:
        raise ValueError("Invalid AWS Source URI format.")
    if parsed.scheme not in {"s3", "https"}:
        raise ValueError("Invalid AWS Source URI format.")

    if parsed.scheme == "s3":
        # s3://<bucket>/<key>: the netloc is the bucket name.
        bucket_name = parsed.netloc
        path_to_file = parsed.path.lstrip("/")
    elif parsed.scheme == "https":
        if not parsed.netloc.endswith("amazonaws.com"):
            raise ValueError("Invalid AWS Source URI format.")
        elif parsed.netloc.startswith("s3"):
            # Path-style URL: https://s3.<region>.amazonaws.com/<bucket>/<key>
            parts = parsed.path.split("/")
            bucket_name = parts[1]
            path_to_file = "/".join(parts[2:])
        else:
            # Virtual-hosted-style URL: the bucket is the first host label.
            # NOTE(review): a bucket whose own name starts with "s3" would be
            # misparsed as path-style by the branch above -- confirm.
            bucket_name = parsed.netloc.split(".")[0]
            path_to_file = parsed.path.lstrip("/")

    else:
        # Unreachable: scheme was already restricted to {"s3", "https"} above.
        raise ValueError("Invalid AWS Source URI format.")

    if "*" in path_to_file:
        # The wildcard must be a single '*' at the very end; the remainder is
        # treated as a key prefix and delegated to list_sources.
        if not path_to_file.endswith("*") or "*" in path_to_file[:-1]:
            raise ValueError(f"AWS Source only supports '*' at the end of path. Invalid pattern: {[path_to_file]}.")
        path_to_file = path_to_file[:-1]
        return await cls.list_sources(bucket_name=bucket_name, prefix=path_to_file)

    return [cls(bucket_name=bucket_name, key=path_to_file)]

ragbits.document_search.documents.sources.WebSource #

Bases: Source

An object representing a Web dataset source.

url instance-attribute #

url: str

headers class-attribute instance-attribute #

headers: dict[str, str] | None = None

protocol class-attribute #

protocol: str = 'https'

id property #

id: str

Get the source ID, which is a unique identifier of the object.

class_identifier classmethod #

class_identifier() -> str

Get an identifier for the source type.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@classmethod
def class_identifier(cls) -> str:
    """
    Return this source type's identifier: the snake_case form of the class name.
    """
    raw_name = cls.__name__
    return to_snake(raw_name)

source_type #

source_type() -> str

Pydantic field based on the class identifier.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/base.py
@computed_field
def source_type(self) -> str:
    """
    Pydantic computed field whose value is the class identifier.
    """
    source_identifier = self.class_identifier()
    return source_identifier

fetch async #

fetch() -> Path

Download a file available in the given url.

RETURNS DESCRIPTION
Path

The local path to the downloaded file.

TYPE: Path

RAISES DESCRIPTION
WebDownloadError

If the download failed.

SourceNotFoundError

If the URL is invalid.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/web.py
@requires_dependencies(["aiohttp"])
async def fetch(self) -> Path:
    """
    Download a file available in the given url.

    Returns:
        Path: The local path to the downloaded file.

    Raises:
        WebDownloadError: If the download failed.
        SourceNotFoundError: If the URL is invalid.
    """
    # Derive a flat local file name: everything before the last "/" has its
    # non-word characters replaced with "_", then the original file name is
    # appended, e.g. "https://a.com/x/y/f.txt" -> "_a_com_x_yf.txt".
    parsed_url = urlparse(self.url)
    url_path, file_name = ("/" + parsed_url.netloc + parsed_url.path).rsplit("/", 1)
    normalized_url_path = re.sub(r"\W", "_", url_path) + file_name
    domain_name = parsed_url.netloc

    # Downloads are grouped per domain under the local storage directory.
    local_dir = get_local_storage_dir()
    container_local_dir = local_dir / domain_name
    container_local_dir.mkdir(parents=True, exist_ok=True)
    path = container_local_dir / normalized_url_path

    try:
        async with aiohttp.ClientSession() as session, session.get(self.url, headers=self.headers) as response:
            if response.ok:
                # Stream the body to disk in 1 KiB chunks to bound memory use.
                with open(path, "wb") as f:
                    async for chunk in response.content.iter_chunked(1024):
                        f.write(chunk)
            else:
                raise WebDownloadError(url=self.url, code=response.status)
    except (aiohttp.ClientError, IsADirectoryError) as e:
        # IsADirectoryError occurs when the URL ends with "/" and the derived
        # file name is empty, making `path` point at the directory itself.
        raise SourceNotFoundError(self.id) from e

    return path

list_sources async classmethod #

list_sources(url: str) -> Sequence[WebSource]

List the file under the given URL.

PARAMETER DESCRIPTION
url

The URL to the file.

TYPE: str

RETURNS DESCRIPTION
Sequence

The Sequence with Web source.

TYPE: Sequence[WebSource]

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/web.py
@classmethod
async def list_sources(cls, url: str) -> Sequence["WebSource"]:
    """
    List the file under the given URL.

    Args:
        url: The URL to the file.

    Returns:
        Sequence: The Sequence with Web source.
    """
    # A URL points at exactly one file, so the listing is a single source.
    single_source = cls(url=url)
    return [single_source]

from_uri async classmethod #

from_uri(uri: str) -> Sequence[WebSource]

Create WebSource instances from a URI path. The supported uri format is: &lt;protocol&gt;://&lt;domain&gt;/&lt;path&gt;/&lt;filename&gt;.&lt;file_extension&gt;

PARAMETER DESCRIPTION
uri

The URI path. Needs to include the protocol.

TYPE: str

RETURNS DESCRIPTION
Sequence[WebSource]

A sequence containing a WebSource instance.

Source code in packages/ragbits-document-search/src/ragbits/document_search/documents/sources/web.py
@classmethod
async def from_uri(cls, uri: str) -> Sequence["WebSource"]:
    """
    Create WebSource instances from a URI path.
    The supported uri format is:
    <protocol>://<domain>/<path>/<filename>.<file_extension>

    Args:
        uri: The URI path. Needs to include the protocol.

    Returns:
        A sequence containing a WebSource instance.
    """
    # The URI is used verbatim as the source URL; no pattern expansion.
    source = cls(url=uri)
    return [source]