Skip to content

Reference for hub_sdk/modules/datasets.py

Note

This file is available at https://github.com/ultralytics/hub-sdk/blob/main/hub_sdk/modules/datasets.py. If you spot a problem please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!


hub_sdk.modules.datasets.Datasets

Datasets(
    dataset_id: Optional[str] = None, headers: Optional[Dict[str, Any]] = None
)

Bases: CRUDClient

A class representing a client for interacting with Datasets through CRUD operations. This class extends the CRUDClient class and provides specific methods for working with Datasets.

Attributes:

Name Type Description
hub_client DatasetUpload

An instance of DatasetUpload used for interacting with dataset uploads.

id (str, None)

The unique identifier of the dataset, if available.

data dict

A dictionary to store dataset data.

Note

The 'id' attribute is set during initialization and can be used to uniquely identify a dataset. The 'data' attribute is used to store dataset data fetched from the API.

Parameters:

Name Type Description Default
dataset_id str

Unique id of the dataset.

None
headers dict

Headers to include in HTTP requests.

None
Source code in hub_sdk/modules/datasets.py
def __init__(self, dataset_id: Optional[str] = None, headers: Optional[Dict[str, Any]] = None):
    """
    Set up a Datasets client and, when an id is supplied, load that dataset's data.

    Args:
        dataset_id (str, optional): Unique id of the dataset. When falsy, no data is fetched.
        headers (dict, optional): Headers to include in HTTP requests.
    """
    super().__init__("datasets", "dataset", headers)
    self.hub_client = DatasetUpload(headers)  # separate client that upload_dataset() delegates to
    self.id, self.data = dataset_id, {}
    if not dataset_id:
        # No dataset to look up yet; self.data stays empty until an id is set and get_data() runs.
        return
    self.get_data()

create_dataset

create_dataset(dataset_data: dict) -> None

Creates a new dataset with the provided data and sets the dataset ID for the current instance.

Parameters:

Name Type Description Default
dataset_data dict

A dictionary containing the data for creating the dataset.

required

Returns:

Type Description
None

The method does not return a value.

Source code in hub_sdk/modules/datasets.py
def create_dataset(self, dataset_data: dict) -> None:
    """
    Create a new dataset from the provided data and bind this instance to it.

    On success the new dataset's ID is stored on the instance and its data is then fetched with get_data().
    If the create request yields no response, an error is logged and the instance is left unchanged.

    Args:
        dataset_data (dict): A dictionary containing the data for creating the dataset.

    Returns:
        (None): The method does not return a value.
    """
    response = super().create(dataset_data)
    if response is None:
        # Same policy as get_data(): a missing response is logged instead of surfacing as an
        # AttributeError from calling .json() on None.
        self.logger.error("Received no response from the server while creating the dataset.")
        return

    # `or {}` covers both an empty/null JSON body and a payload whose "data" entry is null.
    resp = response.json() or {}
    self.id = (resp.get("data") or {}).get("id")
    self.get_data()

delete

delete(hard: bool = False) -> Optional[Response]

Delete the dataset resource represented by this instance.

Parameters:

Name Type Description Default
hard bool

If True, perform a hard delete.

False
Note

The 'hard' parameter determines whether to perform a soft delete (default) or a hard delete. In a soft delete, the dataset might be marked as deleted but retained in the system. In a hard delete, the dataset is permanently removed from the system.

Returns:

Type Description
Optional[Response]

Response object from the delete request, or None if delete fails.

Source code in hub_sdk/modules/datasets.py
def delete(self, hard: bool = False) -> Optional[Response]:
    """
    Remove the dataset this instance refers to.

    Args:
        hard (bool, optional): Pass True to request a hard delete; the default requests a soft delete.

    Note:
        With a soft delete the dataset might only be marked as deleted while being retained in the system.
        With a hard delete the dataset is permanently removed from the system.

    Returns:
        (Optional[Response]): Response object from the delete request, or None if delete fails.
    """
    outcome = super().delete(self.id, hard)
    return outcome

get_data

get_data() -> None

Retrieves data for the current dataset instance.

If a valid dataset ID has been set, it sends a request to fetch the dataset data and stores it in the instance. If no dataset ID has been set, it logs an error message.

Returns:

Type Description
None

The method does not return a value.

Source code in hub_sdk/modules/datasets.py
def get_data(self) -> None:
    """
    Fetch the data of the current dataset and cache it on the instance.

    Without a dataset ID an error is logged and no request is made. Otherwise the dataset is read and the
    "data" entry of the JSON reply is stored in self.data. A missing reply, a reply without a json attribute,
    a null JSON body, or any exception raised along the way is reported through the error log instead.

    Returns:
        (None): The method does not return a value.
    """
    if not self.id:
        self.logger.error("No dataset id has been set. Update the dataset id or create a dataset.")
        return

    try:
        reply = super().read(self.id)

        # Classify the reply first; `failure` stays None only when a usable JSON payload was obtained.
        if reply is None:
            failure = f"Received no response from the server for dataset ID: {self.id}"
        elif not hasattr(reply, "json"):
            failure = f"Invalid response object received for dataset ID: {self.id}"
        else:
            payload = reply.json()
            failure = f"No data received in the response for dataset ID: {self.id}" if payload is None else None

        if failure is not None:
            self.logger.error(failure)
            return

        self.data = payload.get("data", {})
        self.logger.debug(f"Dataset data retrieved for ID: {self.id}")
    except Exception as err:
        self.logger.error(f"An error occurred while retrieving data for dataset ID: {self.id}, {err}")
get_download_link

get_download_link() -> Optional[str]

Get dataset download link.

Returns:

Type Description
Optional[str]

Return download link or None if the link is not available.

Source code in hub_sdk/modules/datasets.py
def get_download_link(self) -> Optional[str]:
    """
    Look up the download URL held in this dataset's data.

    Returns:
        (Optional[str]): The value stored under "url" in the dataset data, or None when that key is absent.
    """
    download_url = self.data.get("url")
    return download_url

update

update(data: dict) -> Optional[Response]

Update the dataset resource represented by this instance.

Parameters:

Name Type Description Default
data dict

The updated data for the dataset resource.

required

Returns:

Type Description
Optional[Response]

Response object from the update request, or None if update fails.

Source code in hub_sdk/modules/datasets.py
def update(self, data: dict) -> Optional[Response]:
    """
    Apply new values to the dataset this instance refers to.

    Args:
        data (dict): The updated data for the dataset resource.

    Returns:
        (Optional[Response]): Response object from the update request, or None if update fails.
    """
    outcome = super().update(self.id, data)
    return outcome

upload_dataset

upload_dataset(file: str = None) -> Optional[Response]

Uploads a dataset file to the hub.

Parameters:

Name Type Description Default
file str

The path to the dataset file to upload.

None

Returns:

Type Description
Optional[Response]

Response object from the upload request, or None if upload fails.

Source code in hub_sdk/modules/datasets.py
def upload_dataset(self, file: Optional[str] = None) -> Optional[Response]:
    """
    Upload a dataset file to the hub for the dataset this instance refers to.

    The call is delegated to the instance's hub_client, together with the current dataset id.

    Args:
        file (str, optional): The path to the dataset file to upload.

    Returns:
        (Optional[Response]): Response object from the upload request, or None if upload fails.
    """
    return self.hub_client.upload_dataset(self.id, file)





hub_sdk.modules.datasets.DatasetList

DatasetList(page_size=None, public=None, headers=None)

Bases: PaginatedList

A class for managing a paginated list of datasets from the Ultralytics Hub API.

Parameters:

Name Type Description Default
page_size int

The number of items to request per page.

None
public bool

Whether the items should be publicly accessible.

None
headers dict

Headers to be included in API requests.

None
Source code in hub_sdk/modules/datasets.py
def __init__(self, page_size=None, public=None, headers=None):
    """
    Initialize a DatasetList instance.

    Args:
        page_size (int, optional): The number of items to request per page.
        public (bool, optional): Whether the items should be publicly accessible.
        headers (dict, optional): Headers to be included in API requests.
    """
    # "datasets" is the base endpoint; the remaining arguments are forwarded unchanged.
    super().__init__("datasets", "dataset", page_size, public, headers)