Reference for `hub_sdk/modules/datasets.py`

Note

This file is available at https://github.com/ultralytics/hub-sdk/blob/main/hub_sdk/modules/datasets.py. If you spot a problem please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!

hub_sdk.modules.datasets.Datasets

Datasets(dataset_id: Optional[str] = None, headers: Optional[Dict[str, Any]] = None)

Bases: CRUDClient

A class representing a client for interacting with Datasets through CRUD operations. This class extends the CRUDClient class and provides specific methods for working with Datasets.

Attributes:

Name	Type	Description
`hub_client`	`DatasetUpload`	An instance of DatasetUpload used for interacting with dataset uploads.
`id`	`(str, None)`	The unique identifier of the dataset, if available.
`data`	`dict`	A dictionary to store dataset data.

Note

The 'id' attribute is set during initialization and can be used to uniquely identify a dataset. The 'data' attribute is used to store dataset data fetched from the API.

Parameters:

Name	Type	Description	Default
`dataset_id`	`Optional[str]`	Unique id of the dataset. If provided, the dataset's data is fetched during initialization.	`None`
`headers`	`dict`	Headers to include in HTTP requests.	`None`

Source code in hub_sdk/modules/datasets.py

def __init__(self, dataset_id: Optional[str] = None, headers: Optional[Dict[str, Any]] = None):
    """
    Set up a Datasets client and, when an id is supplied, load that dataset's data.

    Args:
        dataset_id (str, optional): Unique id of the dataset. When omitted (or falsy), no data is fetched.
        headers (dict, optional): Headers to include in HTTP requests.
    """
    super().__init__("datasets", "dataset", headers)
    self.hub_client = DatasetUpload(headers)
    self.id, self.data = dataset_id, {}
    if not dataset_id:
        # Nothing to fetch until an id is assigned (e.g. via create_dataset).
        return
    self.get_data()

create_dataset

create_dataset(dataset_data: dict) -> None

Creates a new dataset with the provided data and sets the dataset ID for the current instance.

Parameters:

Name	Type	Description	Default
`dataset_data`	`dict`	A dictionary containing the data for creating the dataset.	required

Returns:

Type	Description
`None`	The method does not return a value.

Source code in hub_sdk/modules/datasets.py

def create_dataset(self, dataset_data: dict) -> None:
    """
    Creates a new dataset with the provided data and sets the dataset ID for the current instance.

    The create request is delegated to the parent class. When it yields a response, the new
    dataset's id is read from the ``data.id`` field of the JSON body, stored on the instance,
    and the dataset data is then loaded via ``get_data()``.

    Args:
        dataset_data (dict): A dictionary containing the data for creating the dataset.

    Returns:
        (None): The method does not return a value.
    """
    response = super().create(dataset_data)

    # NOTE(review): the parent create() is defined outside this file; the sibling delete/update
    # wrappers document a None result on failure, so presumably create() can return None too.
    # Guard against it instead of failing with AttributeError on `.json()`.
    if response is None:
        self.logger.error("Dataset creation failed: no response received from the server.")
        return

    resp = response.json()
    self.id = resp.get("data", {}).get("id")
    self.get_data()

delete

delete(hard: bool = False) -> Optional[Response]

Delete the dataset resource represented by this instance.

Parameters:

Name	Type	Description	Default
`hard`	`bool`	If True, perform a hard delete.	`False`

Note

The 'hard' parameter determines whether to perform a soft delete (default) or a hard delete. In a soft delete, the dataset might be marked as deleted but retained in the system. In a hard delete, the dataset is permanently removed from the system.

Returns:

Type	Description
`Optional[Response]`	Response object from the delete request, or None if delete fails.

Source code in hub_sdk/modules/datasets.py

def delete(self, hard: bool = False) -> Optional[Response]:
    """
    Remove the dataset that this instance refers to.

    Args:
        hard (bool, optional): When True, request a hard delete; otherwise a soft delete (default).

    Note:
        A soft delete might only mark the dataset as deleted while retaining it in the system.
        A hard delete permanently removes the dataset from the system.

    Returns:
        (Optional[Response]): Response object from the delete request, or None if delete fails.
    """
    # Forward this instance's id and the requested mode to the parent implementation.
    response = super().delete(self.id, hard)
    return response

get_data

get_data() -> None

Retrieves data for the current dataset instance.

If a valid dataset ID has been set, it sends a request to fetch the dataset data and stores it in the instance. If no dataset ID has been set, it logs an error message.

Returns:

Type	Description
`None`	The method does not return a value.

Source code in hub_sdk/modules/datasets.py

def get_data(self) -> None:
    """
    Fetch the data of the current dataset and cache it on the instance.

    Without a dataset ID an error is logged and nothing is requested. Otherwise the dataset is
    read through the parent class, and the "data" field of the JSON body is stored in
    ``self.data``. A missing response, a response without a ``json`` attribute, an empty JSON
    body, or any raised exception is logged as an error and leaves ``self.data`` untouched.

    Returns:
        (None): The method does not return a value.
    """
    if not self.id:
        self.logger.error("No dataset id has been set. Update the dataset id or create a dataset.")
        return

    try:
        response = super().read(self.id)

        if response is None:
            self.logger.error(f"Received no response from the server for dataset ID: {self.id}")
        elif not hasattr(response, "json"):
            # A proper response object is expected to expose .json().
            self.logger.error(f"Invalid response object received for dataset ID: {self.id}")
        else:
            payload = response.json()
            if payload is None:
                self.logger.error(f"No data received in the response for dataset ID: {self.id}")
            else:
                self.data = payload.get("data", {})
                self.logger.debug(f"Dataset data retrieved for ID: {self.id}")
    except Exception as e:
        # Best-effort fetch: report the failure instead of propagating it to the caller.
        self.logger.error(f"An error occurred while retrieving data for dataset ID: {self.id}, {e}")

get_download_link

get_download_link() -> Optional[str]

Get dataset download link.

Returns:

Type	Description
`Optional[str]`	Return download link or None if the link is not available.

Source code in hub_sdk/modules/datasets.py

def get_download_link(self) -> Optional[str]:
    """
    Look up the dataset download link in the data stored on this instance.

    Returns:
        (Optional[str]): The value stored under "url" in ``self.data``, or None if it is absent.
    """
    download_url = self.data.get("url", None)
    return download_url

update

update(data: dict) -> Optional[Response]

Update the dataset resource represented by this instance.

Parameters:

Name	Type	Description	Default
`data`	`dict`	The updated data for the dataset resource.	required

Returns:

Type	Description
`Optional[Response]`	Response object from the update request, or None if update fails.

Source code in hub_sdk/modules/datasets.py

def update(self, data: dict) -> Optional[Response]:
    """
    Apply new values to the dataset that this instance refers to.

    Args:
        data (dict): The updated data for the dataset resource.

    Returns:
        (Optional[Response]): Response object from the update request, or None if update fails.
    """
    # Forward this instance's id together with the new values to the parent implementation.
    response = super().update(self.id, data)
    return response

upload_dataset

upload_dataset(file: str = None) -> Optional[Response]

Uploads a dataset file to the hub.

Parameters:

Name	Type	Description	Default
`file`	`str`	The path to the dataset file to upload.	`None`

Returns:

Type	Description
`Optional[Response]`	Response object from the upload request, or None if upload fails.

Source code in hub_sdk/modules/datasets.py

def upload_dataset(self, file: Optional[str] = None) -> Optional[Response]:
    """
    Uploads a dataset file to the hub.

    The upload is delegated to ``self.hub_client`` using this instance's dataset id.

    Args:
        file (str, optional): The path to the dataset file to upload. Defaults to None, which is
            passed through to the upload client unchanged.

    Returns:
        (Optional[Response]): Response object from the upload request, or None if upload fails.
    """
    # `file` is annotated Optional[str] because its default is None (no implicit Optional).
    return self.hub_client.upload_dataset(self.id, file)

hub_sdk.modules.datasets.DatasetList

DatasetList(page_size=None, public=None, headers=None)

Bases: PaginatedList

A class for managing a paginated list of datasets from the Ultralytics Hub API.

Parameters:

Name	Type	Description	Default
`page_size`	`int`	The number of items to request per page.	`None`
`public`	`bool`	Whether the items should be publicly accessible.	`None`
`headers`	`dict`	Headers to be included in API requests.	`None`

Source code in hub_sdk/modules/datasets.py

def __init__(self, page_size=None, public=None, headers=None):
    """
    Initialize a DatasetList instance.

    Args:
        page_size (int, optional): The number of items to request per page.
        public (bool, optional): Whether the items should be publicly accessible.
        headers (dict, optional): Headers to be included in API requests.
    """
    # "datasets" is the base endpoint and "dataset" the resource name given to the parent list.
    super().__init__("datasets", "dataset", page_size, public, headers)

Reference for hub_sdk/modules/datasets.py

hub_sdk.modules.datasets.Datasets

create_dataset

delete

get_data

get_download_link

update

upload_dataset

hub_sdk.modules.datasets.DatasetList

Reference for `hub_sdk/modules/datasets.py`