
Reference for ultralytics/utils/dist.py

Improvements

This page is sourced from https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/dist.py. Have an improvement or example to add? Open a Pull Request — thank you! 🙏


function ultralytics.utils.dist.find_free_network_port

```python
def find_free_network_port() -> int
```

Find a free port on localhost.

It is useful in single-node training, where we don't want to connect to a real main node but still have to set the `MASTER_PORT` environment variable.

Returns

| Type | Description |
|------|-------------|
| `int` | The available network port number. |
Source code in `ultralytics/utils/dist.py`

```python
def find_free_network_port() -> int:
    """Find a free port on localhost.

    It is useful in single-node training when we don't want to connect to a real main node but have to set the
    `MASTER_PORT` environment variable.

    Returns:
        (int): The available network port number.
    """
    import socket

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("127.0.0.1", 0))
        return s.getsockname()[1]  # port
```
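
A short usage sketch: reserve a port and export it as `MASTER_PORT` before launching workers. The `os.environ` wiring here is illustrative, not part of this module.

```python
import os

from ultralytics.utils.dist import find_free_network_port

# Reserve a currently free localhost port for the torch.distributed rendezvous
port = find_free_network_port()
os.environ["MASTER_PORT"] = str(port)  # environment variables must be strings
print(f"MASTER_PORT={port}")
```

Note that the socket is closed before the port is actually used, so there is a brief window in which another process could claim it; in that unlikely case the distributed launcher would typically fail with an address-in-use error.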





function ultralytics.utils.dist.generate_ddp_file

```python
def generate_ddp_file(trainer: BaseTrainer) -> str
```

Generate a DDP (Distributed Data Parallel) file for multi-GPU training.

This function creates a temporary Python file that enables distributed training across multiple GPUs. The file contains the necessary configuration to initialize the trainer in a distributed environment.

Args

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `trainer` | `ultralytics.engine.trainer.BaseTrainer` | The trainer containing training configuration and arguments. Must have an `args` attribute and be a class instance. | required |

Returns

| Type | Description |
|------|-------------|
| `str` | Path to the generated temporary DDP file. |

Notes

The generated file is saved in the `USER_CONFIG_DIR/DDP` directory and includes:

- Trainer class import
- Configuration overrides from the trainer arguments
- Model path configuration
- Training initialization code

Source code in `ultralytics/utils/dist.py`

```python
def generate_ddp_file(trainer: BaseTrainer) -> str:
    """Generate a DDP (Distributed Data Parallel) file for multi-GPU training.

    This function creates a temporary Python file that enables distributed training across multiple GPUs. The file
    contains the necessary configuration to initialize the trainer in a distributed environment.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The trainer containing training configuration and arguments.
            Must have args attribute and be a class instance.

    Returns:
        (str): Path to the generated temporary DDP file.

    Notes:
        The generated file is saved in the USER_CONFIG_DIR/DDP directory and includes:
        - Trainer class import
        - Configuration overrides from the trainer arguments
        - Model path configuration
        - Training initialization code
    """
    module, name = f"{trainer.__class__.__module__}.{trainer.__class__.__name__}".rsplit(".", 1)

    content = f"""
# Ultralytics Multi-GPU training temp file (should be automatically deleted after use)
from pathlib import Path, PosixPath  # For model arguments stored as Path instead of str
overrides = {vars(trainer.args)}

if __name__ == "__main__":
    from {module} import {name}
    from ultralytics.utils import DEFAULT_CFG_DICT

    cfg = DEFAULT_CFG_DICT.copy()
    cfg.update(save_dir='')   # handle the extra key 'save_dir'
    trainer = {name}(cfg=cfg, overrides=overrides)
    trainer.args.model = "{getattr(trainer.hub_session, "model_url", trainer.args.model)}"
    results = trainer.train()
"""
    (USER_CONFIG_DIR / "DDP").mkdir(exist_ok=True)
    with tempfile.NamedTemporaryFile(
        prefix="_temp_",
        suffix=f"{id(trainer)}.py",
        mode="w+",
        encoding="utf-8",
        dir=USER_CONFIG_DIR / "DDP",
        delete=False,
    ) as file:
        file.write(content)
    return file.name
```
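
A minimal sketch of calling this helper directly; the `DetectionTrainer` setup and override values are illustrative assumptions (normally `generate_ddp_command` creates this file for you):

```python
from ultralytics.models.yolo.detect import DetectionTrainer
from ultralytics.utils.dist import generate_ddp_file

# Hypothetical trainer; any BaseTrainer subclass with an `args` attribute works
trainer = DetectionTrainer(overrides={"model": "yolo11n.pt", "data": "coco8.yaml", "epochs": 1})
ddp_file = generate_ddp_file(trainer)
print(ddp_file)  # e.g. ~/.config/Ultralytics/DDP/_temp_*<id>.py
```

Because the overrides are embedded verbatim via `vars(trainer.args)`, the generated file is self-contained and can be executed independently by each DDP worker process.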





function ultralytics.utils.dist.generate_ddp_command

```python
def generate_ddp_command(trainer: BaseTrainer) -> tuple[list[str], str]
```

Generate the command for distributed training.

Args

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `trainer` | `ultralytics.engine.trainer.BaseTrainer` | The trainer containing configuration for distributed training. | required |

Returns

| Type | Description |
|------|-------------|
| `cmd` (`list[str]`) | The command to execute for distributed training. |
| `file` (`str`) | Path to the temporary file created for DDP training. |
Source code in `ultralytics/utils/dist.py`

```python
def generate_ddp_command(trainer: BaseTrainer) -> tuple[list[str], str]:
    """Generate command for distributed training.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The trainer containing configuration for distributed training.

    Returns:
        cmd (list[str]): The command to execute for distributed training.
        file (str): Path to the temporary file created for DDP training.
    """
    import __main__  # noqa local import to avoid https://github.com/Lightning-AI/pytorch-lightning/issues/15218

    if not trainer.resume:
        shutil.rmtree(trainer.save_dir)  # remove the save_dir
    file = generate_ddp_file(trainer)
    dist_cmd = "torch.distributed.run" if TORCH_1_9 else "torch.distributed.launch"
    port = find_free_network_port()
    cmd = [
        sys.executable,
        "-m",
        dist_cmd,
        "--nproc_per_node",
        f"{trainer.world_size}",
        "--master_port",
        f"{port}",
        file,
    ]
    return cmd, file
```
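
A hedged end-to-end sketch of how the returned command might be launched with `subprocess` and then cleaned up; the trainer construction and manual `world_size` assignment are assumptions for illustration (internally `BaseTrainer` handles both):

```python
import subprocess

from ultralytics.models.yolo.detect import DetectionTrainer
from ultralytics.utils.dist import ddp_cleanup, generate_ddp_command

# Hypothetical 2-GPU setup; generate_ddp_command reads trainer.world_size and trainer.resume
trainer = DetectionTrainer(overrides={"model": "yolo11n.pt", "data": "coco8.yaml", "device": "0,1"})
trainer.world_size = 2  # normally derived from the device list by BaseTrainer

cmd, file = generate_ddp_command(trainer)  # e.g. [python, -m, torch.distributed.run, ...]
try:
    subprocess.run(cmd, check=True)  # blocks until distributed training exits
finally:
    ddp_cleanup(trainer, file)  # remove the temp DDP file even on failure
```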





function ultralytics.utils.dist.ddp_cleanup

```python
def ddp_cleanup(trainer: BaseTrainer, file: str) -> None
```

Delete the temporary file if it was created during distributed data parallel (DDP) training.

This function checks if the provided file contains the trainer's ID in its name, indicating it was created as a temporary file for DDP training, and deletes it if so.

Args

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `trainer` | `ultralytics.engine.trainer.BaseTrainer` | The trainer used for distributed training. | required |
| `file` | `str` | Path to the file that might need to be deleted. | required |

Examples

```python
>>> trainer = YOLOTrainer()
>>> file = "/tmp/ddp_temp_123456789.py"
>>> ddp_cleanup(trainer, file)
```
Source code in `ultralytics/utils/dist.py`

```python
def ddp_cleanup(trainer: BaseTrainer, file: str) -> None:
    """Delete temporary file if created during distributed data parallel (DDP) training.

    This function checks if the provided file contains the trainer's ID in its name, indicating it was created as a
    temporary file for DDP training, and deletes it if so.

    Args:
        trainer (ultralytics.engine.trainer.BaseTrainer): The trainer used for distributed training.
        file (str): Path to the file that might need to be deleted.

    Examples:
        >>> trainer = YOLOTrainer()
        >>> file = "/tmp/ddp_temp_123456789.py"
        >>> ddp_cleanup(trainer, file)
    """
    if f"{id(trainer)}.py" in file:  # if temp_file suffix in file
        os.remove(file)
```
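
A minimal sketch demonstrating the suffix check, using a dummy object in place of a real trainer, since `ddp_cleanup` only needs `id(trainer)`:

```python
import tempfile
from pathlib import Path

from ultralytics.utils.dist import ddp_cleanup


class _DummyTrainer:  # hypothetical stand-in; ddp_cleanup only uses id(trainer)
    pass


trainer = _DummyTrainer()
temp = Path(tempfile.gettempdir()) / f"_temp_{id(trainer)}.py"
temp.write_text("# fake DDP temp file")

ddp_cleanup(trainer, str(temp))  # suffix f"{id(trainer)}.py" matches -> file removed
assert not temp.exists()
ddp_cleanup(trainer, "/tmp/other.py")  # no matching suffix -> left untouched
```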




