Skip to content

Reference for ultralytics/data/split.py

Note

This file is available at https://github.com/ultralytics/ultralytics/blob/main/ultralytics/data/split.py. If you spot a problem, please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!


ultralytics.data.split.split_classify_dataset

split_classify_dataset(
    source_dir: str | Path, train_ratio: float = 0.8
) -> Path

Split classification dataset into train and val directories in a new directory.

Creates a new directory '{source_dir}_split' with train/val subdirectories, preserving the original class structure with an 80/20 split by default.

Directory structure

Before:
    caltech/
    ├── class1/
    │   ├── img1.jpg
    │   ├── img2.jpg
    │   └── ...
    ├── class2/
    │   ├── img1.jpg
    │   └── ...
    └── ...

After:
    caltech_split/
    ├── train/
    │   ├── class1/
    │   │   ├── img1.jpg
    │   │   └── ...
    │   ├── class2/
    │   │   ├── img1.jpg
    │   │   └── ...
    │   └── ...
    └── val/
        ├── class1/
        │   ├── img2.jpg
        │   └── ...
        ├── class2/
        │   └── ...
        └── ...

Parameters:

Name Type Description Default
source_dir str | Path

Path to classification dataset root directory.

required
train_ratio float

Ratio for train split, between 0 and 1.

0.8

Returns:

Type Description
Path

Path to the created split directory.

Examples:

Split dataset with default 80/20 ratio

>>> split_classify_dataset("path/to/caltech")

Split with custom ratio

>>> split_classify_dataset("path/to/caltech", 0.75)
Source code in ultralytics/data/split.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def split_classify_dataset(source_dir: str | Path, train_ratio: float = 0.8) -> Path:
    """
    Split classification dataset into train and val directories in a new directory.

    Creates a new directory '{source_dir}_split' with train/val subdirectories, preserving the original class
    structure with an 80/20 split by default. Files are copied (metadata included), so the source dataset is left
    untouched. For each class the train subset receives ``int(n_files * train_ratio)`` randomly chosen files and
    the val subset receives the remainder.

    Directory structure:
        Before:
            caltech/
            ├── class1/
            │   ├── img1.jpg
            │   ├── img2.jpg
            │   └── ...
            ├── class2/
            │   ├── img1.jpg
            │   └── ...
            └── ...

        After:
            caltech_split/
            ├── train/
            │   ├── class1/
            │   │   ├── img1.jpg
            │   │   └── ...
            │   ├── class2/
            │   │   ├── img1.jpg
            │   │   └── ...
            │   └── ...
            └── val/
                ├── class1/
                │   ├── img2.jpg
                │   └── ...
                ├── class2/
                │   └── ...
                └── ...

    Args:
        source_dir (str | Path): Path to classification dataset root directory.
        train_ratio (float): Ratio for train split, between 0 and 1.

    Returns:
        (Path): Path to the created split directory.

    Raises:
        ValueError: If train_ratio is not within the closed interval [0, 1].

    Notes:
        Existing files in the split directory are not removed. Because the shuffle uses the global `random` state,
        re-running over a previous output without a fixed seed can leave the same image in both train and val.

    Examples:
        Split dataset with default 80/20 ratio
        >>> split_classify_dataset("path/to/caltech")

        Split with custom ratio
        >>> split_classify_dataset("path/to/caltech", 0.75)
    """
    # Reject ratios that would silently produce a one-sided split and a nonsensical log line (also catches NaN)
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    source_path = Path(source_dir)
    split_path = Path(f"{source_path}_split")
    train_path, val_path = split_path / "train", split_path / "val"

    # Scan the source before creating anything so a missing source_dir does not leave empty output directories.
    # Sorting makes the pre-shuffle order independent of filesystem listing order, so a seeded RNG is reproducible.
    class_dirs = sorted(d for d in source_path.iterdir() if d.is_dir())
    # Keep regular files only: "*.*" also matches sub-directories such as "v1.0", which shutil.copy2 cannot copy
    class_files = {d: sorted(f for f in d.glob("*.*") if f.is_file()) for d in class_dirs}

    total_images = sum(len(files) for files in class_files.values())
    stats = f"{len(class_dirs)} classes, {total_images} images"
    LOGGER.info(f"Splitting {source_path} ({stats}) into {train_ratio:.0%} train, {1 - train_ratio:.0%} val...")

    # Create directory structure
    for split_dir in (train_path, val_path):
        split_dir.mkdir(parents=True, exist_ok=True)

    for class_dir, image_files in class_files.items():
        random.shuffle(image_files)
        split_idx = int(len(image_files) * train_ratio)  # truncates, so any remainder goes to val

        subsets = (
            (train_path / class_dir.name, image_files[:split_idx]),
            (val_path / class_dir.name, image_files[split_idx:]),
        )
        for dst_dir, subset in subsets:
            dst_dir.mkdir(exist_ok=True)  # created even when the subset is empty, mirroring the class structure
            for img in subset:
                shutil.copy2(img, dst_dir / img.name)

    LOGGER.info(f"Split complete in {split_path} ✅")
    return split_path





ultralytics.data.split.autosplit

autosplit(
    path: Path = DATASETS_DIR / "coco8/images",
    weights: tuple[float, float, float] = (0.9, 0.1, 0.0),
    annotated_only: bool = False,
) -> None

Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.

Parameters:

Name Type Description Default
path Path

Path to images directory.

DATASETS_DIR / 'coco8/images'
weights tuple

Train, validation, and test split fractions.

(0.9, 0.1, 0.0)
annotated_only bool

If True, only images with an associated txt file are used.

False

Examples:

Split images with default weights

>>> from ultralytics.data.split import autosplit
>>> autosplit()

Split with custom weights and annotated images only

>>> autosplit(path="path/to/images", weights=(0.8, 0.15, 0.05), annotated_only=True)
Source code in ultralytics/data/split.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
def autosplit(
    path: Path = DATASETS_DIR / "coco8/images",
    weights: tuple[float, float, float] = (0.9, 0.1, 0.0),
    annotated_only: bool = False,
) -> None:
    """
    Automatically split a dataset into train/val/test splits and save the resulting splits into autosplit_*.txt files.

    Image files are collected recursively under `path`, each one is randomly assigned to train, val or test
    according to `weights`, and its path (relative to the parent of `path`, prefixed with "./") is written to the
    matching autosplit_train.txt, autosplit_val.txt or autosplit_test.txt in the parent of `path`. Any existing
    autosplit_*.txt files there are removed first; a split that receives no images gets no file.

    Args:
        path (Path): Path to images directory.
        weights (tuple): Train, validation, and test split fractions.
        annotated_only (bool): If True, only images with an associated txt file are used.

    Notes:
        This function calls `random.seed(0)`, so it reseeds Python's global random generator as a side effect.

    Examples:
        Split images with default weights
        >>> from ultralytics.data.split import autosplit
        >>> autosplit()

        Split with custom weights and annotated images only
        >>> autosplit(path="path/to/images", weights=(0.8, 0.15, 0.05), annotated_only=True)
    """
    path = Path(path)  # images dir
    root = path.parent  # split files live next to the images dir and entries are relative to it
    files = sorted(x for x in path.rglob("*.*") if x.suffix[1:].lower() in IMG_FORMATS)  # image files only
    n = len(files)  # number of files
    random.seed(0)  # for reproducibility
    indices = random.choices([0, 1, 2], weights=weights, k=n)  # assign each image to a split

    txt = ["autosplit_train.txt", "autosplit_val.txt", "autosplit_test.txt"]  # 3 txt files
    for name in txt:
        (root / name).unlink(missing_ok=True)  # remove existing

    LOGGER.info(f"Autosplitting images from {path}" + ", using *.txt labeled images only" * annotated_only)

    # Buffer entries per split so each output file is opened once instead of once per image
    entries = [[] for _ in txt]
    for i, img in TQDM(zip(indices, files), total=n):
        if not annotated_only or Path(img2label_paths([str(img)])[0]).exists():  # check label
            entries[i].append(f"./{img.relative_to(root).as_posix()}\n")

    for name, lines in zip(txt, entries):
        if lines:  # skip empty splits so no empty txt file is created
            with open(root / name, "w", encoding="utf-8") as f:
                f.writelines(lines)





📅 Created 6 months ago ✏️ Updated 6 months ago