Skip to content

Reference for ultralytics/models/yolo/world/


The source for this file is available in the Ultralytics repository under ultralytics/models/yolo/world/. If you spot a problem, please help fix it by contributing a Pull Request 🛠️. Thank you 🙏!

Bases: DetectionTrainer

A class to fine-tune a world model on a closed-set dataset.

from ultralytics.models.yolo.world import WorldTrainer

args = dict(model='yolov8s-world.pt', data='coco8.yaml', epochs=3)
trainer = WorldTrainer(overrides=args)
Source code in ultralytics/models/yolo/world/
class WorldTrainer(yolo.detect.DetectionTrainer):
    """
    A class to fine-tune a world model on a closed-set dataset.

    Example:
        ```python
        from ultralytics.models.yolo.world import WorldTrainer

        args = dict(model="yolov8s-world.pt", data="coco8.yaml", epochs=3)  # model: a YOLO-World checkpoint
        trainer = WorldTrainer(overrides=args)
        ```
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """
        Initialize a WorldTrainer object with given arguments.

        Args:
            cfg: Base configuration forwarded to the parent trainer. Defaults to DEFAULT_CFG.
            overrides (dict, optional): Configuration overrides; None is replaced with an empty dict.
            _callbacks: Callbacks forwarded unchanged to the parent trainer.
        """
        if overrides is None:
            overrides = {}
        super().__init__(cfg, overrides, _callbacks)

        # Import and assign clip
        try:
            import clip
        except ImportError:
            # Retrying the import only makes sense after the package has been installed.
            # NOTE(review): install step reconstructed; confirm the requirement spec against upstream.
            from ultralytics.utils import checks

            checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
            import clip
        self.clip = clip

    def get_model(self, cfg=None, weights=None, verbose=True):
        """
        Return WorldModel initialized with specified config and weights.

        Args:
            cfg (dict | Any, optional): Model config; when a dict, its "yaml_file" entry is passed to WorldModel.
            weights (optional): Weights to load into the model; skipped when falsy.
            verbose (bool): Request verbose model construction; only honoured when RANK == -1.

        Returns:
            The constructed WorldModel.
        """
        # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
        # NOTE: Following the official config, nc hard-coded to 80 for now.
        # NOTE(review): `self.data` reconstructed as the dataset dict holding "nc" — confirm against upstream.
        model = WorldModel(
            cfg["yaml_file"] if isinstance(cfg, dict) else cfg,
            nc=min(self.data["nc"], 80),
            verbose=verbose and RANK == -1,
        )
        if weights:
            model.load(weights)
        # The callback attaches the text model that `preprocess_batch` reads from `self.text_model`.
        self.add_callback("on_pretrain_routine_end", on_pretrain_routine_end)

        return model

    def build_dataset(self, img_path, mode="train", batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.

        Returns:
            The dataset returned by `build_yolo_dataset`.
        """
        # Use the model's largest stride when a model exists, but never less than 32.
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        # NOTE(review): `self.data` reconstructed as the fourth positional argument — confirm against upstream.
        return build_yolo_dataset(
            self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
        )

    def preprocess_batch(self, batch):
        """
        Preprocesses a batch of images for YOLOWorld training, adjusting formatting and dimensions as needed.

        Args:
            batch (dict): Batch with at least "img" and "texts" entries.

        Returns:
            (dict): The batch produced by the parent `preprocess_batch`, with a "txt_feats" entry added.
        """
        batch = super().preprocess_batch(batch)

        # NOTE: add text features
        texts = list(itertools.chain(*batch["texts"]))
        text_token = self.clip.tokenize(texts).to(batch["img"].device)
        # Match the image dtype so text and image features can be combined downstream.
        txt_feats = self.text_model.encode_text(text_token).to(dtype=batch["img"].dtype)  # torch.float32
        # L2-normalise each text embedding along the feature axis.
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        # Regroup the flattened embeddings: one row per entry of batch["texts"].
        batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
        return batch

__init__(cfg=DEFAULT_CFG, overrides=None, _callbacks=None)

Initialize a WorldTrainer object with given arguments.

Source code in ultralytics/models/yolo/world/
def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
    """
    Initialize a WorldTrainer object with given arguments.

    Args:
        cfg: Base configuration forwarded to the parent trainer. Defaults to DEFAULT_CFG.
        overrides (dict, optional): Configuration overrides; None is replaced with an empty dict.
        _callbacks: Callbacks forwarded unchanged to the parent trainer.
    """
    if overrides is None:
        overrides = {}
    super().__init__(cfg, overrides, _callbacks)

    # Import and assign clip
    try:
        import clip
    except ImportError:
        # Retrying the import only makes sense after the package has been installed.
        # NOTE(review): install step reconstructed; confirm the requirement spec against upstream.
        from ultralytics.utils import checks

        checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
        import clip
    self.clip = clip

build_dataset(img_path, mode='train', batch=None)

Build YOLO Dataset.


Name Type Description Default
img_path str

Path to the folder containing images.

mode str

train mode or val mode, users are able to customize different augmentations for each mode.

batch int

Size of batches, this is for rect. Defaults to None.

Source code in ultralytics/models/yolo/world/
def build_dataset(self, img_path, mode="train", batch=None):
    """
    Build YOLO Dataset.

    Args:
        img_path (str): Path to the folder containing images.
        mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
        batch (int, optional): Size of batches, this is for `rect`. Defaults to None.

    Returns:
        The dataset returned by `build_yolo_dataset`.
    """
    # Use the model's largest stride when a model exists, but never less than 32.
    gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
    # NOTE(review): `self.data` reconstructed as the fourth positional argument — confirm against upstream.
    return build_yolo_dataset(
        self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
    )

get_model(cfg=None, weights=None, verbose=True)

Return WorldModel initialized with specified config and weights.

Source code in ultralytics/models/yolo/world/
def get_model(self, cfg=None, weights=None, verbose=True):
    """
    Return WorldModel initialized with specified config and weights.

    Args:
        cfg (dict | Any, optional): Model config; when a dict, its "yaml_file" entry is passed to WorldModel.
        weights (optional): Weights to load into the model; skipped when falsy.
        verbose (bool): Request verbose model construction; only honoured when RANK == -1.

    Returns:
        The constructed WorldModel.
    """
    # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
    # NOTE: Following the official config, nc hard-coded to 80 for now.
    # NOTE(review): `self.data` reconstructed as the dataset dict holding "nc" — confirm against upstream.
    model = WorldModel(
        cfg["yaml_file"] if isinstance(cfg, dict) else cfg,
        nc=min(self.data["nc"], 80),
        verbose=verbose and RANK == -1,
    )
    if weights:
        model.load(weights)
    # Registered so the text model is attached to the trainer once the pretrain routine ends.
    self.add_callback("on_pretrain_routine_end", on_pretrain_routine_end)

    return model


preprocess_batch(batch)

Preprocesses a batch of images for YOLOWorld training, adjusting formatting and dimensions as needed.

Source code in ultralytics/models/yolo/world/
def preprocess_batch(self, batch):
    """
    Run the parent preprocessing, then attach normalised text embeddings to the batch.

    Args:
        batch (dict): Batch with at least "img" and "texts" entries.

    Returns:
        (dict): The same batch with a "txt_feats" entry added.
    """
    batch = super().preprocess_batch(batch)

    images = batch["img"]
    prompts = batch["texts"]

    # NOTE: add text features
    # Flatten the per-entry prompt lists so they are tokenised in a single call.
    flat_prompts = list(itertools.chain.from_iterable(prompts))
    tokens = self.clip.tokenize(flat_prompts).to(images.device)
    embeddings = self.text_model.encode_text(tokens).to(dtype=images.dtype)  # torch.float32
    # L2-normalise each embedding along the feature axis.
    embeddings = embeddings / embeddings.norm(p=2, dim=-1, keepdim=True)
    # Regroup the flattened embeddings: one row per entry of batch["texts"].
    batch["txt_feats"] = embeddings.reshape(len(prompts), -1, embeddings.shape[-1])
    return batch


on_pretrain_routine_end(trainer)

Source code in ultralytics/models/yolo/world/
def on_pretrain_routine_end(trainer):
    """
    Callback for the end of the pretrain routine: set evaluation classes and attach the CLIP text model.

    Args:
        trainer: Trainer instance. Reads `trainer.ema`, `trainer.model` and `trainer.clip`;
            assigns `trainer.text_model`.
    """
    if RANK in {-1, 0}:
        # NOTE: for evaluation
        # NOTE(review): the source of the names dict is reconstructed — confirm against upstream.
        class_names = trainer.test_loader.dataset.data["names"]
        # Keep only the part of each class name before the first "/".
        names = [name.split("/")[0] for name in class_names.values()]
        de_parallel(trainer.ema.ema).set_classes(names, cache_clip_model=False)
    # Load the text model onto the same device as the trainer's model parameters.
    device = next(trainer.model.parameters()).device
    trainer.text_model, _ = trainer.clip.load("ViT-B/32", device=device)
    # NOTE(review): loop body reconstructed as freezing the text model's parameters — confirm against upstream.
    for p in trainer.text_model.parameters():
        p.requires_grad_(False)

Created 2024-03-31, Updated 2024-05-08
Authors: Burhan-Q (1), Laughing-q (1)