HUB training session for Ultralytics HUB YOLO models. Handles model initialization, heartbeats, and checkpointing.
This class encapsulates the functionality for interacting with Ultralytics HUB during model training, including
model creation, metrics tracking, and checkpoint uploading.
Attributes:
Name |
Type |
Description |
model_id |
str
|
Identifier for the YOLO model being trained.
|
model_url |
str
|
URL for the model in Ultralytics HUB.
|
rate_limits |
dict
|
Rate limits for different API calls (in seconds).
|
timers |
dict
|
Timers for rate limiting.
|
metrics_queue |
dict
|
Queue for the model's metrics.
|
metrics_upload_failed_queue |
dict
|
Queue for metrics that failed to upload.
|
model |
dict
|
Model data fetched from Ultralytics HUB.
|
model_file |
str
|
Local weights file obtained for an already-trained HUB model (set by load_model).
|
train_args |
dict
|
Arguments for training the model.
|
client |
HUBClient
|
Client for interacting with Ultralytics HUB.
|
filename |
str
|
Model filename parsed from the identifier; used to name a newly created HUB model.
|
Examples:
>>> session = HUBTrainingSession("https://hub.ultralytics.com/models/example-model")
>>> session.upload_metrics()
Parameters:
Name |
Type |
Description |
Default |
identifier
|
str
|
Model identifier used to initialize the HUB training session.
It can be a URL string or a model key with specific format.
|
required
|
Raises:
Type |
Description |
ValueError
|
If the provided model identifier is invalid.
|
ConnectionError
|
If connecting with global API key is not supported.
|
ModuleNotFoundError
|
If hub-sdk package is not installed.
|
Source code in ultralytics/hub/session.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def __init__(self, identifier):
    """
    Set up a HUB training session for the model referenced by `identifier`.

    The identifier is split by `_parse_identifier` into an API key, a model ID and a filename. A `HUBClient` is
    then built from the resulting credentials, and either an existing model is loaded (model ID present) or an
    empty model object is created.

    Args:
        identifier (str): Model identifier used to initialize the HUB training session.
            It can be a URL string or a model key with specific format.

    Raises:
        ValueError: If the provided model identifier is invalid.
        ConnectionError: If connecting with global API key is not supported.
        ModuleNotFoundError: If hub-sdk package is not installed.
    """
    from hub_sdk import HUBClient

    # Rate limits in seconds for each kind of API call
    self.rate_limits = {"metrics": 3, "ckpt": 900, "heartbeat": 300}
    # Per-epoch metrics waiting for upload, and metrics whose upload failed
    self.metrics_queue = {}
    self.metrics_upload_failed_queue = {}
    # Timers are held here and used in ultralytics/utils/callbacks/hub.py
    self.timers = {}
    self.model = self.model_url = self.model_file = self.train_args = None

    # Break the identifier into its parts
    parsed_key, model_id, self.filename = self._parse_identifier(identifier)

    # A key parsed from the identifier takes precedence over the one stored in SETTINGS
    active_key = parsed_key or SETTINGS.get("api_key")
    if active_key:
        credentials = {"api_key": active_key}
    else:
        credentials = None

    self.client = HUBClient(credentials)

    # Any failure while loading is swallowed; only a login hint is logged for unauthenticated HUB URLs
    try:
        if not model_id:
            self.model = self.client.model()  # empty model
        else:
            self.load_model(model_id)  # existing model
    except Exception:
        is_hub_url = identifier.startswith(f"{HUB_WEB_ROOT}/models/")
        if is_hub_url and not self.client.authenticated:
            LOGGER.warning(
                f"{PREFIX}Please log in using 'yolo login API_KEY'. "
                "You can find your API Key at: https://hub.ultralytics.com/settings?tab=api+keys."
            )
|
create_model
Initialize a HUB training session with the specified model arguments.
Parameters:
Name |
Type |
Description |
Default |
model_args
|
dict
|
Arguments for creating the model, including batch size, epochs, image size, etc.
|
required
|
Returns:
Type |
Description |
None
|
If the model could not be created.
|
Source code in ultralytics/hub/session.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def create_model(self, model_args):
    """
    Create a new model on Ultralytics HUB from the given training arguments.

    Builds the creation payload from `model_args` and `self.filename`, submits it through `self.model`, and on
    success stores the model URL, starts heartbeats and logs the URL.

    Args:
        model_args (dict): Arguments for creating the model, including batch size, epochs, image size, etc.

    Returns:
        (None): Always None; the method exits early if the model could not be created.
    """
    filename = self.filename

    config = {
        "batchSize": model_args.get("batch", -1),
        "epochs": model_args.get("epochs", 300),
        "imageSize": model_args.get("imgsz", 640),
        "patience": model_args.get("patience", 100),
        "device": str(model_args.get("device", "")),  # None becomes the string "None"
        "cache": str(model_args.get("cache", "ram")),  # True, False, None become strings
    }
    dataset = {"name": model_args.get("data")}

    # Architecture name is the filename with the ".pt" / ".yaml" substrings removed
    architecture_name = filename.replace(".pt", "").replace(".yaml", "")
    # Only ".pt" checkpoints are recorded as a parent
    parent = {"name": filename} if filename.endswith(".pt") else {}

    payload = {
        "config": config,
        "dataset": dataset,
        "lineage": {"architecture": {"name": architecture_name}, "parent": parent},
        "meta": {"name": filename},
    }
    self.model.create_model(payload)

    # TODO: improve error handling
    if self.model.id:
        self.model_url = f"{HUB_WEB_ROOT}/models/{self.model.id}"

        # Start heartbeats for HUB to monitor agent
        self.model.start_heartbeat(self.rate_limits["heartbeat"])

        LOGGER.info(f"{PREFIX}View model at {self.model_url} 🚀")

    # Reached both on success and when the model could not be created
    return None
|
create_session
classmethod
create_session(identifier, args=None)
Create an authenticated HUBTrainingSession or return None.
Parameters:
Name |
Type |
Description |
Default |
identifier
|
str
|
Model identifier used to initialize the HUB training session.
|
required
|
args
|
dict
|
Arguments for creating a new model if identifier is not a HUB model URL.
|
None
|
Returns:
Type |
Description |
HUBTrainingSession | None
|
An authenticated session or None if creation fails.
|
Source code in ultralytics/hub/session.py
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
@classmethod
def create_session(cls, identifier, args=None):
    """
    Create an authenticated HUBTrainingSession or return None.

    Args:
        identifier (str): Model identifier used to initialize the HUB training session.
        args (dict, optional): Arguments for creating a new model if identifier is not a HUB model URL.

    Returns:
        (HUBTrainingSession | None): An authenticated session or None if creation fails.
    """
    try:
        session = cls(identifier)
        if args and not identifier.startswith(f"{HUB_WEB_ROOT}/models/"):  # not a HUB model URL
            session.create_model(args)
        # Explicit check instead of `assert`: assertions are stripped under `python -O`, which would let a
        # session without a loaded HUB model slip through.
        if not session.model.id:
            return None  # HUB model not loaded correctly
        return session
    # PermissionError and ModuleNotFoundError indicate hub-sdk not installed
    # AssertionError stays in the tuple so assertions raised further down the call chain are still handled
    except (PermissionError, ModuleNotFoundError, AssertionError):
        return None
|
load_model
Load an existing model from Ultralytics HUB using the provided model identifier.
Parameters:
Name |
Type |
Description |
Default |
model_id
|
str
|
The identifier of the model to load.
|
required
|
Raises:
Type |
Description |
ValueError
|
If the specified HUB model does not exist.
|
Source code in ultralytics/hub/session.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def load_model(self, model_id):
    """
    Fetch an existing Ultralytics HUB model and prepare the session for it.

    For a model that is already trained, the "best" weights are fetched via `checks.check_file` and stored in
    `self.model_file`. Otherwise the training arguments are set and heartbeats are started.

    Args:
        model_id (str): The identifier of the model to load.

    Raises:
        ValueError: If the specified HUB model does not exist.
    """
    self.model = self.client.model(model_id)
    if not self.model.data:  # no data means the model does not exist
        raise ValueError(emojis("❌ The specified HUB model does not exist"))  # TODO: improve error handling

    self.model_url = f"{HUB_WEB_ROOT}/models/{self.model.id}"

    if not self.model.is_trained():
        # Set training args and start heartbeats for HUB to monitor agent
        self._set_train_args()
        self.model.start_heartbeat(self.rate_limits["heartbeat"])
        LOGGER.info(f"{PREFIX}View model at {self.model_url} 🚀")
        return

    LOGGER.info(f"Loading trained HUB model {self.model_url} 🚀")
    weights_url = self.model.get_weights_url("best")  # download URL with auth
    download_dir = Path(SETTINGS["weights_dir"]) / "hub" / self.model.id
    self.model_file = checks.check_file(weights_url, download_dir=download_dir)
|
request_queue
request_queue(
request_func,
retry=3,
timeout=30,
thread=True,
verbose=True,
progress_total=None,
stream_response=None,
*args,
**kwargs
)
Attempt to execute request_func
with retries, timeout handling, optional threading, and progress tracking.
Parameters:
Name |
Type |
Description |
Default |
request_func
|
callable
|
The function to execute.
|
required
|
retry
|
int
|
Number of retry attempts.
|
3
|
timeout
|
int
|
Maximum time to wait for the request to complete.
|
30
|
thread
|
bool
|
Whether to run the request in a separate thread.
|
True
|
verbose
|
bool
|
Whether to log detailed messages.
|
True
|
progress_total
|
int
|
Total size for progress tracking.
|
None
|
stream_response
|
bool
|
Whether to stream the response.
|
None
|
*args
|
Any
|
Additional positional arguments for request_func.
|
()
|
**kwargs
|
Any
|
Additional keyword arguments for request_func.
|
{}
|
Returns:
Type |
Description |
Response | None
|
The response object if thread=False, otherwise None.
|
Source code in ultralytics/hub/session.py
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
def request_queue(
    self,
    request_func,
    retry=3,
    timeout=30,
    thread=True,
    verbose=True,
    progress_total=None,
    stream_response=None,
    *args,
    **kwargs,
):
    """
    Attempt to execute `request_func` with retries, timeout handling, optional threading, and progress tracking.

    Args:
        request_func (callable): The function to execute.
        retry (int): Number of retry attempts.
        timeout (int): Maximum time to wait for the request to complete.
        thread (bool): Whether to run the request in a separate thread.
        verbose (bool): Whether to log detailed messages.
        progress_total (int, optional): Total size for progress tracking.
        stream_response (bool, optional): Whether to stream the response.
        *args (Any): Additional positional arguments for request_func.
        **kwargs (Any): Additional keyword arguments for request_func.

    Returns:
        (requests.Response | None): The response object if thread=False, otherwise None.
    """

    def retry_request():
        """Attempt to call `request_func` with retries, timeout, and optional threading."""
        t0 = time.time()  # Record the start time for the timeout
        response = None
        for i in range(retry + 1):
            # The timeout is only checked between attempts; a running request is not interrupted here
            if (time.time() - t0) > timeout:
                LOGGER.warning(f"{PREFIX}Timeout for request reached. {HELP_MSG}")
                break  # Timeout reached, exit loop

            response = request_func(*args, **kwargs)
            if response is None:
                LOGGER.warning(f"{PREFIX}Received no response from the request. {HELP_MSG}")
                time.sleep(2**i)  # Exponential backoff before retrying
                continue  # Skip further processing and retry

            if progress_total:
                self._show_upload_progress(progress_total, response)
            elif stream_response:
                self._iterate_content(response)

            if HTTPStatus.OK <= response.status_code < HTTPStatus.MULTIPLE_CHOICES:
                # if request related to metrics upload
                if kwargs.get("metrics"):
                    self.metrics_upload_failed_queue = {}
                return response  # Success, no need to retry

            if i == 0:
                # Initial attempt, check status code and provide messages
                message = self._get_failure_message(response, retry, timeout)

                if verbose:
                    LOGGER.warning(f"{PREFIX}{message} {HELP_MSG} ({response.status_code})")

            if not self._should_retry(response.status_code):
                # Fixed: the status code is now wrapped in a balanced pair of parentheses
                LOGGER.warning(f"{PREFIX}Request failed. {HELP_MSG} ({response.status_code})")
                break  # Not an error that should be retried, exit loop

            time.sleep(2**i)  # Exponential backoff for retries

        # if request related to metrics upload and exceed retries
        # NOTE(review): metrics are only queued when the last call returned no response at all; a failing
        # HTTP response is returned without queuing them. Confirm this is intended.
        if response is None and kwargs.get("metrics"):
            self.metrics_upload_failed_queue.update(kwargs.get("metrics"))

        return response

    if thread:
        # Start a new thread to run the retry_request function
        threading.Thread(target=retry_request, daemon=True).start()
    else:
        # If running in the main thread, call retry_request directly
        return retry_request()
|
upload_metrics
Upload model metrics to Ultralytics HUB.
Source code in ultralytics/hub/session.py
def upload_metrics(self):
    """Send a snapshot of the queued metrics to Ultralytics HUB through `request_queue` with thread=True."""
    # Copy so the request works on its own snapshot of the queue
    queued_metrics = self.metrics_queue.copy()
    return self.request_queue(self.model.upload_metrics, metrics=queued_metrics, thread=True)
|
upload_model
upload_model(
epoch: int,
weights: str,
is_best: bool = False,
map: float = 0.0,
final: bool = False,
) -> None
Upload a model checkpoint to Ultralytics HUB.
Parameters:
Name |
Type |
Description |
Default |
epoch
|
int
|
The current training epoch.
|
required
|
weights
|
str
|
Path to the model weights file.
|
required
|
is_best
|
bool
|
Indicates if the current model is the best one so far.
|
False
|
map
|
float
|
Mean average precision of the model.
|
0.0
|
final
|
bool
|
Indicates if the model is the final model after training.
|
False
|
Source code in ultralytics/hub/session.py
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
def upload_model(
    self,
    epoch: int,
    weights: str,
    is_best: bool = False,
    map: float = 0.0,
    final: bool = False,
) -> None:
    """
    Upload a model checkpoint to Ultralytics HUB.

    If the weights file is missing, a sibling "last" file with the same suffix is copied into its place when this
    is the final upload; otherwise a warning is logged and nothing is uploaded.

    Args:
        epoch (int): The current training epoch.
        weights (str): Path to the model weights file.
        is_best (bool): Indicates if the current model is the best one so far.
        map (float): Mean average precision of the model.
        final (bool): Indicates if the model is the final model after training.
    """
    weights = Path(weights)

    if not weights.is_file():
        fallback = weights.with_name(f"last{weights.suffix}")
        if not (final and fallback.is_file()):
            LOGGER.warning(f"{PREFIX} Model upload issue. Missing model {weights}.")
            return

        LOGGER.warning(
            f"{PREFIX} Model 'best.pt' not found, copying 'last.pt' to 'best.pt' and uploading. "
            "This often happens when resuming training in transient environments like Google Colab. "
            "For more reliable training, consider using Ultralytics HUB Cloud. "
            "Learn more at https://docs.ultralytics.com/hub/cloud-training."
        )
        shutil.copy(fallback, weights)  # copy last.pt to best.pt

    # Progress is only shown for the final upload, which also runs without a background thread
    upload_size = weights.stat().st_size if final else None

    self.request_queue(
        self.model.upload_model,
        epoch=epoch,
        weights=str(weights),
        is_best=is_best,
        map=map,
        final=final,
        retry=10,
        timeout=3600,
        thread=not final,
        progress_total=upload_size,
        stream_response=True,
    )
|