Meet YOLO26: next-gen vision AI.

Reference for ultralytics/utils/export/qnn.py

Improvements

This page is sourced from https://github.com/ultralytics/ultralytics/blob/main/ultralytics/utils/export/qnn.py. Have an improvement or example to add? Open a Pull Request — thank you! 🙏


Summary

Function ultralytics.utils.export.qnn.qnn_library_paths

def qnn_library_paths() -> tuple[str, str]

Resolve the QNN Execution Provider and HTP backend library paths for the installed onnxruntime-qnn build.

onnxruntime-qnn ships two ways: the Windows/Linux-aarch64 wheels expose an onnxruntime_qnn helper module, while the Linux x86-64 build bundles the libraries in onnxruntime/capi. Both register the QNN Execution Provider via register_execution_provider_library; only the library locations differ.

Returns

| Type | Description |
| --- | --- |
| `tuple[str, str]` | `(ep_library_path, htp_backend_path)` for `register_execution_provider_library` and the QNN HTP `backend_path` provider option. |
Source code in ultralytics/utils/export/qnn.py

View on GitHub

def qnn_library_paths() -> tuple[str, str]:
    """Locate the QNN Execution Provider library and the HTP backend library of the installed onnxruntime-qnn.

    Two packaging layouts are handled. The Windows/Linux-aarch64 wheels provide an `onnxruntime_qnn` helper module
    that reports both locations directly. The Linux x86-64 build has no such module and instead places the libraries
    in `onnxruntime/capi`. In either layout the QNN Execution Provider is registered through
    `register_execution_provider_library`; the layouts differ only in where the files live.

    Returns:
        (tuple[str, str]): `(ep_library_path, htp_backend_path)`. The first item is passed to
            `register_execution_provider_library`, the second is used as the QNN HTP `backend_path` provider option.
    """
    try:
        import onnxruntime_qnn as helper

        # Preferred route: the helper module knows where its own libraries are.
        ep_path = helper.get_library_path()
        htp_path = helper.get_qnn_htp_path()
        return ep_path, htp_path
    except ImportError:
        import onnxruntime

    # Fallback route: build the paths from the `capi` folder of the onnxruntime package, using the platform's
    # shared-library naming.
    capi_dir = Path(onnxruntime.__file__).parent / "capi"
    if WINDOWS:
        ep_name, htp_name = "onnxruntime_providers_qnn.dll", "QnnHtp.dll"
    else:
        ep_name, htp_name = "libonnxruntime_providers_qnn.so", "libQnnHtp.so"
    return str(capi_dir / ep_name), str(capi_dir / htp_name)





Function ultralytics.utils.export.qnn.onnx2qnn

def onnx2qnn(
    onnx_file: str | Path,
    output_dir: Path | str,
    dataset,
    transform_fn,
    name: str = "73",
    metadata: dict | None = None,
    prefix: str = "",
) -> str

Convert an ONNX model to an INT8 Qualcomm QNN context binary using the ONNX Runtime QNN Execution Provider.

The conversion runs entirely on the host with no Qualcomm account or cloud upload. The model is INT8-quantized with ONNX Runtime's QNN QDQ flow (the Hexagon NPU is an int8 accelerator), then the onnxruntime-qnn plugin Execution Provider — which bundles the Qualcomm AI Runtime (QAIRT) libraries and is registered at runtime — compiles the quantized graph into a QNN context binary embedded in <stem>_qnn.onnx. No inference is run.

Args

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `onnx_file` | `str \| Path` | Path to the source ONNX file (already exported). | *required* |
| `output_dir` | `Path \| str` | Directory to save the exported QNN model. | *required* |
| `dataset` | `DataLoader` | Calibration dataloader (from `Exporter.get_int8_calibration_dataloader`) used for INT8 quantization. | *required* |
| `transform_fn` | `Callable` | Preprocessing transform (`Exporter._transform_fn`) converting a calibration item to a normalized `float32` NCHW array. | *required* |
| `name` | `str` | Target Hexagon Tensor Processor (HTP) architecture version, e.g. `"73"` (Snapdragon 8 Gen 2), `"75"` (8 Gen 3), `"79"` (8 Elite). Finalizes the graph for the target chip when exporting on a host without a Snapdragon NPU. | `"73"` |
| `metadata` | `dict \| None` | Metadata saved as `metadata.yaml`. | `None` |
| `prefix` | `str` | Prefix for log messages. | `""` |

Returns

| Type | Description |
| --- | --- |
| `str` | Path to the exported `_qnn_model` directory. |
Notes

onnxruntime-qnn ships prebuilt wheels for Windows (x64/ARM64) and Linux ARM64 (aarch64) only. There is no Linux x86-64 or macOS wheel — on those hosts build ONNX Runtime from source with --use_qnn, or generate the context binary on a supported platform.

Source code in ultralytics/utils/export/qnn.py

View on GitHub

def onnx2qnn(
    onnx_file: str | Path,
    output_dir: Path | str,
    dataset,
    transform_fn,
    name: str = "73",
    metadata: dict | None = None,
    prefix: str = "",
) -> str:
    """Convert an ONNX model to an INT8 Qualcomm QNN context binary using the ONNX Runtime QNN Execution Provider.

    The conversion runs entirely on the host with no Qualcomm account or cloud upload. The model is INT8-quantized with
    ONNX Runtime's QNN QDQ flow (the Hexagon NPU is an int8 accelerator), then the `onnxruntime-qnn` plugin Execution
    Provider — which bundles the Qualcomm AI Runtime (QAIRT) libraries and is registered at runtime — compiles the
    quantized graph into a QNN context binary embedded in `<stem>_qnn.onnx`. No inference is run.

    Any `<stem>_qnn.onnx` already present in `output_dir` is deleted before compilation so the result always comes
    from the current run. The quantization intermediates (`preprocessed.onnx`, `qdq.onnx`) are removed whether the
    export succeeds or fails.

    Args:
        onnx_file (str | Path): Path to the source ONNX file (already exported).
        output_dir (Path | str): Directory to save the exported QNN model.
        dataset (DataLoader): Calibration dataloader (from `Exporter.get_int8_calibration_dataloader`) used for INT8
            quantization.
        transform_fn (Callable): Preprocessing transform (`Exporter._transform_fn`) converting a calibration item to a
            normalized `float32` NCHW array.
        name (str): Target Hexagon Tensor Processor (HTP) architecture version, e.g. `"73"` (Snapdragon 8 Gen 2), `"75"`
            (8 Gen 3), `"79"` (8 Elite). Finalizes the graph for the target chip when exporting on a host without a
            Snapdragon NPU.
        metadata (dict | None): Metadata saved as `metadata.yaml`.
        prefix (str): Prefix for log messages.

    Returns:
        (str): Path to the exported `_qnn_model` directory.

    Raises:
        RuntimeError: If ONNX Runtime reports no QNN devices after the Execution Provider is registered, or if no
            context binary exists at the expected path after session initialization.

    Notes:
        `onnxruntime-qnn` ships prebuilt wheels for Windows (x64/ARM64) and Linux ARM64 (aarch64) only. There is no
        Linux x86-64 or macOS wheel — on those hosts build ONNX Runtime from source with `--use_qnn`, or generate the
        context binary on a supported platform.
    """
    check_requirements("onnxruntime-qnn")
    import onnxruntime as ort
    from onnxruntime.quantization import CalibrationDataReader, quantize
    from onnxruntime.quantization.execution_providers.qnn import get_qnn_qdq_config
    from onnxruntime.quantization.shape_inference import quant_pre_process

    ep_library, htp_backend = qnn_library_paths()

    onnx_file = Path(onnx_file)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    ctx_file = output_dir / f"{onnx_file.stem}_qnn.onnx"
    pre_file = output_dir / "preprocessed.onnx"
    qdq_file = output_dir / "qdq.onnx"

    # INT8 QDQ quantization (HTP is an int8 accelerator). Reuses the shared calibration dataloader + transform_fn.
    class _CalibrationReader(CalibrationDataReader):
        def __init__(self):
            """Materialize calibration inputs as `{input_name: float32_NCHW}` dicts."""
            self.samples = [{"images": transform_fn(batch)} for batch in dataset]
            self.iterator = iter(self.samples)

        def get_next(self):
            """Return the next calibration sample, or None when exhausted."""
            return next(self.iterator, None)

        def rewind(self):
            """Reset the iterator for an additional calibration pass."""
            self.iterator = iter(self.samples)

    LOGGER.info(f"\n{prefix} starting INT8 quantization and export with ONNX Runtime QNN (HTP arch {name})...")
    try:
        quant_pre_process(str(onnx_file), str(pre_file))
        qdq_config = get_qnn_qdq_config(str(pre_file), _CalibrationReader())
        quantize(str(pre_file), str(qdq_file), qdq_config)

        # A context binary left by an earlier export into the same directory would satisfy the existence check
        # below even if this run produced nothing (and ONNX Runtime may refuse to overwrite an existing EP context
        # model), so remove it before compiling.
        ctx_file.unlink(missing_ok=True)

        # Register the QNN EP, then compile the quantized graph to a context binary during session init (no
        # inference run). htp_arch targets the chip so the graph finalizes offline on a host without an NPU, and the
        # shared-memory allocator is disabled (no device present).
        ep_name = "QNNExecutionProvider"
        ep_options = {
            "backend_path": htp_backend,
            "htp_arch": name,
            "htp_graph_finalization_optimization_mode": "3",
            "enable_htp_shared_memory_allocator": "0",
        }
        options = ort.SessionOptions()
        options.add_session_config_entry("ep.context_enable", "1")
        options.add_session_config_entry("ep.context_file_path", str(ctx_file))
        options.add_session_config_entry("ep.context_embed_mode", "1")
        ort.register_execution_provider_library(ep_name, ep_library)
        try:
            devices = [d for d in ort.get_ep_devices() if d.ep_name == ep_name]
            if not devices:
                raise RuntimeError("QNN EP registered but no QNN devices were found by ONNX Runtime.")
            options.add_provider_for_devices(devices, ep_options)
            ort.InferenceSession(str(qdq_file), sess_options=options)
        finally:
            # Always unregister so a failed export does not leave the plugin library registered in this process.
            ort.unregister_execution_provider_library(ep_name)

        if not ctx_file.exists():
            raise RuntimeError(f"QNN context binary was not generated at {ctx_file}. See {prefix} logs for details.")
    finally:
        # Remove quantization intermediates on success and on failure; the context binary is self-contained.
        for f in (pre_file, qdq_file):
            f.unlink(missing_ok=True)

    if metadata:
        YAML.save(output_dir / "metadata.yaml", metadata)
    return str(output_dir)



Contributors