# Identity and provenance.
model_id = "mlx-community/Nemotron-3-Ultra-550B-A55B-4bit"
base_model = "NVIDIA-Nemotron/Nemotron-Ultra-3-mopd-052726-mixed_nvfp4-fp8_amax_1000x65k"
family = "nemotron"
quantization = "4bit"

# Architecture.
n_layers = 108
hidden_size = 8192
num_key_value_heads = 2
context_length = 262_144  # 2^18

# Runtime support.
supports_tensor = true
tasks = ["TextGeneration"]
capabilities = ["text"]
backends = [
    "MlxMetal",
    "MlxCuda",
    "MlxCpu",
]
chat_template_path = "chat_templates/nemotron-ultra-3.jinja"

# Storage size in bytes (about 347 GB decimal, about 323 GiB).
[storage_size]
in_bytes = 346_959_130_136

# Weight format of this checkpoint (this note describes the model weights, not
# the [sampling_defaults] table that follows).
# Uniform MLX-native affine int4 (group_size 32), which loads in STOCK mlx_lm:
#   - 512 routed MoE experts: affine int4 (re-quantized from the source two-level
#     NVFP4 via faithful dequantization).
#   - Mamba in/out_proj, attention q/k/v/o, shared experts, MoE latent: affine int4.
#   - conv1d, router gate, embeddings, lm_head: bf16.
# There are no nvfp4 tensors and no weight_scale_2 entries, so stock mlx_lm loads
# it with strict=True.

[sampling_defaults]
# Default sampling settings recorded for this model.
# NOTE(review): whether a request may override these is decided by the
# consumer of this file, which is not visible here — confirm.
temperature = 1.0
top_p = 0.95

# MTP speculative decoding via the Nemotron-H nextn layer. The bf16 mtp.*
# weights are loaded from exolabs/Nemotron-3-Ultra-MTP-mlx (a different repo
# from model_id), and the head is quantized at load time.
[mtp]
# NOTE(review): presumably the number of draft tokens proposed per decoding
# step — confirm against the code that reads this table.
num_speculative_tokens = 1
method = "mtp"
# Repo that supplies the mtp.* weights described above.
mtp_weights_model_id = "exolabs/Nemotron-3-Ultra-MTP-mlx"