# Model card for the 4-bit MLX build of Nemotron-3-Ultra.
model_id = "mlx-community/Nemotron-3-Ultra-550B-A55B-4bit"
n_layers = 108
hidden_size = 8192
num_key_value_heads = 2
supports_tensor = true
tasks = ["TextGeneration"]
backends = ["MlxMetal", "MlxCuda", "MlxCpu"]
family = "nemotron"
quantization = "4bit"
base_model = "NVIDIA-Nemotron/Nemotron-Ultra-3-mopd-052726-mixed_nvfp4-fp8_amax_1000x65k"
capabilities = ["text"]
context_length = 262144
chat_template_path = "chat_templates/nemotron-ultra-3.jinja"

[storage_size]
in_bytes = 346959130136

# Quantization layout of this checkpoint:
# Uniform MLX-native affine int4 (group_size 32) — loads in STOCK mlx_lm:
#   - 512 routed MoE experts: affine int4 (re-quantized from the source two-level
#     NVFP4 via faithful dequant).
#   - Mamba in/out_proj, attention q/k/v/o, shared experts, MoE latent: affine int4.
#   - conv1d, router gate, embeddings, lm_head: bf16.
# No nvfp4, no weight_scale_2 -> stock mlx_lm loads it strict=True.

[sampling_defaults]
temperature = 1.0
top_p = 0.95

# MTP speculative decoding via the Nemotron-H nextn layer; loads bf16 mtp.*
# weights from exolabs/Nemotron-3-Ultra-MTP-mlx and quantizes the head at load.
[mtp]
num_speculative_tokens = 1
method = "mtp"
mtp_weights_model_id = "exolabs/Nemotron-3-Ultra-MTP-mlx"