---
# Model / training configuration.
# NOTE(review): key names suggest a hybrid MoE + SSM transformer
# (num_experts/top_k, ssm_* keys) — confirm against the consuming code.

# Core model dimensions
vocab_size: 50257
d_model: 768
n_layer: 24

# Expert-routing settings (presumably mixture-of-experts; verify with loader)
num_experts: 4
top_k: 1
d_ff: 2304

# SSM-layer settings
ssm_d_state: 16
ssm_expand: 2

# Auxiliary loss coefficients (0.0 disables each term)
load_balancing_coef: 0.0
router_z_loss_coef: 0.0

# Runtime / memory options
max_seq_len: 1024
dtype: "float16"
use_cpu_offload: true  # Offload to CPU during inference to save VRAM
gradient_checkpointing: false
checkpoint_ssm_layers: false
use_flash_attention: true