---
# Model / training configuration.
# NOTE(review): key names suggest a hybrid MoE + SSM transformer
# (num_experts/top_k, ssm_* keys) — confirm against the consuming code.

# Core model dimensions
vocab_size: 50257
d_model: 768
n_layer: 24

# Expert-routing settings (presumably mixture-of-experts; verify with loader)
num_experts: 4
top_k: 1
d_ff: 2304

# SSM-layer settings
ssm_d_state: 16
ssm_expand: 2

# Auxiliary loss coefficients (0.0 disables each term)
load_balancing_coef: 0.0
router_z_loss_coef: 0.0

# Runtime / memory options
max_seq_len: 1024
dtype: "float16"
use_cpu_offload: true  # Offload to CPU during inference to save VRAM
gradient_checkpointing: false
checkpoint_ssm_layers: false
use_flash_attention: true