#!/bin/bash
# Launch single-node distributed DFlash LoRA training for Qwen3-8B with SpecForge.
# Usage: bash <this_script>.sh [NUM_GPUS]   (NUM_GPUS defaults to 8)

ROOT_DIR=/workspace/hanrui/junquan/SpecForge

# Environment setup: cache compiled Inductor kernels inside the repo,
# parallelize data preprocessing, and reduce CUDA memory fragmentation.
export TORCHINDUCTOR_CACHE_DIR=$ROOT_DIR/cache/compiled_kernels
export SPECFORGE_DATA_NUM_PROC=16
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export PATH=/workspace/hanrui/specforge/bin:$PATH
export PYTHONPATH=$ROOT_DIR:$PYTHONPATH

# Number of GPUs, taken from the first CLI argument (default: 8).
NUM_GPUS=${1:-8}

/workspace/hanrui/specforge/bin/python3 -m torch.distributed.run \
    --standalone \
    --nproc_per_node $NUM_GPUS \
    $ROOT_DIR/scripts/train_dflash_lora.py \
    --model-path /workspace/Qwen3-8B \
    --train-data-path /workspace/hanrui/datasets/Nemotron-CodeAlpaca-qwen3-8b-800K \
    --output-dir $ROOT_DIR/outputs/qwen3-8b-dflash-lora \
    --lora-config $ROOT_DIR/configs/qwen3-8b-dflash-lora.json \
    --block-size 16 \
    --max-length 2048 \
    --batch-size 1 \
    --num-epochs 3 \
    --learning-rate 2e-4 \
    --accumulation-steps 8 \
    --loss-decay-gamma 7 \
    --attention-backend flex_attention \
    --lm-head-chunk-size 256 \
    --gradient-checkpointing \
    --chat-template qwen \
    --log-interval 50 \
    --save-interval 500 \
    --cache-dir $ROOT_DIR/cache