j05hr3d committed on
Commit
a1c2ca2
·
verified ·
1 Parent(s): 0ea66c5

Model save

Browse files
Files changed (3) hide show
  1. README.md +18 -0
  2. adapter_model.safetensors +1 -1
  3. trainer_state.json +177 -9
README.md CHANGED
@@ -18,6 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
18
  # SFT-Qwen3-Coder-30B_v1.1
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen3-Coder-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) on an unspecified dataset.
 
 
21
 
22
  ## Model description
23
 
@@ -47,6 +49,22 @@ The following hyperparameters were used during training:
47
  - lr_scheduler_warmup_ratio: 0.03
48
  - num_epochs: 3
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  ### Framework versions
51
 
52
  - PEFT 0.18.0
 
18
  # SFT-Qwen3-Coder-30B_v1.1
19
 
20
  This model is a fine-tuned version of [Qwen/Qwen3-Coder-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-Coder-30B-A3B-Instruct) on an unspecified dataset.
21
+ It achieves the following results on the evaluation set:
22
+ - Loss: 0.5415
23
 
24
  ## Model description
25
 
 
49
  - lr_scheduler_warmup_ratio: 0.03
50
  - num_epochs: 3
51
 
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss |
55
+ |:-------------:|:------:|:----:|:---------------:|
56
+ | 0.9355 | 0.2857 | 20 | 0.7199 |
57
+ | 0.8105 | 0.5714 | 40 | 0.6498 |
58
+ | 0.7285 | 0.8571 | 60 | 0.6154 |
59
+ | 0.7248 | 1.1429 | 80 | 0.5895 |
60
+ | 0.6648 | 1.4286 | 100 | 0.5702 |
61
+ | 0.6418 | 1.7143 | 120 | 0.5603 |
62
+ | 0.7202 | 2.0 | 140 | 0.5530 |
63
+ | 0.5803 | 2.2857 | 160 | 0.5463 |
64
+ | 0.558 | 2.5714 | 180 | 0.5434 |
65
+ | 0.6506 | 2.8571 | 200 | 0.5415 |
66
+
67
+
68
  ### Framework versions
69
 
70
  - PEFT 0.18.0
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0bea075713616701599d020defa788089a52f81cd9687cfe961ff20e74cfc18d
3
  size 1693023512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfdc26c8451232ecfedddd0913b9e677bb1556b792812178944bb6eb52f8b192
3
  size 1693023512
trainer_state.json CHANGED
@@ -1,14 +1,182 @@
1
  {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 0.05714285714285714,
6
  "eval_steps": 20,
7
- "global_step": 4,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
- "log_history": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  "logging_steps": 20,
13
  "max_steps": 210,
14
  "num_input_tokens_seen": 0,
@@ -29,13 +197,13 @@
29
  "should_epoch_stop": false,
30
  "should_evaluate": false,
31
  "should_log": false,
32
- "should_save": false,
33
- "should_training_stop": false
34
  },
35
  "attributes": {}
36
  }
37
  },
38
- "total_flos": 0,
39
  "train_batch_size": 2,
40
  "trial_name": null,
41
  "trial_params": null
 
1
  {
2
+ "best_global_step": 200,
3
+ "best_metric": 0.5415477156639099,
4
+ "best_model_checkpoint": "j05hr3d/SFT-Qwen3-Coder-30B_v1.1/checkpoint-200",
5
+ "epoch": 3.0,
6
  "eval_steps": 20,
7
+ "global_step": 210,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.2857142857142857,
14
+ "grad_norm": 0.2556939721107483,
15
+ "learning_rate": 9.408866995073891e-05,
16
+ "loss": 0.9355,
17
+ "step": 20
18
+ },
19
+ {
20
+ "epoch": 0.2857142857142857,
21
+ "eval_loss": 0.7198927998542786,
22
+ "eval_runtime": 735.6148,
23
+ "eval_samples_per_second": 0.084,
24
+ "eval_steps_per_second": 0.084,
25
+ "step": 20
26
+ },
27
+ {
28
+ "epoch": 0.5714285714285714,
29
+ "grad_norm": 0.19520319998264313,
30
+ "learning_rate": 8.423645320197044e-05,
31
+ "loss": 0.8105,
32
+ "step": 40
33
+ },
34
+ {
35
+ "epoch": 0.5714285714285714,
36
+ "eval_loss": 0.6497658491134644,
37
+ "eval_runtime": 734.7356,
38
+ "eval_samples_per_second": 0.084,
39
+ "eval_steps_per_second": 0.084,
40
+ "step": 40
41
+ },
42
+ {
43
+ "epoch": 0.8571428571428571,
44
+ "grad_norm": 0.3172638714313507,
45
+ "learning_rate": 7.438423645320197e-05,
46
+ "loss": 0.7285,
47
+ "step": 60
48
+ },
49
+ {
50
+ "epoch": 0.8571428571428571,
51
+ "eval_loss": 0.6153671145439148,
52
+ "eval_runtime": 736.69,
53
+ "eval_samples_per_second": 0.084,
54
+ "eval_steps_per_second": 0.084,
55
+ "step": 60
56
+ },
57
+ {
58
+ "epoch": 1.1428571428571428,
59
+ "grad_norm": 0.5020032525062561,
60
+ "learning_rate": 6.45320197044335e-05,
61
+ "loss": 0.7248,
62
+ "step": 80
63
+ },
64
+ {
65
+ "epoch": 1.1428571428571428,
66
+ "eval_loss": 0.5895159244537354,
67
+ "eval_runtime": 733.9618,
68
+ "eval_samples_per_second": 0.084,
69
+ "eval_steps_per_second": 0.084,
70
+ "step": 80
71
+ },
72
+ {
73
+ "epoch": 1.4285714285714286,
74
+ "grad_norm": 0.512776255607605,
75
+ "learning_rate": 5.467980295566503e-05,
76
+ "loss": 0.6648,
77
+ "step": 100
78
+ },
79
+ {
80
+ "epoch": 1.4285714285714286,
81
+ "eval_loss": 0.5701907277107239,
82
+ "eval_runtime": 736.1956,
83
+ "eval_samples_per_second": 0.084,
84
+ "eval_steps_per_second": 0.084,
85
+ "step": 100
86
+ },
87
+ {
88
+ "epoch": 1.7142857142857144,
89
+ "grad_norm": 0.6210766434669495,
90
+ "learning_rate": 4.482758620689655e-05,
91
+ "loss": 0.6418,
92
+ "step": 120
93
+ },
94
+ {
95
+ "epoch": 1.7142857142857144,
96
+ "eval_loss": 0.5602756142616272,
97
+ "eval_runtime": 738.1818,
98
+ "eval_samples_per_second": 0.084,
99
+ "eval_steps_per_second": 0.084,
100
+ "step": 120
101
+ },
102
+ {
103
+ "epoch": 2.0,
104
+ "grad_norm": 0.4914884567260742,
105
+ "learning_rate": 3.497536945812808e-05,
106
+ "loss": 0.7202,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 2.0,
111
+ "eval_loss": 0.5529556274414062,
112
+ "eval_runtime": 739.8012,
113
+ "eval_samples_per_second": 0.084,
114
+ "eval_steps_per_second": 0.084,
115
+ "step": 140
116
+ },
117
+ {
118
+ "epoch": 2.2857142857142856,
119
+ "grad_norm": 0.24697080254554749,
120
+ "learning_rate": 2.512315270935961e-05,
121
+ "loss": 0.5803,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 2.2857142857142856,
126
+ "eval_loss": 0.5462542176246643,
127
+ "eval_runtime": 744.2538,
128
+ "eval_samples_per_second": 0.083,
129
+ "eval_steps_per_second": 0.083,
130
+ "step": 160
131
+ },
132
+ {
133
+ "epoch": 2.571428571428571,
134
+ "grad_norm": 0.2137288749217987,
135
+ "learning_rate": 1.5270935960591133e-05,
136
+ "loss": 0.558,
137
+ "step": 180
138
+ },
139
+ {
140
+ "epoch": 2.571428571428571,
141
+ "eval_loss": 0.5434364080429077,
142
+ "eval_runtime": 743.0389,
143
+ "eval_samples_per_second": 0.083,
144
+ "eval_steps_per_second": 0.083,
145
+ "step": 180
146
+ },
147
+ {
148
+ "epoch": 2.857142857142857,
149
+ "grad_norm": 0.25142785906791687,
150
+ "learning_rate": 5.418719211822661e-06,
151
+ "loss": 0.6506,
152
+ "step": 200
153
+ },
154
+ {
155
+ "epoch": 2.857142857142857,
156
+ "eval_loss": 0.5415477156639099,
157
+ "eval_runtime": 742.4945,
158
+ "eval_samples_per_second": 0.084,
159
+ "eval_steps_per_second": 0.084,
160
+ "step": 200
161
+ },
162
+ {
163
+ "epoch": 3.0,
164
+ "step": 210,
165
+ "total_flos": 3.407452539457782e+17,
166
+ "train_loss": 0.6992505141666957,
167
+ "train_runtime": 48442.2914,
168
+ "train_samples_per_second": 0.035,
169
+ "train_steps_per_second": 0.004
170
+ },
171
+ {
172
+ "epoch": 3.0,
173
+ "eval_loss": 0.5415477156639099,
174
+ "eval_runtime": 742.9426,
175
+ "eval_samples_per_second": 0.083,
176
+ "eval_steps_per_second": 0.083,
177
+ "step": 210
178
+ }
179
+ ],
180
  "logging_steps": 20,
181
  "max_steps": 210,
182
  "num_input_tokens_seen": 0,
 
197
  "should_epoch_stop": false,
198
  "should_evaluate": false,
199
  "should_log": false,
200
+ "should_save": true,
201
+ "should_training_stop": true
202
  },
203
  "attributes": {}
204
  }
205
  },
206
+ "total_flos": 3.407452539457782e+17,
207
  "train_batch_size": 2,
208
  "trial_name": null,
209
  "trial_params": null