GuiHaokun committed on
Commit
9763ad2
·
verified ·
1 Parent(s): fc736dc

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/haokungui-hong-kong-university-of-science-and-technology/huggingface/runs/f09hw7av)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
@@ -35,9 +35,9 @@ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing
35
  ### Framework versions
36
 
37
  - TRL: 0.16.0.dev0
38
- - Transformers: 4.49.0.dev0
39
  - Pytorch: 2.5.1
40
- - Datasets: 3.3.0
41
  - Tokenizers: 0.21.0
42
 
43
  ## Citations
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/haokungui-hong-kong-university-of-science-and-technology/huggingface/runs/jk6prsw3)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
35
  ### Framework versions
36
 
37
  - TRL: 0.16.0.dev0
38
+ - Transformers: 4.50.0.dev0
39
  - Pytorch: 2.5.1
40
+ - Datasets: 3.3.2
41
  - Tokenizers: 0.21.0
42
 
43
  ## Citations
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": -0.00021707314596730205,
4
- "train_runtime": 6919.2787,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 1.084,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.05106422754711118,
4
+ "train_runtime": 6823.4661,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 1.099,
7
+ "train_steps_per_second": 0.009
8
  }
config.json CHANGED
@@ -22,7 +22,7 @@
22
  "sliding_window": 4096,
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
- "transformers_version": "4.49.0.dev0",
26
  "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
 
22
  "sliding_window": 4096,
23
  "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
+ "transformers_version": "4.50.0.dev0",
26
  "use_cache": false,
27
  "use_mrope": false,
28
  "use_sliding_window": false,
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "bos_token_id": 151643,
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
- "transformers_version": "4.49.0.dev0"
6
  }
 
2
  "bos_token_id": 151643,
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
+ "transformers_version": "4.50.0.dev0"
6
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d09b07690e14ea60f95ff7c3dc4258f9b72af70769f68446415af4a8a5494d21
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:500fdf241a5bd22e2d5b097fb79fbf89bf04ff7aec09676a3c9d431055960c76
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f5f3dee11356461e7f6a397af882541b234f550fd1de5f7c27460cb487f314e0
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1943eb3f4eda46b078c73ab62c768b0210a292c7835ea0ed779e1f005a834eae
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9f4d2664767ef7f5583950f6da93e6e0b22fe5d182069746f6170fb483332052
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aace49adaf8c20390da0938e69bd9f53dad03662b47066d5f94eb6effdec3fda
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:58eca92f8ca49d1a98a0d9d47e9434a04253c1a7f4ee8faf31c95651511d547d
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4503159aee19406f709e504ac4c0e2b530d50b19b0dc3e312ca04a929312c0ac
3
  size 1089994880
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": -0.00021707314596730205,
4
- "train_runtime": 6919.2787,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 1.084,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.05106422754711118,
4
+ "train_runtime": 6823.4661,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 1.099,
7
+ "train_steps_per_second": 0.009
8
  }
trainer_state.json CHANGED
@@ -9,162 +9,174 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 616.8576179504395,
 
13
  "epoch": 0.08528784648187633,
14
- "grad_norm": 3.189967632293701,
15
- "kl": 0.0002296924591064453,
16
  "learning_rate": 2.5e-06,
17
- "loss": 0.0,
18
- "reward": 0.6506696678698063,
19
- "reward_std": 0.3288835626095533,
20
- "rewards/accuracy_reward": 0.6502232395112515,
21
  "rewards/format_reward": 0.00044642859138548373,
22
  "step": 5
23
  },
24
  {
25
- "completion_length": 616.0665504455567,
 
26
  "epoch": 0.17057569296375266,
27
- "grad_norm": 1.5170903205871582,
28
- "kl": 0.017084503173828126,
29
  "learning_rate": 2.956412726139078e-06,
30
- "loss": 0.0007,
31
- "reward": 0.7037946730852127,
32
- "reward_std": 0.28115556947886944,
33
- "rewards/accuracy_reward": 0.7037946730852127,
34
  "rewards/format_reward": 0.0,
35
  "step": 10
36
  },
37
  {
38
- "completion_length": 619.1498023986817,
 
39
  "epoch": 0.255863539445629,
40
- "grad_norm": 0.1955304741859436,
41
- "kl": 0.015854263305664064,
42
  "learning_rate": 2.7836719084521715e-06,
43
- "loss": 0.0006,
44
- "reward": 0.7582589656114578,
45
- "reward_std": 0.21650202702730895,
46
- "rewards/accuracy_reward": 0.7582589656114578,
47
  "rewards/format_reward": 0.0,
48
  "step": 15
49
  },
50
  {
51
- "completion_length": 609.8542671203613,
 
52
  "epoch": 0.3411513859275053,
53
- "grad_norm": 0.2464970499277115,
54
- "kl": 0.003617095947265625,
55
  "learning_rate": 2.4946839873611927e-06,
56
- "loss": 0.0001,
57
- "reward": 0.7537946790456772,
58
- "reward_std": 0.22740934304893018,
59
- "rewards/accuracy_reward": 0.7537946790456772,
60
  "rewards/format_reward": 0.0,
61
  "step": 20
62
  },
63
  {
64
- "completion_length": 613.51654586792,
 
65
  "epoch": 0.42643923240938164,
66
- "grad_norm": 0.8888379335403442,
67
- "kl": 0.005077743530273437,
68
  "learning_rate": 2.1156192081791355e-06,
69
- "loss": 0.0002,
70
- "reward": 0.7627232491970062,
71
- "reward_std": 0.18942257491871714,
72
- "rewards/accuracy_reward": 0.7627232491970062,
73
  "rewards/format_reward": 0.0,
74
  "step": 25
75
  },
76
  {
77
- "completion_length": 612.561190032959,
 
78
  "epoch": 0.511727078891258,
79
- "grad_norm": 0.6480019688606262,
80
- "kl": 0.0041637420654296875,
81
  "learning_rate": 1.6808050203829845e-06,
82
- "loss": 0.0002,
83
- "reward": 0.7622768208384514,
84
- "reward_std": 0.18584611341357232,
85
- "rewards/accuracy_reward": 0.7622768208384514,
86
  "rewards/format_reward": 0.0,
87
  "step": 30
88
  },
89
  {
90
- "completion_length": 601.9710105895996,
 
91
  "epoch": 0.5970149253731343,
92
- "grad_norm": 0.19176428020000458,
93
- "kl": 0.003966903686523438,
94
  "learning_rate": 1.2296174432791415e-06,
95
- "loss": 0.0002,
96
- "reward": 0.755357176810503,
97
- "reward_std": 0.16201179698109627,
98
- "rewards/accuracy_reward": 0.755357176810503,
99
  "rewards/format_reward": 0.0,
100
  "step": 35
101
  },
102
  {
103
- "completion_length": 588.7897583007813,
 
104
  "epoch": 0.6823027718550106,
105
- "grad_norm": 0.5784619450569153,
106
- "kl": 0.0035259246826171873,
107
  "learning_rate": 8.029152419343472e-07,
108
- "loss": 0.0001,
109
- "reward": 0.7720982477068901,
110
- "reward_std": 0.17176652811467646,
111
- "rewards/accuracy_reward": 0.7720982477068901,
112
  "rewards/format_reward": 0.0,
113
  "step": 40
114
  },
115
  {
116
- "completion_length": 608.5564971923828,
 
117
  "epoch": 0.767590618336887,
118
- "grad_norm": 0.2929813861846924,
119
- "kl": 0.0034820556640625,
120
  "learning_rate": 4.3933982822017883e-07,
121
- "loss": 0.0001,
122
- "reward": 0.7569196790456771,
123
- "reward_std": 0.17854769751429558,
124
- "rewards/accuracy_reward": 0.7569196790456771,
125
  "rewards/format_reward": 0.0,
126
  "step": 45
127
  },
128
  {
129
- "completion_length": 609.3294929504394,
 
130
  "epoch": 0.8528784648187633,
131
- "grad_norm": 0.8820053339004517,
132
- "kl": 0.003635406494140625,
133
  "learning_rate": 1.718159615201853e-07,
134
- "loss": 0.0001,
135
- "reward": 0.7529018223285675,
136
- "reward_std": 0.18069615559652447,
137
- "rewards/accuracy_reward": 0.7529018223285675,
138
  "rewards/format_reward": 0.0,
139
  "step": 50
140
  },
141
  {
142
- "completion_length": 597.9283767700196,
 
143
  "epoch": 0.9381663113006397,
144
- "grad_norm": 0.2659400999546051,
145
- "kl": 0.0099273681640625,
146
  "learning_rate": 2.4570139579284723e-08,
147
- "loss": 0.0004,
148
- "reward": 0.7859375342726708,
149
- "reward_std": 0.17906498536467552,
150
- "rewards/accuracy_reward": 0.7859375342726708,
151
  "rewards/format_reward": 0.0,
152
  "step": 55
153
  },
154
  {
155
- "completion_length": 592.1878992716471,
 
156
  "epoch": 0.9893390191897654,
157
- "kl": 0.0038731892903645835,
158
- "reward": 0.772693489988645,
159
- "reward_std": 0.1824363498017192,
160
- "rewards/accuracy_reward": 0.772693489988645,
161
  "rewards/format_reward": 0.0,
162
  "step": 58,
163
  "total_flos": 0.0,
164
- "train_loss": -0.00021707314596730205,
165
- "train_runtime": 6919.2787,
166
- "train_samples_per_second": 1.084,
167
- "train_steps_per_second": 0.008
168
  }
169
  ],
170
  "logging_steps": 5,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "clip_ratio": 0.0,
13
+ "completion_length": 613.3375259399414,
14
  "epoch": 0.08528784648187633,
15
+ "grad_norm": 0.5308949947357178,
16
+ "kl": 0.00026957988739013673,
17
  "learning_rate": 2.5e-06,
18
+ "loss": 0.0422,
19
+ "reward": 0.6412946701049804,
20
+ "reward_std": 0.33645378351211547,
21
+ "rewards/accuracy_reward": 0.6408482432365418,
22
  "rewards/format_reward": 0.00044642859138548373,
23
  "step": 5
24
  },
25
  {
26
+ "clip_ratio": 0.0,
27
+ "completion_length": 611.9799339294434,
28
  "epoch": 0.17057569296375266,
29
+ "grad_norm": 3.3914427757263184,
30
+ "kl": 0.002694988250732422,
31
  "learning_rate": 2.956412726139078e-06,
32
+ "loss": 0.0675,
33
+ "reward": 0.7015625357627868,
34
+ "reward_std": 0.2750119812786579,
35
+ "rewards/accuracy_reward": 0.7015625357627868,
36
  "rewards/format_reward": 0.0,
37
  "step": 10
38
  },
39
  {
40
+ "clip_ratio": 0.0,
41
+ "completion_length": 598.9616317749023,
42
  "epoch": 0.255863539445629,
43
+ "grad_norm": 0.3315448462963104,
44
+ "kl": 0.0043125152587890625,
45
  "learning_rate": 2.7836719084521715e-06,
46
+ "loss": 0.0621,
47
+ "reward": 0.764285746216774,
48
+ "reward_std": 0.21017537489533425,
49
+ "rewards/accuracy_reward": 0.764285746216774,
50
  "rewards/format_reward": 0.0,
51
  "step": 15
52
  },
53
  {
54
+ "clip_ratio": 0.0,
55
+ "completion_length": 588.717440032959,
56
  "epoch": 0.3411513859275053,
57
+ "grad_norm": 0.8005821704864502,
58
+ "kl": 0.004384231567382812,
59
  "learning_rate": 2.4946839873611927e-06,
60
+ "loss": 0.0533,
61
+ "reward": 0.7665178924798965,
62
+ "reward_std": 0.1955874715000391,
63
+ "rewards/accuracy_reward": 0.7665178924798965,
64
  "rewards/format_reward": 0.0,
65
  "step": 20
66
  },
67
  {
68
+ "clip_ratio": 0.0,
69
+ "completion_length": 592.9370819091797,
70
  "epoch": 0.42643923240938164,
71
+ "grad_norm": 0.7205191850662231,
72
+ "kl": 0.004600143432617188,
73
  "learning_rate": 2.1156192081791355e-06,
74
+ "loss": 0.045,
75
+ "reward": 0.7629464656114578,
76
+ "reward_std": 0.18942818641662598,
77
+ "rewards/accuracy_reward": 0.7629464656114578,
78
  "rewards/format_reward": 0.0,
79
  "step": 25
80
  },
81
  {
82
+ "clip_ratio": 0.0,
83
+ "completion_length": 598.0611862182617,
84
  "epoch": 0.511727078891258,
85
+ "grad_norm": 0.6142191886901855,
86
+ "kl": 0.042702865600585935,
87
  "learning_rate": 1.6808050203829845e-06,
88
+ "loss": 0.0428,
89
+ "reward": 0.7551339700818062,
90
+ "reward_std": 0.1889859580434859,
91
+ "rewards/accuracy_reward": 0.7551339700818062,
92
  "rewards/format_reward": 0.0,
93
  "step": 30
94
  },
95
  {
96
+ "clip_ratio": 0.0,
97
+ "completion_length": 593.2520332336426,
98
  "epoch": 0.5970149253731343,
99
+ "grad_norm": 10.091300964355469,
100
+ "kl": 0.4265655517578125,
101
  "learning_rate": 1.2296174432791415e-06,
102
+ "loss": 0.0673,
103
+ "reward": 0.7410714656114579,
104
+ "reward_std": 0.179294466227293,
105
+ "rewards/accuracy_reward": 0.7410714656114579,
106
  "rewards/format_reward": 0.0,
107
  "step": 35
108
  },
109
  {
110
+ "clip_ratio": 0.0,
111
+ "completion_length": 577.117886352539,
112
  "epoch": 0.6823027718550106,
113
+ "grad_norm": 81.48939514160156,
114
+ "kl": 0.06499862670898438,
115
  "learning_rate": 8.029152419343472e-07,
116
+ "loss": 0.0492,
117
+ "reward": 0.7669643267989159,
118
+ "reward_std": 0.17745222924277188,
119
+ "rewards/accuracy_reward": 0.7669643267989159,
120
  "rewards/format_reward": 0.0,
121
  "step": 40
122
  },
123
  {
124
+ "clip_ratio": 0.0,
125
+ "completion_length": 594.6520378112793,
126
  "epoch": 0.767590618336887,
127
+ "grad_norm": 59.926361083984375,
128
+ "kl": 0.22182693481445312,
129
  "learning_rate": 4.3933982822017883e-07,
130
+ "loss": 0.0426,
131
+ "reward": 0.7529018267989158,
132
+ "reward_std": 0.1793302634730935,
133
+ "rewards/accuracy_reward": 0.7529018267989158,
134
  "rewards/format_reward": 0.0,
135
  "step": 45
136
  },
137
  {
138
+ "clip_ratio": 0.0,
139
+ "completion_length": 599.8544906616211,
140
  "epoch": 0.8528784648187633,
141
+ "grad_norm": 0.3387981653213501,
142
+ "kl": 0.171990966796875,
143
  "learning_rate": 1.718159615201853e-07,
144
+ "loss": 0.0534,
145
+ "reward": 0.7386161029338837,
146
+ "reward_std": 0.18926974572241306,
147
+ "rewards/accuracy_reward": 0.7386161029338837,
148
  "rewards/format_reward": 0.0,
149
  "step": 50
150
  },
151
  {
152
+ "clip_ratio": 0.0,
153
+ "completion_length": 590.5582893371582,
154
  "epoch": 0.9381663113006397,
155
+ "grad_norm": 7.056384563446045,
156
+ "kl": 0.15600433349609374,
157
  "learning_rate": 2.4570139579284723e-08,
158
+ "loss": 0.0572,
159
+ "reward": 0.7738839611411095,
160
+ "reward_std": 0.18812613375484943,
161
+ "rewards/accuracy_reward": 0.7738839611411095,
162
  "rewards/format_reward": 0.0,
163
  "step": 55
164
  },
165
  {
166
+ "clip_ratio": 0.0,
167
+ "completion_length": 576.9970359802246,
168
  "epoch": 0.9893390191897654,
169
+ "kl": 0.025601704915364582,
170
+ "reward": 0.7589286093910536,
171
+ "reward_std": 0.1643078220076859,
172
+ "rewards/accuracy_reward": 0.7589286093910536,
173
  "rewards/format_reward": 0.0,
174
  "step": 58,
175
  "total_flos": 0.0,
176
+ "train_loss": 0.05106422754711118,
177
+ "train_runtime": 6823.4661,
178
+ "train_samples_per_second": 1.099,
179
+ "train_steps_per_second": 0.009
180
  }
181
  ],
182
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f18f62535d61e36f08936af2cc23013ccba1fa668e22155a15ff711fb1075a93
3
- size 7608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f50a1c1b961775d6dcb6bb20b19456022d781be9766c958e14e48133d49616e8
3
+ size 7736