LuyiCui committed
Commit 96b27a6 · verified · 1 Parent(s): 6a97fe7

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ library_name: transformers
+ model_name: DeepSeek-R1-Distill-Qwen-1.5B-DPO
+ tags:
+ - generated_from_trainer
+ - trl
+ - dpo
+ licence: license
+ ---
+
+ # Model Card for DeepSeek-R1-Distill-Qwen-1.5B-DPO
+
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
+ It has been trained using [TRL](https://github.com/huggingface/trl).
+
+ ## Quick start
+
+ ```python
+ from transformers import pipeline
+
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+ generator = pipeline("text-generation", model="LuyiCui/DeepSeek-R1-Distill-Qwen-1.5B-DPO", device="cuda")
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+ print(output["generated_text"])
+ ```
+
+ ## Training procedure
+
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/cuiluyi/open-r1/runs/kfi2ffhj)
+
+
+ This model was trained with DPO, a method introduced in [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://huggingface.co/papers/2305.18290).
+
+ ### Framework versions
+
+ - TRL: 0.17.0.dev0
+ - Transformers: 4.51.2
+ - Pytorch: 2.6.0
+ - Datasets: 3.5.0
+ - Tokenizers: 0.21.1
+
+ ## Citations
+
+ Cite DPO as:
+
+ ```bibtex
+ @inproceedings{rafailov2023direct,
+ title = {{Direct Preference Optimization: Your Language Model is Secretly a Reward Model}},
+ author = {Rafael Rafailov and Archit Sharma and Eric Mitchell and Christopher D. Manning and Stefano Ermon and Chelsea Finn},
+ year = 2023,
+ booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans, LA, USA, December 10 - 16, 2023},
+ url = {http://papers.nips.cc/paper_files/paper/2023/hash/a85b405ed65c6477a4fe8302b5e06ce7-Abstract-Conference.html},
+ editor = {Alice Oh and Tristan Naumann and Amir Globerson and Kate Saenko and Moritz Hardt and Sergey Levine},
+ }
+ ```
+
+ Cite TRL as:
+
+ ```bibtex
+ @misc{vonwerra2022trl,
+ title = {{TRL: Transformer Reinforcement Learning}},
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
+ year = 2020,
+ journal = {GitHub repository},
+ publisher = {GitHub},
+ howpublished = {\url{https://github.com/huggingface/trl}}
+ }
+ ```
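The card above notes that the model was trained with DPO via TRL, but the commit does not include the training script. Below is a minimal sketch of how such a run could be set up with TRL's `DPOTrainer`; the base model and preference dataset are assumptions (the card records neither), while the batch size, learning rate, and epoch count are taken from `trainer_state.json` further down.

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

# Assumption: the base checkpoint implied by the model name; the card itself records None.
base_model = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model = AutoModelForCausalLM.from_pretrained(base_model)
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Assumption: a preference dataset with "prompt"/"chosen"/"rejected" columns; the
# dataset actually used for this run is not listed in the commit.
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

training_args = DPOConfig(
    output_dir="DeepSeek-R1-Distill-Qwen-1.5B-DPO",
    per_device_train_batch_size=16,  # train_batch_size in trainer_state.json
    learning_rate=1e-6,              # peak learning rate in the log history
    num_train_epochs=1,              # num_train_epochs in trainer_state.json
    logging_steps=1,
)

trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
```

When no `ref_model` is passed, `DPOTrainer` builds a frozen copy of `model` to serve as the reference policy used for the reward terms logged in the trainer state.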
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9523809523809523,
+ "total_flos": 0.0,
+ "train_loss": 0.6895670831203461,
+ "train_runtime": 273.9395,
+ "train_samples": 4000,
+ "train_samples_per_second": 14.602,
+ "train_steps_per_second": 0.037
+ }
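As a quick sanity check, the throughput fields above are consistent with the runtime, the sample count, and the 10 optimizer steps recorded in `trainer_state.json`:

```python
# Values copied from all_results.json / trainer_state.json in this commit.
train_samples = 4000
train_runtime = 273.9395  # seconds
global_step = 10

print(train_samples / train_runtime)  # ~14.60 samples/s (logged: 14.602)
print(global_step / train_runtime)    # ~0.0365 steps/s (logged, rounded: 0.037)
```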
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "_from_model_config": true,
+ "bos_token_id": 151646,
+ "do_sample": true,
+ "eos_token_id": 151643,
+ "temperature": 0.6,
+ "top_p": 0.95,
+ "transformers_version": "4.51.2"
+ }
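These defaults (sampling on, temperature 0.6, top-p 0.95) ship with the checkpoint and are applied by `generate()` and by the `pipeline` example in the README unless overridden per call. A small sketch of inspecting and overriding them, assuming the repo id from the model card:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

repo_id = "LuyiCui/DeepSeek-R1-Distill-Qwen-1.5B-DPO"

# Inspect the saved defaults from generation_config.json.
gen_config = GenerationConfig.from_pretrained(repo_id)
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p)

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

# Per-call arguments take precedence over the saved defaults (greedy decoding here).
inputs = tokenizer("What is direct preference optimization?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```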
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "epoch": 0.9523809523809523,
+ "total_flos": 0.0,
+ "train_loss": 0.6895670831203461,
+ "train_runtime": 273.9395,
+ "train_samples": 4000,
+ "train_samples_per_second": 14.602,
+ "train_steps_per_second": 0.037
+ }
trainer_state.json ADDED
@@ -0,0 +1,193 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.9523809523809523,
+ "eval_steps": 500,
+ "global_step": 10,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.09523809523809523,
+ "grad_norm": 10.040461540222168,
+ "learning_rate": 0.0,
+ "logits/chosen": 0.5517578125,
+ "logits/rejected": 0.5625,
+ "logps/chosen": -384.5,
+ "logps/rejected": -438.0,
+ "loss": 0.6982,
+ "rewards/accuracies": 0.0833333358168602,
+ "rewards/chosen": -0.016666412353515625,
+ "rewards/margins": -0.009382247924804688,
+ "rewards/rejected": -0.00731658935546875,
+ "step": 1
+ },
+ {
+ "epoch": 0.19047619047619047,
+ "grad_norm": 8.76193904876709,
+ "learning_rate": 1e-06,
+ "logits/chosen": 0.54248046875,
+ "logits/rejected": 0.54052734375,
+ "logps/chosen": -347.5,
+ "logps/rejected": -419.5,
+ "loss": 0.6898,
+ "rewards/accuracies": 0.109375,
+ "rewards/chosen": -0.010042190551757812,
+ "rewards/margins": 0.00848388671875,
+ "rewards/rejected": -0.01854705810546875,
+ "step": 2
+ },
+ {
+ "epoch": 0.2857142857142857,
+ "grad_norm": 7.859212875366211,
+ "learning_rate": 9.728616793536587e-07,
+ "logits/chosen": 0.546875,
+ "logits/rejected": 0.5322265625,
+ "logps/chosen": -372.0,
+ "logps/rejected": -420.0,
+ "loss": 0.6895,
+ "rewards/accuracies": 0.0807291716337204,
+ "rewards/chosen": 0.007147789001464844,
+ "rewards/margins": 0.0082244873046875,
+ "rewards/rejected": -0.0010318756103515625,
+ "step": 3
+ },
+ {
+ "epoch": 0.38095238095238093,
+ "grad_norm": 8.137142181396484,
+ "learning_rate": 8.9471999940354e-07,
+ "logits/chosen": 0.50927734375,
+ "logits/rejected": 0.55322265625,
+ "logps/chosen": -374.5,
+ "logps/rejected": -434.5,
+ "loss": 0.6903,
+ "rewards/accuracies": 0.109375,
+ "rewards/chosen": -0.02203369140625,
+ "rewards/margins": 0.0063877105712890625,
+ "rewards/rejected": -0.028411865234375,
+ "step": 4
+ },
+ {
+ "epoch": 0.47619047619047616,
+ "grad_norm": 6.646923542022705,
+ "learning_rate": 7.75e-07,
+ "logits/chosen": 0.45166015625,
+ "logits/rejected": 0.46630859375,
+ "logps/chosen": -373.5,
+ "logps/rejected": -412.5,
+ "loss": 0.6938,
+ "rewards/accuracies": 0.1093750074505806,
+ "rewards/chosen": -0.02127838134765625,
+ "rewards/margins": 0.0022153854370117188,
+ "rewards/rejected": -0.0235137939453125,
+ "step": 5
+ },
+ {
+ "epoch": 0.5714285714285714,
+ "grad_norm": 8.178302764892578,
+ "learning_rate": 6.281416799501187e-07,
+ "logits/chosen": 0.533203125,
+ "logits/rejected": 0.544921875,
+ "logps/chosen": -387.0,
+ "logps/rejected": -442.5,
+ "loss": 0.6883,
+ "rewards/accuracies": 0.1223958358168602,
+ "rewards/chosen": -0.029693603515625,
+ "rewards/margins": 0.0110931396484375,
+ "rewards/rejected": -0.040740966796875,
+ "step": 6
+ },
+ {
+ "epoch": 0.6666666666666666,
+ "grad_norm": 9.006941795349121,
+ "learning_rate": 4.7185832004988133e-07,
+ "logits/chosen": 0.5537109375,
+ "logits/rejected": 0.5390625,
+ "logps/chosen": -368.0,
+ "logps/rejected": -418.0,
+ "loss": 0.6898,
+ "rewards/accuracies": 0.140625,
+ "rewards/chosen": -0.062744140625,
+ "rewards/margins": 0.009137153625488281,
+ "rewards/rejected": -0.0718994140625,
+ "step": 7
+ },
+ {
+ "epoch": 0.7619047619047619,
+ "grad_norm": 6.875649929046631,
+ "learning_rate": 3.250000000000001e-07,
+ "logits/chosen": 0.5703125,
+ "logits/rejected": 0.5859375,
+ "logps/chosen": -377.5,
+ "logps/rejected": -427.0,
+ "loss": 0.6864,
+ "rewards/accuracies": 0.1614583432674408,
+ "rewards/chosen": -0.06878662109375,
+ "rewards/margins": 0.01445770263671875,
+ "rewards/rejected": -0.083251953125,
+ "step": 8
+ },
+ {
+ "epoch": 0.8571428571428571,
+ "grad_norm": 7.995317459106445,
+ "learning_rate": 2.0528000059645995e-07,
+ "logits/chosen": 0.5068359375,
+ "logits/rejected": 0.544921875,
+ "logps/chosen": -388.0,
+ "logps/rejected": -447.0,
+ "loss": 0.6867,
+ "rewards/accuracies": 0.1822916716337204,
+ "rewards/chosen": -0.0706787109375,
+ "rewards/margins": 0.016702651977539062,
+ "rewards/rejected": -0.0875244140625,
+ "step": 9
+ },
+ {
+ "epoch": 0.9523809523809523,
+ "grad_norm": 9.462100982666016,
+ "learning_rate": 1.2713832064634125e-07,
+ "logits/chosen": 0.5576171875,
+ "logits/rejected": 0.560546875,
+ "logps/chosen": -362.0,
+ "logps/rejected": -437.0,
+ "loss": 0.6829,
+ "rewards/accuracies": 0.1953125,
+ "rewards/chosen": -0.0859375,
+ "rewards/margins": 0.022670745849609375,
+ "rewards/rejected": -0.108642578125,
+ "step": 10
+ },
+ {
+ "epoch": 0.9523809523809523,
+ "step": 10,
+ "total_flos": 0.0,
+ "train_loss": 0.6895670831203461,
+ "train_runtime": 273.9395,
+ "train_samples_per_second": 14.602,
+ "train_steps_per_second": 0.037
+ }
+ ],
+ "logging_steps": 1,
+ "max_steps": 10,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 0.0,
+ "train_batch_size": 16,
+ "trial_name": null,
+ "trial_params": null
+ }
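For context on the log fields above: in TRL's DPO trainer, `rewards/chosen` and `rewards/rejected` are the β-scaled log-probability ratios of the policy against the frozen reference model, `rewards/margins` is their difference, and the per-example sigmoid DPO loss is `-logsigmoid(margin)`. Plugging the batch-mean margin from step 1 into that formula roughly reproduces the logged loss (it is not exact because the logged loss averages per-example losses rather than applying the formula to the mean margin):

```python
import math

# rewards/margins at step 1, copied from the log history above.
margin = -0.009382247924804688

# Sigmoid DPO loss per example: -log(sigmoid(beta * (chosen log-ratio - rejected log-ratio))).
# The logged reward fields already include the beta factor, so the mean margin can be
# used directly as an approximation.
loss = -math.log(1.0 / (1.0 + math.exp(-margin)))
print(round(loss, 4))  # 0.6978 -- close to the logged step-1 loss of 0.6982
```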