umarigan committed
Commit f385cf6 · verified · 1 Parent(s): 73a53ab

Update README.md

Files changed (1):
  1. README.md +180 -0
README.md CHANGED
@@ -10,6 +10,9 @@ tags:
  license: apache-2.0
  language:
  - en
+ - tr
+ datasets:
+ - umarigan/OpenThoughts-43k-TR
  ---

  # Uploaded model
@@ -21,3 +24,180 @@ language:
  This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.

  [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
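+
+ Training setup (illustrative sketch only, not the original training script): the note above only says the model was fine-tuned with Unsloth and TRL on umarigan/OpenThoughts-43k-TR. A typical Unsloth + TRL SFT run looks like the following; the base checkpoint, LoRA settings, hyperparameters, and the "text" column name are assumptions, not values taken from this repository.
+
+ ```python
+ from datasets import load_dataset
+ from transformers import TrainingArguments
+ from trl import SFTTrainer
+ from unsloth import FastLanguageModel
+
+ max_seq_length = 4096  # assumption
+
+ # Load a 4-bit base checkpoint through Unsloth (base model name is an assumption)
+ model, tokenizer = FastLanguageModel.from_pretrained(
+     model_name="unsloth/Meta-Llama-3.1-8B-Instruct",
+     max_seq_length=max_seq_length,
+     load_in_4bit=True,
+ )
+
+ # Attach LoRA adapters with typical Unsloth defaults (assumed, not from this repo)
+ model = FastLanguageModel.get_peft_model(
+     model,
+     r=16,
+     lora_alpha=16,
+     target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                     "gate_proj", "up_proj", "down_proj"],
+ )
+
+ # Assumes the dataset exposes a ready-to-train "text" column
+ dataset = load_dataset("umarigan/OpenThoughts-43k-TR", split="train")
+
+ # Older TRL-style API (tokenizer / dataset_text_field passed directly),
+ # as used in the Unsloth notebooks; newer TRL moves these into SFTConfig.
+ trainer = SFTTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=dataset,
+     dataset_text_field="text",
+     max_seq_length=max_seq_length,
+     args=TrainingArguments(
+         per_device_train_batch_size=2,
+         gradient_accumulation_steps=4,
+         num_train_epochs=1,
+         learning_rate=2e-4,
+         bf16=True,
+         output_dir="outputs",
+     ),
+ )
+ trainer.train()
+ ```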
+
+
+ Eval results:
+ - arc-tr = 57.68%
+ - truthful_qa-tr (mc1) = 43.45%
+ - truthful_qa-tr (mc2) = 22.15%
+
+ Use the following code to reproduce these results:
+
+ ```python
+
+ import re
+
+ import torch
+ from datasets import load_dataset
+ from transformers import pipeline
+
+ model_id = "umarigan/llama-3.2-8B-R1-Tr"
+ pipe = pipeline(
+     "text-generation",
+     model=model_id,
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+ )
+
+ # ARC-TR
+ ds = load_dataset("mukayese/arc-tr", split="test")
+
+ def extract_answer(text):
+     """Extract the first A-D label that appears in the generated text."""
+     match = re.search(r'\b([A-D])\b', text, re.IGNORECASE)
+     return match.group(1).upper() if match else None
+
+ total = 0
+ correct = 0
+
+ for example in ds:
+     # Format the question and choices
+     question = example["question"]
+     choices = "\n".join([f"{label}) {text}" for label, text in
+                          zip(example["choices"]["label"], example["choices"]["text"])])
+
+     # Create prompt with explicit instruction
+     prompt = f"""Answer this multiple-choice question by providing ONLY the letter corresponding to the correct answer (A, B, C, or D). Do not include any explanation.
+
+ Question: {question}
+ Options:
+ {choices}
+ Answer:"""
+
+     # Generate response
+     messages = [{"role": "user", "content": prompt}]
+     try:
+         outputs = pipe(
+             messages,
+             max_new_tokens=5,  # limit response length to get just the answer
+             do_sample=False,   # greedy decoding for deterministic answers
+         )
+         response = outputs[0]["generated_text"][-1]["content"]
+         predicted = extract_answer(response)
+         answer = example["answerKey"]
+
+         # Update counters
+         total += 1
+         if predicted == answer:
+             correct += 1
+
+     except Exception as e:
+         print(f"Error processing example: {e}")
+         continue
+
+ # Print results
+ print("\nBenchmark Results:")
+ print(f"Total questions processed: {total}")
+ print(f"Correct answers: {correct}")
+ print(f"Accuracy: {correct/total:.2%}" if total > 0 else "No questions processed")
+ # Output:
+ # Benchmark Results:
+ # Total questions processed: 1172
+ # Correct answers: 676
+ # Accuracy: 57.68%
+
+
+ # TRUTHFUL-TR
+ ds2 = load_dataset("mukayese/truthful_qa-tr", split="validation")
+
+ def extract_answer_mc(text, valid_labels):
+     """Extract the first valid label that appears in the generated text."""
+     # Regex pattern that matches any of the valid labels
+     pattern = r'\b(' + '|'.join(valid_labels) + r')\b'
+     match = re.search(pattern, text, re.IGNORECASE)
+     return match.group(1).upper() if match else None
+
+ def evaluate_mc(example, targets_key="mc1_targets"):
+     """Evaluate a single multiple-choice example with a variable number of choices."""
+     question = example["question"]
+     choices = example[targets_key]["choices"]
+     labels = example[targets_key]["labels"]
+
+     # Generate option labels dynamically (A, B, C, ...)
+     option_labels = [chr(65 + i) for i in range(len(choices))]
+
+     # Create prompt with explicit instruction
+     options_text = "\n".join([f"{label}) {text}" for label, text in zip(option_labels, choices)])
+     prompt = f"""Answer this multiple-choice question by selecting the most correct option. Provide only the letter corresponding to your choice ({', '.join(option_labels)}).
+
+ Question: {question}
+ Options:
+ {options_text}
+ Answer:"""
+
+     # Generate response
+     messages = [{"role": "user", "content": prompt}]
+     try:
+         outputs = pipe(
+             messages,
+             max_new_tokens=5,  # limit response length to get just the answer
+             do_sample=False,   # greedy decoding for deterministic answers
+         )
+         response = outputs[0]["generated_text"][-1]["content"]
+
+         # Extract predicted label
+         predicted = extract_answer_mc(response, option_labels)
+         if predicted is None:
+             return 0  # count as incorrect if no valid answer
+
+         # Correct answer: index of the first label marked 1
+         # (mc2_targets can mark several choices as correct; only the first is counted here)
+         correct_idx = labels.index(1)
+         correct_label = option_labels[correct_idx]
+
+         return int(predicted == correct_label)
+
+     except Exception as e:
+         print(f"Error processing example: {e}")
+         return 0
+
+ # Evaluate on both mc1 and mc2 targets
+ mc1_scores = []
+ mc2_scores = []
+
+ for example in ds2:
+     mc1_scores.append(evaluate_mc(example, "mc1_targets"))
+     mc2_scores.append(evaluate_mc(example, "mc2_targets"))
+
+ # Calculate metrics
+ def calculate_metrics(scores):
+     total = len(scores)
+     correct = sum(scores)
+     accuracy = correct / total if total > 0 else 0
+     return total, correct, accuracy
+
+ mc1_total, mc1_correct, mc1_accuracy = calculate_metrics(mc1_scores)
+ mc2_total, mc2_correct, mc2_accuracy = calculate_metrics(mc2_scores)
+
+ # Print results
+ print("\nBenchmark Results:")
+ print("MC1 Targets:")
+ print(f"Total questions: {mc1_total}")
+ print(f"Correct answers: {mc1_correct}")
+ print(f"Accuracy: {mc1_accuracy:.2%}")
+ print("\nMC2 Targets:")
+ print(f"Total questions: {mc2_total}")
+ print(f"Correct answers: {mc2_correct}")
+ print(f"Accuracy: {mc2_accuracy:.2%}")
+
+ # Output:
+ # MC1 Targets:
+ # Total questions: 817
+ # Correct answers: 355
+ # Accuracy: 43.45%
+
+ # MC2 Targets:
+ # Total questions: 817
+ # Correct answers: 181
+ # Accuracy: 22.15%
+ ```
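+
+ Basic usage (illustrative; it mirrors the text-generation pipeline already used in the evaluation code above, and the Turkish prompt is only an example):
+
+ ```python
+ import torch
+ from transformers import pipeline
+
+ pipe = pipeline(
+     "text-generation",
+     model="umarigan/llama-3.2-8B-R1-Tr",
+     torch_dtype=torch.bfloat16,
+     device_map="auto",
+ )
+
+ messages = [{"role": "user", "content": "Türkiye'nin en kalabalık şehri hangisidir? Kısaca açıkla."}]
+ outputs = pipe(messages, max_new_tokens=256, do_sample=False)
+ print(outputs[0]["generated_text"][-1]["content"])
+ ```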