SayedShaun commited on
Commit
3e68a39
·
verified ·
1 Parent(s): 153e78d

Create finetune.py

Browse files
Files changed (1) hide show
  1. finetune.py +116 -0
finetune.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset, Dataset
2
+ import random
3
+ import numpy as np
4
+ from transformers import (
5
+ AutoTokenizer,
6
+ DataCollatorWithPadding,
7
+ AutoModelForSequenceClassification,
8
+ TrainingArguments,
9
+ Trainer,
10
+ PreTrainedTokenizer,
11
+ ElectraForSequenceClassification,
12
+ EarlyStoppingCallback
13
+ )
14
+ from dataclasses import dataclass
15
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
16
+
17
def label_mapper(label: str) -> int:
    """Map a polarity tag to its integer class id.

    Args:
        label: one of "SP", "WP", "WN", "SN", "NU".

    Returns:
        The class index 0-4 used by the 5-way classifier head.

    Raises:
        ValueError: if the label is not one of the five known tags.
    """
    class_ids = {"SP": 0, "WP": 1, "WN": 2, "SN": 3, "NU": 4}
    try:
        return class_ids[label]
    except KeyError:
        raise ValueError(f"Invalid label: {label}") from None
30
+
31
+
32
def process(batch: dict, tokenizer: PreTrainedTokenizer) -> dict:
    """Tokenize a batch of texts and attach mapped integer labels.

    Mutates and returns ``batch``, adding ``input_ids``, ``attention_mask``
    and ``labels`` keys; reads the ``Text`` and ``Polarity`` columns.
    """
    encoded = tokenizer(batch["Text"], truncation=True)
    batch["input_ids"] = encoded["input_ids"]
    batch["attention_mask"] = encoded["attention_mask"]
    batch["labels"] = [label_mapper(polarity) for polarity in batch["Polarity"]]
    return batch
39
+
40
+
41
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for Trainer eval.

    Args:
        eval_pred: a ``(logits, labels)`` pair as supplied by ``Trainer``.

    Returns:
        A dict with ``accuracy``, ``precision``, ``recall`` and ``f1`` keys.
    """
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro"
    )
    metrics = {"accuracy": accuracy_score(labels, preds)}
    metrics.update(precision=precision, recall=recall, f1=f1)
    return metrics
54
+
55
+
56
def pipeline(args):
    """Fine-tune a sequence-classification model on a 5-class polarity dataset.

    Loads the model/tokenizer named by ``args.model_name``, tokenizes and
    label-maps ``args.dataset_name``, trains with periodic evaluation and
    early stopping, then pushes the tokenizer and model to the Hub at
    ``args.hub_location``.
    """
    # num_labels=5 matches the five classes produced by label_mapper (SP/WP/WN/SN/NU).
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=5)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    dataset = load_dataset(args.dataset_name)
    dataset = dataset.map(process, batched=True, fn_kwargs={'tokenizer': tokenizer})
    # Only the "train" split is used; split_ratio is the held-out test fraction.
    dataset = dataset["train"].train_test_split(args.split_ratio)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]
    # Pad dynamically per batch rather than to a fixed global length.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            learning_rate=args.learning_rate,
            per_device_train_batch_size=args.batch_size,
            per_device_eval_batch_size=args.batch_size,
            num_train_epochs=args.epochs,
            weight_decay=0.01,
            # Evaluate/save every 500 steps; keep only the single best checkpoint.
            eval_strategy="steps",
            save_strategy="steps",
            # Reload the checkpoint with the highest macro-F1 when training ends,
            # so the push_to_hub below uploads the best model, not the last one.
            load_best_model_at_end=True,
            report_to="none",
            save_steps=500,
            eval_steps=500,
            save_total_limit=1,
            logging_steps=500,
            fp16=args.fp16,
            greater_is_better=True,
            metric_for_best_model="f1",
        ),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        # NOTE(review): processing_class requires a recent transformers release
        # (older versions used the `tokenizer=` argument) — confirm pinned version.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # Stop if macro-F1 fails to improve for 5 consecutive evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

    trainer.train()
    trainer.evaluate()
    # NOTE(review): the predict() return value is discarded — kept only for its
    # logged metrics; drop or capture it if the predictions are needed.
    trainer.predict(test_dataset)

    # Push to Hub
    tokenizer.push_to_hub(args.hub_location)
    model.push_to_hub(args.hub_location)
102
+
103
@dataclass
class Arguments:
    """Configuration for the fine-tuning run (model, data, hyperparameters)."""
    model_name: str = "csebuetnlp/banglabert"  # HF checkpoint to fine-tune
    dataset_name: str = "SayedShaun/sentigold"  # HF dataset with Text/Polarity columns
    split_ratio: float = 0.1  # fraction of the train split held out as test set
    batch_size: int = 128  # per-device train and eval batch size
    epochs: int = 40  # upper bound; early stopping usually ends training sooner
    learning_rate: float = 1e-5
    fp16: bool = True  # mixed-precision training; requires a CUDA GPU
    hub_location: str = "SayedShaun/bangla-classifier-multiclass"  # push target repo
113
+
114
if __name__ == "__main__":
    # Run the fine-tuning pipeline with the default configuration.
    pipeline(Arguments())