Ackerman-2k24 committed on
Commit
39ac250
·
verified ·
1 Parent(s): 59ef469

Upload 8 files

Browse files
README.md ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: multilingual
3
+ widget:
4
+ - text: "🤗"
5
+ - text: "T'estimo! ❤️"
6
+ - text: "I love you!"
7
+ - text: "I hate you 🤮"
8
+ - text: "Mahal kita!"
9
+ - text: "사랑해!"
10
+ - text: "난 너가 싫어"
11
+ - text: "😍😍😍"
12
+ ---
13
+
14
+
15
+ # twitter-XLM-roBERTa-base for Sentiment Analysis
16
+
17
+ This is a multilingual XLM-roBERTa-base model trained on ~198M tweets and finetuned for sentiment analysis. The sentiment fine-tuning was done on 8 languages (Ar, En, Fr, De, Hi, It, Sp, Pt) but it can be used for more languages (see paper for details).
18
+
19
+ - Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://arxiv.org/abs/2104.12250).
20
+ - Git Repo: [XLM-T official repository](https://github.com/cardiffnlp/xlm-t).
21
+
22
+ This model has been integrated into the [TweetNLP library](https://github.com/cardiffnlp/tweetnlp).
23
+
24
+ ## Example Pipeline
25
+ ```python
26
+ from transformers import pipeline
27
+ model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
28
+ sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
29
+ sentiment_task("T'estimo!")
30
+ ```
31
+ ```
32
+ [{'label': 'Positive', 'score': 0.6600581407546997}]
33
+ ```
34
+
35
+ ## Full classification example
36
+
37
+ ```python
38
+ from transformers import AutoModelForSequenceClassification
39
+ from transformers import TFAutoModelForSequenceClassification
40
+ from transformers import AutoTokenizer, AutoConfig
41
+ import numpy as np
42
+ from scipy.special import softmax
43
+
44
+ # Preprocess text (username and link placeholders)
45
+ def preprocess(text):
46
+ new_text = []
47
+ for t in text.split(" "):
48
+ t = '@user' if t.startswith('@') and len(t) > 1 else t
49
+ t = 'http' if t.startswith('http') else t
50
+ new_text.append(t)
51
+ return " ".join(new_text)
52
+
53
+ MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"
54
+
55
+ tokenizer = AutoTokenizer.from_pretrained(MODEL)
56
+ config = AutoConfig.from_pretrained(MODEL)
57
+
58
+ # PT
59
+ model = AutoModelForSequenceClassification.from_pretrained(MODEL)
60
+ model.save_pretrained(MODEL)
61
+
62
+ text = "Good night 😊"
63
+ text = preprocess(text)
64
+ encoded_input = tokenizer(text, return_tensors='pt')
65
+ output = model(**encoded_input)
66
+ scores = output[0][0].detach().numpy()
67
+ scores = softmax(scores)
68
+
69
+ # # TF
70
+ # model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
71
+ # model.save_pretrained(MODEL)
72
+
73
+ # text = "Good night 😊"
74
+ # encoded_input = tokenizer(text, return_tensors='tf')
75
+ # output = model(encoded_input)
76
+ # scores = output[0][0].numpy()
77
+ # scores = softmax(scores)
78
+
79
+ # Print labels and scores
80
+ ranking = np.argsort(scores)
81
+ ranking = ranking[::-1]
82
+ for i in range(scores.shape[0]):
83
+ l = config.id2label[ranking[i]]
84
+ s = scores[ranking[i]]
85
+ print(f"{i+1}) {l} {np.round(float(s), 4)}")
86
+
87
+ ```
88
+
89
+ Output:
90
+
91
+ ```
92
+ 1) Positive 0.7673
93
+ 2) Neutral 0.2015
94
+ 3) Negative 0.0313
95
+ ```
96
+
97
+ ### Reference
98
+ ```
99
+ @inproceedings{barbieri-etal-2022-xlm,
100
+ title = "{XLM}-{T}: Multilingual Language Models in {T}witter for Sentiment Analysis and Beyond",
101
+ author = "Barbieri, Francesco and
102
+ Espinosa Anke, Luis and
103
+ Camacho-Collados, Jose",
104
+ booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
105
+ month = jun,
106
+ year = "2022",
107
+ address = "Marseille, France",
108
+ publisher = "European Language Resources Association",
109
+ url = "https://aclanthology.org/2022.lrec-1.27",
110
+ pages = "258--266"
111
+ }
112
+
113
+ ```
114
+
config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/xlm-twitter/local-twitter-xlm-roberta-base-sentiment/",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 2,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "negative",
15
+ "1": "neutral",
16
+ "2": "positive"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "negative": 0,
22
+ "neutral": 1,
23
+ "positive": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "xlm-roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 12,
30
+ "output_past": true,
31
+ "pad_token_id": 1,
32
+ "type_vocab_size": 1,
33
+ "vocab_size": 250002
34
+ }
gitattributes ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.pb filter=lfs diff=lfs merge=lfs -text
15
+ *.pt filter=lfs diff=lfs merge=lfs -text
16
+ *.pth filter=lfs diff=lfs merge=lfs -text
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93316a86051c359748c5d5453e7660c69a21a57cfb477892f95f539e3e171196
3
+ size 1112271561
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
test1.ipynb ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "b7da4b7f-babc-47ab-82bb-cfd31c2531e0",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig\n",
11
+ "from scipy.special import softmax\n"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "2e109896-aba6-464b-805b-2f87dc4f61c4",
18
+ "metadata": {},
19
+ "outputs": [
20
+ {
21
+ "name": "stdout",
22
+ "output_type": "stream",
23
+ "text": [
24
+ "<class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>\n"
25
+ ]
26
+ }
27
+ ],
28
+ "source": [
29
+ "import sentencepiece\n",
30
+ "from transformers import AutoTokenizer\n",
31
+ "\n",
32
+ "tokenizer = AutoTokenizer.from_pretrained(\n",
33
+ " r\"C:\\Users\\frann\\Downloads\\sentiment-analysis\"\n",
34
+ ")\n",
35
+ "print(type(tokenizer))\n"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 12,
41
+ "id": "46f22dfc-a6b1-4fa1-9bee-ad1718fd8108",
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "name": "stdout",
46
+ "output_type": "stream",
47
+ "text": [
48
+ "positive: 0.9313\n",
49
+ "neutral: 0.0408\n",
50
+ "negative: 0.0279\n"
51
+ ]
52
+ }
53
+ ],
54
+ "source": [
55
+ "from transformers import AutoModelForSequenceClassification, AutoConfig\n",
56
+ "from scipy.special import softmax\n",
57
+ "\n",
58
+ "local = r\"C:\\Users\\frann\\Downloads\\sentiment-analysis\"\n",
59
+ "tokenizer = AutoTokenizer.from_pretrained(local, use_fast=False)\n",
60
+ "config = AutoConfig.from_pretrained(local)\n",
61
+ "model = AutoModelForSequenceClassification.from_pretrained(local)\n",
62
+ "\n",
63
+ "text = \"j'ai été très ému \"\n",
64
+ "inputs = tokenizer(text, return_tensors=\"pt\")\n",
65
+ "outputs = model(**inputs)\n",
66
+ "scores = softmax(outputs.logits.detach().numpy()[0])\n",
67
+ "\n",
68
+ "for idx in scores.argsort()[::-1]:\n",
69
+ " print(f\"{config.id2label[idx]}: {scores[idx]:.4f}\")\n"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": null,
75
+ "id": "e4a1bb6a-4c8f-40df-97f9-69f2506e5b6f",
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": []
79
+ }
80
+ ],
81
+ "metadata": {
82
+ "kernelspec": {
83
+ "display_name": "Python 3 (ipykernel)",
84
+ "language": "python",
85
+ "name": "python3"
86
+ },
87
+ "language_info": {
88
+ "codemirror_mode": {
89
+ "name": "ipython",
90
+ "version": 3
91
+ },
92
+ "file_extension": ".py",
93
+ "mimetype": "text/x-python",
94
+ "name": "python",
95
+ "nbconvert_exporter": "python",
96
+ "pygments_lexer": "ipython3",
97
+ "version": "3.11.7"
98
+ }
99
+ },
100
+ "nbformat": 4,
101
+ "nbformat_minor": 5
102
+ }
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52c751d49a0c68a9d14ef218053b1baee3d5713dc6d22ca7ec05486f9337cdf1
3
+ size 1114822968