Upload 8 files
Browse files- README.md +114 -0
- config.json +34 -0
- gitattributes +16 -0
- pytorch_model.bin +3 -0
- sentencepiece.bpe.model +3 -0
- special_tokens_map.json +1 -0
- test1.ipynb +102 -0
- tf_model.h5 +3 -0
README.md
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: multilingual
|
| 3 |
+
widget:
|
| 4 |
+
- text: "🤗"
|
| 5 |
+
- text: "T'estimo! ❤️"
|
| 6 |
+
- text: "I love you!"
|
| 7 |
+
- text: "I hate you 🤮"
|
| 8 |
+
- text: "Mahal kita!"
|
| 9 |
+
- text: "사랑해!"
|
| 10 |
+
- text: "난 너가 싫어"
|
| 11 |
+
- text: "😍😍😍"
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# twitter-XLM-roBERTa-base for Sentiment Analysis
|
| 16 |
+
|
| 17 |
+
This is a multilingual XLM-roBERTa-base model trained on ~198M tweets and fine-tuned for sentiment analysis. The sentiment fine-tuning was done on 8 languages (Ar, En, Fr, De, Hi, It, Es, Pt) but it can be used for more languages (see paper for details).
|
| 18 |
+
|
| 19 |
+
- Paper: [XLM-T: A Multilingual Language Model Toolkit for Twitter](https://arxiv.org/abs/2104.12250).
|
| 20 |
+
- Git Repo: [XLM-T official repository](https://github.com/cardiffnlp/xlm-t).
|
| 21 |
+
|
| 22 |
+
This model has been integrated into the [TweetNLP library](https://github.com/cardiffnlp/tweetnlp).
|
| 23 |
+
|
| 24 |
+
## Example Pipeline
|
| 25 |
+
```python
|
| 26 |
+
from transformers import pipeline
|
| 27 |
+
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
|
| 28 |
+
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)
|
| 29 |
+
sentiment_task("T'estimo!")
|
| 30 |
+
```
|
| 31 |
+
```
|
| 32 |
+
[{'label': 'positive', 'score': 0.6600581407546997}]
|
| 33 |
+
```
|
| 34 |
+
|
| 35 |
+
## Full classification example
|
| 36 |
+
|
| 37 |
+
```python
|
| 38 |
+
from transformers import AutoModelForSequenceClassification
|
| 39 |
+
from transformers import TFAutoModelForSequenceClassification
|
| 40 |
+
from transformers import AutoTokenizer, AutoConfig
|
| 41 |
+
import numpy as np
|
| 42 |
+
from scipy.special import softmax
|
| 43 |
+
|
| 44 |
+
# Preprocess text (username and link placeholders)
|
| 45 |
+
def preprocess(text):
|
| 46 |
+
new_text = []
|
| 47 |
+
for t in text.split(" "):
|
| 48 |
+
t = '@user' if t.startswith('@') and len(t) > 1 else t
|
| 49 |
+
t = 'http' if t.startswith('http') else t
|
| 50 |
+
new_text.append(t)
|
| 51 |
+
return " ".join(new_text)
|
| 52 |
+
|
| 53 |
+
MODEL = f"cardiffnlp/twitter-xlm-roberta-base-sentiment"
|
| 54 |
+
|
| 55 |
+
tokenizer = AutoTokenizer.from_pretrained(MODEL)
|
| 56 |
+
config = AutoConfig.from_pretrained(MODEL)
|
| 57 |
+
|
| 58 |
+
# PT
|
| 59 |
+
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
|
| 60 |
+
model.save_pretrained(MODEL)
|
| 61 |
+
|
| 62 |
+
text = "Good night 😊"
|
| 63 |
+
text = preprocess(text)
|
| 64 |
+
encoded_input = tokenizer(text, return_tensors='pt')
|
| 65 |
+
output = model(**encoded_input)
|
| 66 |
+
scores = output[0][0].detach().numpy()
|
| 67 |
+
scores = softmax(scores)
|
| 68 |
+
|
| 69 |
+
# # TF
|
| 70 |
+
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
|
| 71 |
+
# model.save_pretrained(MODEL)
|
| 72 |
+
|
| 73 |
+
# text = "Good night 😊"
|
| 74 |
+
# encoded_input = tokenizer(text, return_tensors='tf')
|
| 75 |
+
# output = model(encoded_input)
|
| 76 |
+
# scores = output[0][0].numpy()
|
| 77 |
+
# scores = softmax(scores)
|
| 78 |
+
|
| 79 |
+
# Print labels and scores
|
| 80 |
+
ranking = np.argsort(scores)
|
| 81 |
+
ranking = ranking[::-1]
|
| 82 |
+
for i in range(scores.shape[0]):
|
| 83 |
+
l = config.id2label[ranking[i]]
|
| 84 |
+
s = scores[ranking[i]]
|
| 85 |
+
print(f"{i+1}) {l} {np.round(float(s), 4)}")
|
| 86 |
+
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
Output:
|
| 90 |
+
|
| 91 |
+
```
|
| 92 |
+
1) positive 0.7673
|
| 93 |
+
2) neutral 0.2015
|
| 94 |
+
3) negative 0.0313
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
### Reference
|
| 98 |
+
```
|
| 99 |
+
@inproceedings{barbieri-etal-2022-xlm,
|
| 100 |
+
title = "{XLM}-{T}: Multilingual Language Models in {T}witter for Sentiment Analysis and Beyond",
|
| 101 |
+
author = "Barbieri, Francesco and
|
| 102 |
+
Espinosa Anke, Luis and
|
| 103 |
+
Camacho-Collados, Jose",
|
| 104 |
+
booktitle = "Proceedings of the Thirteenth Language Resources and Evaluation Conference",
|
| 105 |
+
month = jun,
|
| 106 |
+
year = "2022",
|
| 107 |
+
address = "Marseille, France",
|
| 108 |
+
publisher = "European Language Resources Association",
|
| 109 |
+
url = "https://aclanthology.org/2022.lrec-1.27",
|
| 110 |
+
pages = "258--266"
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
```
|
| 114 |
+
|
config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "/home/jupyter/misc/tweeteval/TweetEval_models/xlm-twitter/local-twitter-xlm-roberta-base-sentiment/",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"XLMRobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"eos_token_id": 2,
|
| 9 |
+
"gradient_checkpointing": false,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_dropout_prob": 0.1,
|
| 12 |
+
"hidden_size": 768,
|
| 13 |
+
"id2label": {
|
| 14 |
+
"0": "negative",
|
| 15 |
+
"1": "neutral",
|
| 16 |
+
"2": "positive"
|
| 17 |
+
},
|
| 18 |
+
"initializer_range": 0.02,
|
| 19 |
+
"intermediate_size": 3072,
|
| 20 |
+
"label2id": {
|
| 21 |
+
"negative": 0,
|
| 22 |
+
"neutral": 1,
|
| 23 |
+
"positive": 2
|
| 24 |
+
},
|
| 25 |
+
"layer_norm_eps": 1e-05,
|
| 26 |
+
"max_position_embeddings": 514,
|
| 27 |
+
"model_type": "xlm-roberta",
|
| 28 |
+
"num_attention_heads": 12,
|
| 29 |
+
"num_hidden_layers": 12,
|
| 30 |
+
"output_past": true,
|
| 31 |
+
"pad_token_id": 1,
|
| 32 |
+
"type_vocab_size": 1,
|
| 33 |
+
"vocab_size": 250002
|
| 34 |
+
}
|
gitattributes
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:93316a86051c359748c5d5453e7660c69a21a57cfb477892f95f539e3e171196
|
| 3 |
+
size 1112271561
|
sentencepiece.bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
|
| 3 |
+
size 5069051
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"bos_token": "<s>", "eos_token": "</s>", "unk_token": "<unk>", "sep_token": "</s>", "pad_token": "<pad>", "cls_token": "<s>", "mask_token": "<mask>"}
|
test1.ipynb
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "b7da4b7f-babc-47ab-82bb-cfd31c2531e0",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoConfig\n",
|
| 11 |
+
"from scipy.special import softmax\n"
|
| 12 |
+
]
|
| 13 |
+
},
|
| 14 |
+
{
|
| 15 |
+
"cell_type": "code",
|
| 16 |
+
"execution_count": 2,
|
| 17 |
+
"id": "2e109896-aba6-464b-805b-2f87dc4f61c4",
|
| 18 |
+
"metadata": {},
|
| 19 |
+
"outputs": [
|
| 20 |
+
{
|
| 21 |
+
"name": "stdout",
|
| 22 |
+
"output_type": "stream",
|
| 23 |
+
"text": [
|
| 24 |
+
"<class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>\n"
|
| 25 |
+
]
|
| 26 |
+
}
|
| 27 |
+
],
|
| 28 |
+
"source": [
|
| 29 |
+
"import sentencepiece\n",
|
| 30 |
+
"from transformers import AutoTokenizer\n",
|
| 31 |
+
"\n",
|
| 32 |
+
"tokenizer = AutoTokenizer.from_pretrained(\n",
|
| 33 |
+
" r\"C:\\Users\\frann\\Downloads\\sentiment-analysis\"\n",
|
| 34 |
+
")\n",
|
| 35 |
+
"print(type(tokenizer))\n"
|
| 36 |
+
]
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"cell_type": "code",
|
| 40 |
+
"execution_count": 12,
|
| 41 |
+
"id": "46f22dfc-a6b1-4fa1-9bee-ad1718fd8108",
|
| 42 |
+
"metadata": {},
|
| 43 |
+
"outputs": [
|
| 44 |
+
{
|
| 45 |
+
"name": "stdout",
|
| 46 |
+
"output_type": "stream",
|
| 47 |
+
"text": [
|
| 48 |
+
"positive: 0.9313\n",
|
| 49 |
+
"neutral: 0.0408\n",
|
| 50 |
+
"negative: 0.0279\n"
|
| 51 |
+
]
|
| 52 |
+
}
|
| 53 |
+
],
|
| 54 |
+
"source": [
|
| 55 |
+
"from transformers import AutoModelForSequenceClassification, AutoConfig\n",
|
| 56 |
+
"from scipy.special import softmax\n",
|
| 57 |
+
"\n",
|
| 58 |
+
"local = r\"C:\\Users\\frann\\Downloads\\sentiment-analysis\"\n",
|
| 59 |
+
"tokenizer = AutoTokenizer.from_pretrained(local, use_fast=False)\n",
|
| 60 |
+
"config = AutoConfig.from_pretrained(local)\n",
|
| 61 |
+
"model = AutoModelForSequenceClassification.from_pretrained(local)\n",
|
| 62 |
+
"\n",
|
| 63 |
+
"text = \"j'ai été très ému \"\n",
|
| 64 |
+
"inputs = tokenizer(text, return_tensors=\"pt\")\n",
|
| 65 |
+
"outputs = model(**inputs)\n",
|
| 66 |
+
"scores = softmax(outputs.logits.detach().numpy()[0])\n",
|
| 67 |
+
"\n",
|
| 68 |
+
"for idx in scores.argsort()[::-1]:\n",
|
| 69 |
+
" print(f\"{config.id2label[idx]}: {scores[idx]:.4f}\")\n"
|
| 70 |
+
]
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
"cell_type": "code",
|
| 74 |
+
"execution_count": null,
|
| 75 |
+
"id": "e4a1bb6a-4c8f-40df-97f9-69f2506e5b6f",
|
| 76 |
+
"metadata": {},
|
| 77 |
+
"outputs": [],
|
| 78 |
+
"source": []
|
| 79 |
+
}
|
| 80 |
+
],
|
| 81 |
+
"metadata": {
|
| 82 |
+
"kernelspec": {
|
| 83 |
+
"display_name": "Python 3 (ipykernel)",
|
| 84 |
+
"language": "python",
|
| 85 |
+
"name": "python3"
|
| 86 |
+
},
|
| 87 |
+
"language_info": {
|
| 88 |
+
"codemirror_mode": {
|
| 89 |
+
"name": "ipython",
|
| 90 |
+
"version": 3
|
| 91 |
+
},
|
| 92 |
+
"file_extension": ".py",
|
| 93 |
+
"mimetype": "text/x-python",
|
| 94 |
+
"name": "python",
|
| 95 |
+
"nbconvert_exporter": "python",
|
| 96 |
+
"pygments_lexer": "ipython3",
|
| 97 |
+
"version": "3.11.7"
|
| 98 |
+
}
|
| 99 |
+
},
|
| 100 |
+
"nbformat": 4,
|
| 101 |
+
"nbformat_minor": 5
|
| 102 |
+
}
|
tf_model.h5
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52c751d49a0c68a9d14ef218053b1baee3d5713dc6d22ca7ec05486f9337cdf1
|
| 3 |
+
size 1114822968
|