Spaces:
Paused
Paused
KuangDW
committed on
Commit
·
8dfab00
1
Parent(s):
2e5836c
add embed.sh and cython file
Browse files- laser/.gitignore +0 -1
- laser/tasks/embed/README.md +44 -0
- laser/tasks/embed/embed.sh +79 -0
- vecalign/.gitignore +1 -2
laser/.gitignore
CHANGED
|
@@ -3,7 +3,6 @@ source/lib/__pycache__
|
|
| 3 |
models
|
| 4 |
tools-external
|
| 5 |
tasks/mldoc/MLDoc
|
| 6 |
-
embed
|
| 7 |
tasks/bucc/downloaded
|
| 8 |
tasks/similarity/dev/
|
| 9 |
tasks/xnli/XNLI-1.0*
|
|
|
|
| 3 |
models
|
| 4 |
tools-external
|
| 5 |
tasks/mldoc/MLDoc
|
|
|
|
| 6 |
tasks/bucc/downloaded
|
| 7 |
tasks/similarity/dev/
|
| 8 |
tasks/xnli/XNLI-1.0*
|
laser/tasks/embed/README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# LASER: calculation of sentence embeddings
|
| 2 |
+
|
| 3 |
+
Tool to calculate sentence embeddings for an arbitrary text file:
|
| 4 |
+
```
|
| 5 |
+
bash ./embed.sh INPUT-FILE OUTPUT-FILE [LANGUAGE]
|
| 6 |
+
```
|
| 7 |
+
|
| 8 |
+
The input will first be tokenized, and then sentence embeddings will be generated. If a `language` is specified,
|
| 9 |
+
then `embed.sh` will look for a language-specific LASER3 encoder using the format: `{model_dir}/laser3-{language}.v{version}.pt`.
|
| 10 |
+
Otherwise it will default to LASER2 which covers the same 93 languages as [the original LASER encoder](https://arxiv.org/pdf/1812.10464.pdf).
|
| 11 |
+
|
| 12 |
+
**NOTE:** please set the model location (`model_dir` in `embed.sh`) before running. We recommend downloading the models from the NLLB
|
| 13 |
+
release (see [here](/nllb/README.md)). Optionally you can also select the model version number for downloaded LASER3 models. This currently defaults to: `1` (initial release).
|
| 14 |
+
|
| 15 |
+
## Output format
|
| 16 |
+
|
| 17 |
+
The embeddings are stored in float32 matrices in raw binary format.
|
| 18 |
+
They can be read in Python by:
|
| 19 |
+
```
|
| 20 |
+
import numpy as np
|
| 21 |
+
dim = 1024
|
| 22 |
+
X = np.fromfile("my_embeddings.bin", dtype=np.float32, count=-1)
|
| 23 |
+
X.resize(X.shape[0] // dim, dim)
|
| 24 |
+
```
|
| 25 |
+
X is an N x 1024 matrix where N is the number of lines in the text file.
|
| 26 |
+
|
| 27 |
+
## Examples
|
| 28 |
+
|
| 29 |
+
In order to encode an input text in any of the 93 languages supported by LASER2 (e.g. Afrikaans, English, French):
|
| 30 |
+
```
|
| 31 |
+
./embed.sh input_file output_file
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
To use a language-specific encoder (if available), for example for Wolof, Hausa, or Irish:
|
| 35 |
+
```
|
| 36 |
+
./embed.sh input_file output_file wol_Latn
|
| 37 |
+
```
|
| 38 |
+
```
|
| 39 |
+
./embed.sh input_file output_file hau_Latn
|
| 40 |
+
```
|
| 41 |
+
```
|
| 42 |
+
./embed.sh input_file output_file gle_Latn
|
| 43 |
+
```
|
| 44 |
+
|
laser/tasks/embed/embed.sh
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#
# LASER  Language-Agnostic SEntence Representations
# is a toolkit to calculate multilingual sentence embeddings
# and to use them for document classification, bitext filtering
# and mining
#
# --------------------------------------------------------
#
# bash script to calculate sentence embeddings for an arbitrary
# text file
#
# Usage:
#   embed.sh input-file output-file [language]
#
# Environment:
#   LASER  must be set; the script runs ${LASER}/source/embed.py.
#
# Model selection:
#   - without [language]: ${model_dir}/laser2.pt and ${model_dir}/laser2.spm
#   - with    [language]: ${model_dir}/laser3-<language>.v<version>.pt, plus
#     ${model_dir}/laser3-<language>.v<version>.spm when that file exists
#     and is non-empty (otherwise the laser2 spm is kept).
#
# Exit status: 1 on any configuration/argument error (message on stderr),
# otherwise the exit status of the python3 invocation.
#
# All expansions are quoted so that paths containing whitespace or glob
# characters are passed through intact.

#############################
# BEGIN PARAMETERS TO SET
#############################
# location of models (e.g. /path/to/models); no trailing slash
model_dir="laser"

# version number for LASER3 models
version=1
#############################
# END PARAMETERS TO SET
#############################

if [[ -z "${model_dir}" ]]; then
  echo "Please set model directory within script" >&2
  exit 1
elif [[ ! -d "${model_dir}" ]]; then
  echo "Can't find model directory: $model_dir" >&2
  exit 1
fi

if [[ -z "${LASER}" ]]; then
  echo "Please set the environment variable 'LASER'" >&2
  exit 1
fi

if [[ $# -lt 2 ]]; then
  echo "usage: embed.sh input-file output-file [language]" >&2
  exit 1
fi

infile=$1
outfile=$2
# optional third argument; empty when not supplied
language=${3:-}

# default to laser2
model_file="${model_dir}/laser2.pt"
spm="${model_dir}/laser2.spm"

if [[ -n "${language}" ]]; then
  model_file="${model_dir}/laser3-${language}.v${version}.pt"
  lang_specific_spm="${model_dir}/laser3-${language}.v${version}.spm"
  # only switch to the language-specific spm when it exists and is
  # non-empty; otherwise keep the laser2 spm selected above
  if [[ -s "${lang_specific_spm}" ]]; then
    spm="${lang_specific_spm}"
  fi
fi

# -s: the file must exist AND be non-empty
if [[ ! -s "${model_file}" ]]; then
  echo "couldn't find model file: $model_file" >&2
  exit 1
fi

if [[ ! -s "${spm}" ]]; then
  echo "couldn't find spm: $spm" >&2
  exit 1
fi

python3 "${LASER}/source/embed.py" \
  --input "${infile}" \
  --encoder "${model_file}" \
  --spm-model "${spm}" \
  --output "${outfile}" \
  --verbose
|
vecalign/.gitignore
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
build/
|
| 2 |
-
dp_core.c*
|
| 3 |
dp_core.html
|
| 4 |
__pycache__/
|
| 5 |
.idea
|
|
@@ -7,4 +6,4 @@ __pycache__/
|
|
| 7 |
.pytest_cache/
|
| 8 |
venv/
|
| 9 |
fairseq/
|
| 10 |
-
scores/
|
|
|
|
| 1 |
build/
|
|
|
|
| 2 |
dp_core.html
|
| 3 |
__pycache__/
|
| 4 |
.idea
|
|
|
|
| 6 |
.pytest_cache/
|
| 7 |
venv/
|
| 8 |
fairseq/
|
| 9 |
+
scores/
|