ionvop committed on
Commit 08b5f9a · verified · 1 Parent(s): 2662d11

Upload 110 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .dockerignore +3 -0
  2. .env-docker +8 -0
  3. .gitignore +8 -0
  4. Dockerfile +41 -13
  5. api-request.sh +59 -0
  6. assets-download.sh +52 -0
  7. docker-run.sh +16 -0
  8. docs/es/README.es.md +171 -0
  9. docs/jp/README.ja.md +119 -0
  10. poetry.lock +0 -0
  11. pyproject.toml +44 -0
  12. rvc/__init__.py +0 -0
  13. rvc/configs/__init__.py +1 -0
  14. rvc/configs/config.py +197 -0
  15. rvc/configs/v1/32k.json +46 -0
  16. rvc/configs/v1/40k.json +46 -0
  17. rvc/configs/v1/48k.json +46 -0
  18. rvc/configs/v2/32k.json +46 -0
  19. rvc/configs/v2/48k.json +46 -0
  20. rvc/lib/audio.py +70 -0
  21. rvc/lib/infer_pack/attentions.py +459 -0
  22. rvc/lib/infer_pack/commons.py +172 -0
  23. rvc/lib/infer_pack/models.py +1426 -0
  24. rvc/lib/infer_pack/models_onnx.py +821 -0
  25. rvc/lib/infer_pack/modules.py +615 -0
  26. rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +91 -0
  27. rvc/lib/infer_pack/modules/F0Predictor/F0Predictor.py +16 -0
  28. rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +87 -0
  29. rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +98 -0
  30. rvc/lib/infer_pack/modules/F0Predictor/__init__.py +0 -0
  31. rvc/lib/infer_pack/onnx_inference.py +149 -0
  32. rvc/lib/infer_pack/transforms.py +207 -0
  33. rvc/lib/ipex/__init__.py +182 -0
  34. rvc/lib/ipex/attention.py +206 -0
  35. rvc/lib/ipex/gradscaler.py +184 -0
  36. rvc/lib/ipex/hijacks.py +352 -0
  37. rvc/lib/jit/__init__.py +164 -0
  38. rvc/lib/jit/get_hubert.py +343 -0
  39. rvc/lib/jit/get_rmvpe.py +12 -0
  40. rvc/lib/jit/get_synthesizer.py +37 -0
  41. rvc/lib/rmvpe.py +665 -0
  42. rvc/lib/slicer2.py +260 -0
  43. rvc/lib/train/architecture/v1.yml +57 -0
  44. rvc/lib/train/architecture/v2.yml +38 -0
  45. rvc/lib/train/data_utils.py +517 -0
  46. rvc/lib/train/losses.py +58 -0
  47. rvc/lib/train/mel_processing.py +133 -0
  48. rvc/lib/train/process_ckpt.py +260 -0
  49. rvc/lib/train/utils.py +478 -0
  50. rvc/lib/uvr5_pack/lib_v5/dataset.py +183 -0
.dockerignore ADDED
@@ -0,0 +1,3 @@
+ /.venv
+ dist
+ __pycache__/
.env-docker ADDED
@@ -0,0 +1,8 @@
+ weight_root=/weights
+ weight_uvr5_root=/assets/uvr5_weights
+ index_root=/indices
+ rmvpe_root=/assets/rmvpe
+ hubert_path=/assets/hubert_base.pt
+ save_uvr_path=/assets/uvr5_weights
+ TEMP=/app/TEMP
+ pretrained=/assets/pretrained
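
These variables mirror the volume mounts in `docker-run.sh` and the asset paths baked into the image; at runtime the library picks them up through `python-dotenv`. A minimal sketch of how they become visible to the code (illustrative only, not part of this commit):

```python
import os

from dotenv import load_dotenv

# The Dockerfile copies .env-docker to /app/.env, so inside the container:
load_dotenv("/app/.env")

print(os.environ["weight_root"])   # -> /weights
print(os.environ["rmvpe_root"])    # -> /assets/rmvpe
```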
.gitignore ADDED
@@ -0,0 +1,8 @@
+ /.venv
+ /assets
+ dist
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ .DS_Store
Dockerfile CHANGED
@@ -1,19 +1,47 @@
- FROM python:3.10-slim
-
- WORKDIR /app
-
- # System deps
- RUN apt-get update && apt-get install -y ffmpeg git && rm -rf /var/lib/apt/lists/*
-
- # Python deps
- COPY requirements.txt .
- RUN pip install --no-cache-dir -r requirements.txt
-
- # Copy app + models
- COPY app/ ./app/
- COPY rvc/ ./rvc/
- COPY models/ ./models/
-
- EXPOSE 7860
-
- CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
+ FROM alpine:3.19.1 as assets
+
+ RUN apk add \
+ --update \
+ --no-cache \
+ bash \
+ git \
+ git-lfs\
+ dos2unix
+
+ COPY --chmod=755 ./assets-download.sh /assets-download.sh
+
+ #convert malformed line endings if cloned from Windows
+ RUN dos2unix /assets-download.sh
+
+ RUN /assets-download.sh 88e42f0cb3662ddc0dd263a4814206ce96d53214 assets
+
+ FROM python:3.10.14-bullseye as app
+
+ SHELL [ "/bin/bash", "-c" ]
+
+ RUN apt update && \
+ apt install -y \
+ libsndfile1 \
+ libsndfile1-dev && \
+ apt clean && \
+ rm -rf /var/lib/apt/lists/*
+
+ COPY --from=assets /assets /assets
+
+ WORKDIR /app
+
+ COPY ./pyproject.toml .
+
+ RUN pip install \
+ --no-cache-dir \
+ "poetry==1.7.1" && \
+ poetry config virtualenvs.create false && \
+ poetry install \
+ --no-interaction \
+ --no-root && \
+ poetry cache clear --all .
+
+ COPY ./rvc ./rvc
+ COPY ./.env-docker ./.env
+
+ CMD [ "poetry", "run", "poe", "rvc-api" ]
api-request.sh ADDED
@@ -0,0 +1,59 @@
+ #!/usr/bin/env bash
+ #
+ # Runs request to RVC API on localhost.
+
+ set -e
+
+ host="http://127.0.0.1:8000"
+
+ url="${host}/inference"
+
+ url+="?res_type=json"
+
+ model_path=""
+ index_path=""
+ input_audio=""
+ output_audio_suffix=""
+
+ while [ $# -gt 0 ]; do
+ if [ "$1" == "--model_path" ]; then
+ model_path="$2"
+ elif [ "$1" == "--index_file" ]; then
+ index_path="$2"
+ elif [ "$1" == "--input_audio" ]; then
+ input_audio="$2"
+ else
+ arg_name="${1#--}"
+ arg_value="$2"
+
+ url+="&${arg_name}=${arg_value}"
+ output_audio_suffix+="-${arg_name}_${arg_value}"
+ fi
+
+ shift
+ shift
+ done
+
+ model_path_base="$(basename "${model_path}")"
+ model_path_base_without_ext="${model_path_base%.*}"
+ index_path_base="$(basename "${index_path}")"
+ input_audio_base="$(basename "${input_audio}")"
+ input_audio_dirname="$(dirname "${input_audio}")"
+ output_audio_base_without_ext="${input_audio_base%.*}"
+ output_audio="${input_audio_dirname}/${output_audio_base_without_ext}-${model_path_base_without_ext}${output_audio_suffix}.wav"
+
+ cp "${model_path}" "./assets/weights/${model_path_base}"
+ cp "${input_audio}" "./assets/audios/${input_audio_base}"
+
+ if [ -f "${index_path}" ]; then
+ url+="&index_file=${index_path_base}"
+ cp "${index_path}" "./assets/indices/${index_path_base}"
+ fi
+
+ curl -X "POST" "${url}" \
+ -H "accept: application/json" \
+ -H "Content-Type: multipart/form-data" \
+ -F "modelpath=${model_path_base}" \
+ -F "input_audio=/audios/${input_audio_base}" \
+ | jq -r '.audio' \
+ | base64 -d > "${output_audio}"
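
For context, what `api-request.sh` does can also be reproduced from Python. The sketch below is illustrative only and not part of the commit: it assumes the dockerized server from `docker-run.sh` is reachable on port 8000, reuses the field and query-parameter names from the shell script, and uses placeholder file names that would already have been copied into the mounted assets folders.

```python
import base64

import requests

response = requests.post(
    "http://127.0.0.1:8000/inference",
    params={"res_type": "json", "f0_up_key": 12},   # extra args become query parameters
    data={"modelpath": "my-voice.pth", "input_audio": "/audios/song.wav"},
    headers={"accept": "application/json"},
)
response.raise_for_status()

# The JSON payload carries the converted audio as base64; the shell script
# pipes it through `jq -r '.audio' | base64 -d` to recover the wav bytes.
with open("song-converted.wav", "wb") as f:
    f.write(base64.b64decode(response.json()["audio"]))
```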
assets-download.sh ADDED
@@ -0,0 +1,52 @@
+ #!/usr/bin/env bash
+ #
+ # Downloads required large files for RVC.
+
+ function download() {
+ local path="$1"
+ echo "Downloading ${path}"
+ git lfs pull --include="${path}"
+ }
+
+ set -e
+
+ REPO_FOLDER="VoiceConversionWebUI"
+
+ assets_commit_hash="$1"
+ assets_dir="$2"
+
+ export GIT_CLONE_PROTECTION_ACTIVE=false
+ export GIT_LFS_SKIP_SMUDGE=1
+
+ git clone https://huggingface.co/lj1995/VoiceConversionWebUI "${REPO_FOLDER}"
+
+ pushd "${REPO_FOLDER}"
+
+ git config advice.detachedHead false
+
+ git checkout "${assets_commit_hash}"
+
+ unset GIT_LFS_SKIP_SMUDGE
+ unset GIT_CLONE_PROTECTION_ACTIVE
+
+ download "hubert_base.pt"
+ download "pretrained"
+ download "uvr5_weights"
+ download "rmvpe.pt"
+ download "rmvpe.onnx"
+
+ rm -rf .git
+
+ popd
+
+ mkdir -p "${assets_dir}"
+
+ mv "${REPO_FOLDER}/hubert_base.pt" "${assets_dir}/hubert_base.pt"
+
+ mkdir -p "${assets_dir}/rmvpe"
+
+ mv "${REPO_FOLDER}/rmvpe.pt" "${assets_dir}/rmvpe/rmvpe.pt"
+ mv "${REPO_FOLDER}/rmvpe.onnx" "${assets_dir}/rmvpe/rmvpe.onnx"
+
+ mv "${REPO_FOLDER}/pretrained" "${assets_dir}/pretrained"
+ mv "${REPO_FOLDER}/uvr5_weights" "${assets_dir}/uvr5_weights"
docker-run.sh ADDED
@@ -0,0 +1,16 @@
+ #!/usr/bin/env bash
+ #
+ # Runs RVC API in Docker.
+
+ set -e
+
+ tag="rvc"
+
+ docker build -t "${tag}" .
+
+ docker run -it \
+ -p 8000:8000 \
+ -v "${PWD}/assets/weights:/weights:ro" \
+ -v "${PWD}/assets/indices:/indices:ro" \
+ -v "${PWD}/assets/audios:/audios:ro" \
+ "${tag}"
docs/es/README.es.md ADDED
@@ -0,0 +1,171 @@
1
+ <div align="center">
2
+
3
+ <h1>Retrieval-based-Voice-Conversion</h1>
4
+ Un framework de conversión de voz basado en VITS y fácil de usar.<br><br>
5
+
6
+ [![madewithlove](https://img.shields.io/badge/hecho_con-%E2%9D%A4-red?style=for-the-badge&labelColor=orange
7
+ )](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion)
8
+
9
+ <img src="https://counter.seku.su/cmoe?name=rvc&theme=r34" /><br>
10
+
11
+ [![Licence](https://img.shields.io/github/license/RVC-Project/Retrieval-based-Voice-Conversion?style=for-the-badge)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion/blob/develop/LICENSE)
12
+
13
+ [![Discord](https://img.shields.io/badge/Desarrolladores%20de%20RVC-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk)
14
+
15
+ </div>
16
+
17
+ ------
18
+
19
+
20
+ > [!NOTE]
21
+ > Actualmente en desarrollo... Proporcionado como biblioteca y API en rvc
22
+
23
+ ## Instalación y uso
24
+
25
+ ### Instalación estándar
26
+
27
+ Primero, cree un directorio en su proyecto. La carpeta `assets` contendrá los modelos necesarios para la inferencia y el entrenamiento, y la carpeta `results` contendrá los resultados del entrenamiento.
28
+
29
+ ```sh
30
+ rvc init
31
+ ```
32
+ Esto creará la carpeta `assets` y `.env` en su directorio de trabajo.
33
+
34
+ > [!WARNING]
35
+ > El directorio debe de estar vacío o sin una carpeta de assets.
36
+
37
+ ### Instalación personalizada
38
+
39
+ Si ya has descargado modelos o deseas cambiar estas configuraciones, edita el archivo `.env`.
40
+ Si aún no tienes el archivo `.env`,
41
+
42
+ ```sh
43
+ rvc env create
44
+ ```
45
+ puedes crearlo.
46
+
47
+ Además, para descargar un modelo, puedes utilizar
48
+
49
+ ```sh
50
+ rvc dlmodel
51
+ ```
52
+ o
53
+ ```
54
+ rvc dlmodel {download_dir}
55
+ ```
56
+
57
+ Finalmente, especifique la ubicación del modelo en el archivo env y estará listo.
58
+
59
+
60
+
61
+ ### Uso de la librería
62
+
63
+ #### Inferir un audio
64
+ ```python
65
+ from pathlib import Path
66
+
67
+ from dotenv import load_dotenv
68
+ from scipy.io import wavfile
69
+
70
+ from rvc.modules.vc.modules import VC
71
+
72
+
73
+ def main():
74
+ vc = VC()
75
+ vc.get_vc("{model.pth}")
76
+ tgt_sr, audio_opt, times, _ = vc.vc_inference(
77
+ 1, Path("{InputAudio}")
78
+ )
79
+ wavfile.write("{OutputAudio}", tgt_sr, audio_opt)
80
+
81
+
82
+ if __name__ == "__main__":
83
+ load_dotenv("{envPath}")
84
+ main()
85
+
86
+ ```
87
+
88
+ ### Uso en CLI
89
+
90
+ #### Inferir un audio
91
+
92
+ ```sh
93
+ rvc infer -m {model.pth} -i {input.wav} -o {output.wav}
94
+ ```
95
+
96
+ | opción | flag&nbsp; | tipo | valor por defecto | descripción |
97
+ |---------------|------------|--------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
98
+ | modelPath | -m | Path | *requerido | Ruta del modelo o nombre de archivo (se lee en el directorio establecido en env) |
99
+ | inputPath | -i | Path | *requerido | Ruta o carpeta del audio de entrada |
100
+ | outputPath | -o | Path | *requerido | Ruta o carpeta del audio de salida |
101
+ | sid | -s | int | 0 | ID del Orador/ Cantante |
102
+ | f0_up_key | -fu | int | 0 | Transponer (número entero, número de semitonos, subir una octava: 12, bajar una octava: -12) |
103
+ | f0_method | -fm | str | rmvpe | Algoritmo de extracción de tono (pm, harvest, crepe, rmvpe) |
104
+ | f0_file | -ff | Path \| None | None | Archivo de curva F0 (opcional). Un tono por línea. Reemplaza el F0 predeterminado y la modulación de tono. |
105
+ | index_file | -if | Path \| None | None | Ruta al archivo index de características |
106
+ | index_rate | -if | float | 0.75 | Proporción de funciones de búsqueda (controla la fuerza del acento, demasiado alta tiene artifacting) |
107
+ | filter_radius | -fr | int | 3 | Si >=3: aplique el filtrado de mediana a los resultados del tono. El valor representa el radio del filtro y puede reducir la respiración |
108
+ | resample_sr | -rsr | int | 0 | Vuelva a muestrear el audio de salida en el posprocesamiento hasta la frecuencia de muestreo final. Establecer en 0 para no remuestreo |
109
+ | rms_mix_rate | -rmr | float | 0.25 | Ajuste la escala de la envolvente del volumen. Cuanto más cerca de 0, más imita el volumen de las voces originales. Puede ayudar a enmascarar el ruido y hacer que el volumen suene más natural cuando se establece en un nivel relativamente bajo. Más cerca de 1 habrá un volumen más alto y constante |
110
+ | protect | -p | float | 0.33 | Proteja las consonantes sordas y los sonidos respiratorios para evitar artefactos como el desgarro en la música electrónica. Establezca en 0.5 para desactivarlo. Disminuya el valor para aumentar la protección, pero puede reducir la precisión de la indexación |
111
+
112
+ ### Uso de la API
113
+ Primero, inicia el servidor.
114
+ ```sh
115
+ rvc-api
116
+ ```
117
+ o
118
+ ```sh
119
+ poetry run poe rvc-api
120
+ ```
121
+
122
+ #### Inferir audio
123
+
124
+ ##### Obtener como blob
125
+ ```sh
126
+ curl -X 'POST' \
127
+ 'http://127.0.0.1:8000/inference?res_type=blob' \
128
+ -H 'accept: application/json' \
129
+ -H 'Content-Type: multipart/form-data' \
130
+ -F 'modelpath={model.pth}' \
131
+ -F 'input={input audio path}'
132
+ ```
133
+
134
+ ##### Obtener como json (incluir tiempo)
135
+ ```sh
136
+ curl -X 'POST' \
137
+ 'http://127.0.0.1:8000/inference?res_type=json' \
138
+ -H 'accept: application/json' \
139
+ -H 'Content-Type: multipart/form-data' \
140
+ -F 'modelpath={model.pth}' \
141
+ -F 'input={input audio path}'
142
+ ```
143
+
144
+ ### Uso con Docker
145
+
146
+ Compilar y ejecutar usando el script:
147
+
148
+ ```bash
149
+ ./docker-run.sh
150
+ ```
151
+
152
+ **O** usar manuálmente:
153
+
154
+ 1. Compilar:
155
+
156
+ ```bash
157
+ docker build -t "rvc" .
158
+ ```
159
+
160
+ 2. Ejecutar:
161
+
162
+ ```bash
163
+ docker run -it \
164
+ -p 8000:8000 \
165
+ -v "${PWD}/assets/weights:/weights:ro" \
166
+ -v "${PWD}/assets/indices:/indices:ro" \
167
+ -v "${PWD}/assets/audios:/audios:ro" \
168
+ "rvc"
169
+ ```
170
+
171
+ Recuerda que los pesos (weights), índices y audios de entrada se almacenan en `directorio-actual/assets`
docs/jp/README.ja.md ADDED
@@ -0,0 +1,119 @@
1
+ <div align="center">
2
+
3
+ <h1>Retrieval-based-Voice-Conversion</h1>
4
+ An easy-to-use Voice Conversion framework based on VITS.<br><br>
5
+
6
+ [![madewithlove](https://img.shields.io/badge/made_with-%E2%9D%A4-red?style=for-the-badge&labelColor=orange
7
+ )](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion)
8
+
9
+ <img src="https://counter.seku.su/cmoe?name=rvc&theme=r34" /><br>
10
+
11
+ [![Licence](https://img.shields.io/github/license/RVC-Project/Retrieval-based-Voice-Conversion?style=for-the-badge)](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion/blob/develop/LICENSE)
12
+
13
+ [![Discord](https://img.shields.io/badge/RVC%20Developers-Discord-7289DA?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/HcsmBBGyVk)
14
+
15
+ </div>
16
+
17
+ ------
18
+
19
+
20
+ > [!NOTE]
21
+ > 現在開発中です...rvcのライブラリとAPIを提供する予定です。
22
+
23
+ ## Installation and usage
24
+
25
+ ### Standard Setup
26
+
27
+ 最初にプロジェクトにディレクトリを作成します。`assets`フォルダには推論や学習に必要なモデル、`result`フォルダには学習の結果が保存されます。
28
+
29
+ ```sh
30
+ rvc init
31
+ ```
32
+
33
+ これにより、作業ディレクトリに`assets`フォルダと`.env`が作成されます。
34
+ > [!WARNING]
35
+ > この時、ディレクトリは空もしくは`assets`フォルダおよび`.env`ファイルがない状態にしてください
36
+
37
+ ### Custom Setup
38
+
39
+ 既にモデルをダウンロードしている場合や、これらの構成を変更したい場合、`.env`ファイルを編集してください。
40
+ まだ`.env`ファイルがない場合、
41
+
42
+ ```sh
43
+ rvc env create
44
+ ```
45
+
46
+ にて作成できます。
47
+
48
+ また、モデルをダウンロードするときは
49
+
50
+ ```sh
51
+ rvc dlmodel
52
+ ```
53
+ もしくは
54
+ ```
55
+ rvc dlmodel {download_dir}
56
+ ```
57
+
58
+ にてダウンロードできます。
59
+
60
+ 最後に、envファイルにてモデルの場所などを指定してあげれば、終了です!
61
+
62
+
63
+ ### CLI Usage
64
+
65
+ #### Inference Audio
66
+
67
+ ```sh
68
+ rvc infer -m {model.pth} -i {input.wav} -o {output.wav}
69
+ ```
70
+
71
+ | option | flag&nbsp; | type | default value | description |
72
+ |---------------|------------|--------------|---------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
73
+ | modelPath | -m | Path | *required | Model path or filename (reads in the directory set in env) |
74
+ | inputPath | -i | Path | *required | Input audio path or folder |
75
+ | outputPath | -o | Path | *required | Output audio path or folder |
76
+ | sid | -s | int | 0 | Speaker/Singer ID |
77
+ | f0_up_key | -fu | int | 0 | Transpose (integer, number of semitones, raise by an octave: 12, lower by an octave: -12) |
78
+ | f0_method | -fm | str | rmvpe | pitch extraction algorithm (pm, harvest, crepe, rmvpe) |
79
+ | f0_file | -ff | Path \| None | None | F0 curve file (optional). One pitch per line. Replaces the default F0 and pitch modulation |
80
+ | index_file | -if | Path \| None | None | Path to the feature index file |
81
+ | index_rate | -if | float | 0.75 | Search feature ratio (controls accent strength, too high has artifacting) |
82
+ | filter_radius | -fr | int | 3 | If >=3: apply median filtering to the harvested pitch results. The value represents the filter radius and can reduce breathiness |
83
+ | resample_sr | -rsr | int | 0 | Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling |
84
+ | rms_mix_rate | -rmr | float | 0.25 | Adjust the volume envelope scaling. Closer to 0, the more it mimics the volume of the original vocals. Can help mask noise and make volume sound more natural when set relatively low. Closer to 1 will be more of a consistently loud volume |
85
+ | protect | -p | float | 0.33 | Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy |
86
+
87
+
88
+ ### API Usage
89
+ 最初に、サーバーを立ち上げます。
90
+ ```sh
91
+ rvc-api
92
+ ```
93
+ または
94
+ ```sh
95
+ poetry run poe rvc-api
96
+ ```
97
+ にて実行されます。
98
+
99
+ #### Inference Audio
100
+
101
+ ##### blobでレスポンスを受け取る
102
+ ```sh
103
+ curl -X 'POST' \
104
+ 'http://127.0.0.1:8000/inference?res_type=blob' \
105
+ -H 'accept: application/json' \
106
+ -H 'Content-Type: multipart/form-data' \
107
+ -F 'modelpath={model.pth}' \
108
+ -F 'input={input audio path}'
109
+ ```
110
+
111
+ ##### jsonでレスポンス!(include time)
112
+ ```sh
113
+ curl -X 'POST' \
114
+ 'http://127.0.0.1:8000/inference?res_type=json' \
115
+ -H 'accept: application/json' \
116
+ -H 'Content-Type: multipart/form-data' \
117
+ -F 'modelpath={model.pth}' \
118
+ -F 'input={input audio path}'
119
+ ```
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,44 @@
+ [tool.poetry]
+ name = "rvc"
+ version = "0.3.5"
+ description = "An easy-to-use Voice Conversion framework based on VITS."
+ authors = ["Ftps <[email protected]>"]
+ readme = "README.md"
+
+ [tool.poetry.urls]
+ github = "https://github.com/RVC-Project/Retrieval-based-Voice-Conversion"
+
+ [tool.poetry.dependencies]
+ python = "^3.10"
+ torch = "^2.1.0"
+ #fairseq = "^0.12.2"
+ fairseq = {git = "https://github.com/Tps-F/fairseq.git", branch="main"}
+ soundfile = "^0.12.1"
+ librosa = "^0.10.1"
+ praat-parselmouth = "^0.4.3"
+ pyworld = "^0.3.4"
+ torchcrepe = "^0.0.22"
+ av = "^11.0.0"
+ faiss-cpu = "^1.7.4"
+ python-dotenv = "^1.0.0"
+ pydub = "^0.25.1"
+ click = "^8.1.7"
+ tensorboardx = "^2.6.2.2"
+ poethepoet = "^0.24.4"
+ uvicorn = "^0.26.0"
+ fastapi = "^0.109.0"
+ python-multipart = "^0.0.6"
+ numba = "0.59.0rc1"
+
+ [tool.poetry.extras]
+ api = ["uvicorn", "fastapi"]
+
+ [tool.poetry.scripts]
+ rvc = "rvc.wrapper.cli.cli:main"
+
+ [tool.poe.tasks]
+ rvc-api = "uvicorn rvc.wrapper.api.api:app --host 0.0.0.0 --port 8000 --reload"
+
+ [build-system]
+ requires = ["poetry-core"]
+ build-backend = "poetry.core.masonry.api"
rvc/__init__.py ADDED
File without changes
rvc/configs/__init__.py ADDED
@@ -0,0 +1 @@
+ from rvc.configs.config import Config
rvc/configs/config.py ADDED
@@ -0,0 +1,197 @@
1
+ import argparse
2
+ import json
3
+ import logging
4
+ import os
5
+ import sys
6
+ from multiprocessing import cpu_count
7
+
8
+ import torch
9
+
10
+ try:
11
+ import intel_extension_for_pytorch as ipex
12
+
13
+ if torch.xpu.is_available():
14
+ from rvc.lib.ipex import ipex_init
15
+
16
+ ipex_init()
17
+ except (ImportError, Exception):
18
+ pass
19
+
20
+ logger: logging.Logger = logging.getLogger(__name__)
21
+
22
+
23
+ version_config_list: list = [
24
+ os.path.join(root, file)
25
+ for root, dirs, files in os.walk(os.path.dirname(os.path.abspath(__file__)))
26
+ for file in files
27
+ if file.endswith(".json")
28
+ ]
29
+
30
+
31
+ class Config:
32
+ def __new__(cls):
33
+ if not hasattr(cls, "_instance"):
34
+ cls._instance = super().__new__(cls)
35
+ return cls._instance
36
+
37
+ def __init__(self):
38
+ self.device: str = "cuda:0"
39
+ self.is_half: bool = True
40
+ self.use_jit: bool = False
41
+ self.n_cpu: int = cpu_count()
42
+ self.gpu_name: str | None = None
43
+ self.json_config = self.load_config_json()
44
+ self.gpu_mem: int | None = None
45
+ self.instead: str | None = None
46
+ (
47
+ self.python_cmd,
48
+ self.listen_port,
49
+ self.noparallel,
50
+ self.noautoopen,
51
+ self.dml,
52
+ ) = self.arg_parse()
53
+ self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()
54
+
55
+ @staticmethod
56
+ def load_config_json() -> dict:
57
+ return {
58
+ config_file: json.load(open(config_file, "r"))
59
+ for config_file in version_config_list
60
+ }
61
+
62
+ @staticmethod
63
+ def arg_parse() -> tuple:
64
+ parser: argparse.ArgumentParser = argparse.ArgumentParser()
65
+ parser.add_argument("--port", type=int, default=7865, help="Listen port")
66
+ parser.add_argument(
67
+ "--pycmd",
68
+ type=str,
69
+ default=sys.executable or "python",
70
+ help="Python command",
71
+ )
72
+ parser.add_argument(
73
+ "--noparallel", action="store_true", help="Disable parallel processing"
74
+ )
75
+ parser.add_argument(
76
+ "--noautoopen",
77
+ action="store_true",
78
+ help="Do not open in browser automatically",
79
+ )
80
+ parser.add_argument(
81
+ "--dml",
82
+ action="store_true",
83
+ help="torch_dml",
84
+ )
85
+ cmd_opts: argparse.Namespace
86
+ cmd_opts, _ = parser.parse_known_args()
87
+
88
+ cmd_opts.port = cmd_opts.port if 0 <= cmd_opts.port <= 65535 else 7865
89
+
90
+ return (
91
+ cmd_opts.pycmd,
92
+ cmd_opts.port,
93
+ cmd_opts.noparallel,
94
+ cmd_opts.noautoopen,
95
+ cmd_opts.dml,
96
+ )
97
+
98
+ @staticmethod
99
+ def has_mps() -> bool:
100
+ return torch.backends.mps.is_available() and not torch.zeros(1).to(
101
+ torch.device("mps")
102
+ )
103
+
104
+ @staticmethod
105
+ def has_xpu() -> bool:
106
+ return hasattr(torch, "xpu") and torch.xpu.is_available()
107
+
108
+ def params_config(self) -> tuple:
109
+ if self.gpu_mem is not None and self.gpu_mem <= 4:
110
+ x_pad = 1
111
+ x_query = 5
112
+ x_center = 30
113
+ x_max = 32
114
+ elif self.is_half:
115
+ # 6G GPU_RAM conf
116
+ x_pad = 3
117
+ x_query = 10
118
+ x_center = 60
119
+ x_max = 65
120
+ else:
121
+ # 5G GPU_RAM conf
122
+ x_pad = 1
123
+ x_query = 6
124
+ x_center = 38
125
+ x_max = 41
126
+ return x_pad, x_query, x_center, x_max
127
+
128
+ def use_cuda(self) -> None:
129
+ if self.has_xpu():
130
+ self.device = self.instead = "xpu:0"
131
+ self.is_half = True
132
+ i_device = int(self.device.split(":")[-1])
133
+ self.gpu_name = torch.cuda.get_device_name(i_device)
134
+ if (
135
+ ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
136
+ or "P40" in self.gpu_name.upper()
137
+ or "P10" in self.gpu_name.upper()
138
+ or "1060" in self.gpu_name
139
+ or "1070" in self.gpu_name
140
+ or "1080" in self.gpu_name
141
+ ):
142
+ logger.info(f"Found GPU {self.gpu_name}, force to fp32")
143
+ self.is_half = False
144
+ self.use_fp32_config()
145
+ else:
146
+ logger.info(f"Found GPU {self.gpu_name}")
147
+ self.gpu_mem = int(
148
+ torch.cuda.get_device_properties(i_device).total_memory / 1024 / 1024 / 1024
149
+ + 0.4
150
+ )
151
+
152
+ def use_mps(self) -> None:
153
+ self.device = self.instead = "mps"
154
+ self.is_half = False
155
+ self.use_fp32_config()
156
+ self.params_config()
157
+
158
+ def use_dml(self) -> None:
159
+ import torch_directml
160
+
161
+ self.device = torch_directml.device(torch_directml.default_device())
162
+ self.is_half = False
163
+ self.params_config()
164
+
165
+ def use_cpu(self) -> None:
166
+ self.device = self.instead = "cpu"
167
+ self.is_half = False
168
+ self.use_fp32_config()
169
+ self.params_config()
170
+
171
+ def use_fp32_config(self) -> None:
172
+ for config_file, data in self.json_config.items():
173
+ try:
174
+ data["train"]["fp16_run"] = False
175
+ with open(config_file, "w") as json_file:
176
+ json.dump(data, json_file, indent=4)
177
+ except Exception as e:
178
+ logger.info(f"Error updating {config_file}: {str(e)}")
179
+ logger.info("overwrite configs.json")
180
+
181
+ def device_config(self) -> tuple:
182
+ if torch.cuda.is_available():
183
+ self.use_cuda()
184
+ elif self.has_mps():
185
+ logger.info("No supported Nvidia GPU found")
186
+ self.use_mps()
187
+ elif self.dml:
188
+ self.use_dml()
189
+ else:
190
+ logger.info("No supported Nvidia GPU found")
191
+ self.device = self.instead = "cpu"
192
+ self.is_half = False
193
+ self.use_fp32_config()
194
+
195
+ logger.info(f"Use {self.dml or self.instead} instead")
196
+ logger.info(f"is_half:{self.is_half}, device:{self.device}")
197
+ return self.params_config()
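
`Config` is written as a singleton (`__new__` caches `_instance`) and resolves the runtime device and precision when it is constructed. A small usage sketch, not part of the commit, assuming the `rvc` package is importable:

```python
from rvc.configs import Config

config = Config()                  # later calls return the same instance
print(config.device, config.is_half)

# x_pad / x_query / x_center / x_max are the chunking parameters chosen by
# params_config() according to the detected GPU memory and precision.
print(config.x_pad, config.x_query, config.x_center, config.x_max)
```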
rvc/configs/v1/32k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,4,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
rvc/configs/v1/40k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 40000,
21
+ "filter_length": 2048,
22
+ "hop_length": 400,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 125,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
rvc/configs/v1/48k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 11520,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,6,2,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [16,16,4,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
rvc/configs/v2/32k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 12800,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 32000,
21
+ "filter_length": 1024,
22
+ "hop_length": 320,
23
+ "win_length": 1024,
24
+ "n_mel_channels": 80,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [10,8,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [20,16,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
rvc/configs/v2/48k.json ADDED
@@ -0,0 +1,46 @@
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "seed": 1234,
5
+ "epochs": 20000,
6
+ "learning_rate": 1e-4,
7
+ "betas": [0.8, 0.99],
8
+ "eps": 1e-9,
9
+ "batch_size": 4,
10
+ "fp16_run": true,
11
+ "lr_decay": 0.999875,
12
+ "segment_size": 17280,
13
+ "init_lr_ratio": 1,
14
+ "warmup_epochs": 0,
15
+ "c_mel": 45,
16
+ "c_kl": 1.0
17
+ },
18
+ "data": {
19
+ "max_wav_value": 32768.0,
20
+ "sampling_rate": 48000,
21
+ "filter_length": 2048,
22
+ "hop_length": 480,
23
+ "win_length": 2048,
24
+ "n_mel_channels": 128,
25
+ "mel_fmin": 0.0,
26
+ "mel_fmax": null
27
+ },
28
+ "model": {
29
+ "inter_channels": 192,
30
+ "hidden_channels": 192,
31
+ "filter_channels": 768,
32
+ "n_heads": 2,
33
+ "n_layers": 6,
34
+ "kernel_size": 3,
35
+ "p_dropout": 0,
36
+ "resblock": "1",
37
+ "resblock_kernel_sizes": [3,7,11],
38
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
39
+ "upsample_rates": [12,10,2,2],
40
+ "upsample_initial_channel": 512,
41
+ "upsample_kernel_sizes": [24,20,4,4],
42
+ "use_spectral_norm": false,
43
+ "gin_channels": 256,
44
+ "spk_embed_dim": 109
45
+ }
46
+ }
rvc/lib/audio.py ADDED
@@ -0,0 +1,70 @@
1
+ import os
2
+ import traceback
3
+ from io import BytesIO
4
+
5
+ import av
6
+ import librosa
7
+ import numpy as np
8
+
9
+
10
+ def wav2(i, o, format):
11
+ inp = av.open(i, "rb")
12
+ if format == "m4a":
13
+ format = "mp4"
14
+ out = av.open(o, "wb", format=format)
15
+ if format == "ogg":
16
+ format = "libvorbis"
17
+ if format == "mp4":
18
+ format = "aac"
19
+
20
+ ostream = out.add_stream(format)
21
+
22
+ for frame in inp.decode(audio=0):
23
+ for p in ostream.encode(frame):
24
+ out.mux(p)
25
+
26
+ for p in ostream.encode(None):
27
+ out.mux(p)
28
+
29
+ out.close()
30
+ inp.close()
31
+
32
+
33
+ def audio2(i, o, format, sr):
34
+ inp = av.open(i, "rb")
35
+ out = av.open(o, "wb", format=format)
36
+ if format == "ogg":
37
+ format = "libvorbis"
38
+ if format == "f32le":
39
+ format = "pcm_f32le"
40
+
41
+ ostream = out.add_stream(format, channels=1)
42
+ ostream.sample_rate = sr
43
+
44
+ for frame in inp.decode(audio=0):
45
+ for p in ostream.encode(frame):
46
+ out.mux(p)
47
+
48
+ out.close()
49
+ inp.close()
50
+
51
+
52
+ def load_audio(file, sr):
53
+ if not os.path.exists(file):
54
+ raise RuntimeError(
55
+ "You input a wrong audio path that does not exists, please fix it!"
56
+ )
57
+ try:
58
+ with open(file, "rb") as f:
59
+ with BytesIO() as out:
60
+ audio2(f, out, "f32le", sr)
61
+ return np.frombuffer(out.getvalue(), np.float32).flatten()
62
+
63
+ except AttributeError:
64
+ audio = file[1] / 32768.0
65
+ if len(audio.shape) == 2:
66
+ audio = np.mean(audio, -1)
67
+ return librosa.resample(audio, orig_sr=file[0], target_sr=16000)
68
+
69
+ except Exception:
70
+ raise RuntimeError(traceback.format_exc())
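
`load_audio` decodes whatever PyAV can open into a mono float32 stream at the requested sample rate (the `except AttributeError` branch instead accepts an already-loaded `(sr, ndarray)` tuple). A usage sketch with a placeholder file name:

```python
from rvc.lib.audio import load_audio

# Returns a flat numpy.float32 array decoded at 16 kHz.
audio = load_audio("some_recording.wav", 16000)
print(audio.dtype, audio.shape)
```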
rvc/lib/infer_pack/attentions.py ADDED
@@ -0,0 +1,459 @@
1
+ import copy
2
+ import math
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch import nn
8
+ from torch.nn import functional as F
9
+
10
+ from rvc.lib.infer_pack import commons, modules
11
+ from rvc.lib.infer_pack.modules import LayerNorm
12
+
13
+
14
+ class Encoder(nn.Module):
15
+ def __init__(
16
+ self,
17
+ hidden_channels,
18
+ filter_channels,
19
+ n_heads,
20
+ n_layers,
21
+ kernel_size=1,
22
+ p_dropout=0.0,
23
+ window_size=10,
24
+ **kwargs
25
+ ):
26
+ super(Encoder, self).__init__()
27
+ self.hidden_channels = hidden_channels
28
+ self.filter_channels = filter_channels
29
+ self.n_heads = n_heads
30
+ self.n_layers = int(n_layers)
31
+ self.kernel_size = kernel_size
32
+ self.p_dropout = p_dropout
33
+ self.window_size = window_size
34
+
35
+ self.drop = nn.Dropout(p_dropout)
36
+ self.attn_layers = nn.ModuleList()
37
+ self.norm_layers_1 = nn.ModuleList()
38
+ self.ffn_layers = nn.ModuleList()
39
+ self.norm_layers_2 = nn.ModuleList()
40
+ for i in range(self.n_layers):
41
+ self.attn_layers.append(
42
+ MultiHeadAttention(
43
+ hidden_channels,
44
+ hidden_channels,
45
+ n_heads,
46
+ p_dropout=p_dropout,
47
+ window_size=window_size,
48
+ )
49
+ )
50
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
51
+ self.ffn_layers.append(
52
+ FFN(
53
+ hidden_channels,
54
+ hidden_channels,
55
+ filter_channels,
56
+ kernel_size,
57
+ p_dropout=p_dropout,
58
+ )
59
+ )
60
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
61
+
62
+ def forward(self, x, x_mask):
63
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
64
+ x = x * x_mask
65
+ zippep = zip(
66
+ self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2
67
+ )
68
+ for attn_layers, norm_layers_1, ffn_layers, norm_layers_2 in zippep:
69
+ y = attn_layers(x, x, attn_mask)
70
+ y = self.drop(y)
71
+ x = norm_layers_1(x + y)
72
+
73
+ y = ffn_layers(x, x_mask)
74
+ y = self.drop(y)
75
+ x = norm_layers_2(x + y)
76
+ x = x * x_mask
77
+ return x
78
+
79
+
80
+ class Decoder(nn.Module):
81
+ def __init__(
82
+ self,
83
+ hidden_channels,
84
+ filter_channels,
85
+ n_heads,
86
+ n_layers,
87
+ kernel_size=1,
88
+ p_dropout=0.0,
89
+ proximal_bias=False,
90
+ proximal_init=True,
91
+ **kwargs
92
+ ):
93
+ super(Decoder, self).__init__()
94
+ self.hidden_channels = hidden_channels
95
+ self.filter_channels = filter_channels
96
+ self.n_heads = n_heads
97
+ self.n_layers = n_layers
98
+ self.kernel_size = kernel_size
99
+ self.p_dropout = p_dropout
100
+ self.proximal_bias = proximal_bias
101
+ self.proximal_init = proximal_init
102
+
103
+ self.drop = nn.Dropout(p_dropout)
104
+ self.self_attn_layers = nn.ModuleList()
105
+ self.norm_layers_0 = nn.ModuleList()
106
+ self.encdec_attn_layers = nn.ModuleList()
107
+ self.norm_layers_1 = nn.ModuleList()
108
+ self.ffn_layers = nn.ModuleList()
109
+ self.norm_layers_2 = nn.ModuleList()
110
+ for i in range(self.n_layers):
111
+ self.self_attn_layers.append(
112
+ MultiHeadAttention(
113
+ hidden_channels,
114
+ hidden_channels,
115
+ n_heads,
116
+ p_dropout=p_dropout,
117
+ proximal_bias=proximal_bias,
118
+ proximal_init=proximal_init,
119
+ )
120
+ )
121
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
122
+ self.encdec_attn_layers.append(
123
+ MultiHeadAttention(
124
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
125
+ )
126
+ )
127
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
128
+ self.ffn_layers.append(
129
+ FFN(
130
+ hidden_channels,
131
+ hidden_channels,
132
+ filter_channels,
133
+ kernel_size,
134
+ p_dropout=p_dropout,
135
+ causal=True,
136
+ )
137
+ )
138
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
139
+
140
+ def forward(self, x, x_mask, h, h_mask):
141
+ """
142
+ x: decoder input
143
+ h: encoder output
144
+ """
145
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
146
+ device=x.device, dtype=x.dtype
147
+ )
148
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
149
+ x = x * x_mask
150
+ for i in range(self.n_layers):
151
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
152
+ y = self.drop(y)
153
+ x = self.norm_layers_0[i](x + y)
154
+
155
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
156
+ y = self.drop(y)
157
+ x = self.norm_layers_1[i](x + y)
158
+
159
+ y = self.ffn_layers[i](x, x_mask)
160
+ y = self.drop(y)
161
+ x = self.norm_layers_2[i](x + y)
162
+ x = x * x_mask
163
+ return x
164
+
165
+
166
+ class MultiHeadAttention(nn.Module):
167
+ def __init__(
168
+ self,
169
+ channels,
170
+ out_channels,
171
+ n_heads,
172
+ p_dropout=0.0,
173
+ window_size=None,
174
+ heads_share=True,
175
+ block_length=None,
176
+ proximal_bias=False,
177
+ proximal_init=False,
178
+ ):
179
+ super(MultiHeadAttention, self).__init__()
180
+ assert channels % n_heads == 0
181
+
182
+ self.channels = channels
183
+ self.out_channels = out_channels
184
+ self.n_heads = n_heads
185
+ self.p_dropout = p_dropout
186
+ self.window_size = window_size
187
+ self.heads_share = heads_share
188
+ self.block_length = block_length
189
+ self.proximal_bias = proximal_bias
190
+ self.proximal_init = proximal_init
191
+ self.attn = None
192
+
193
+ self.k_channels = channels // n_heads
194
+ self.conv_q = nn.Conv1d(channels, channels, 1)
195
+ self.conv_k = nn.Conv1d(channels, channels, 1)
196
+ self.conv_v = nn.Conv1d(channels, channels, 1)
197
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
198
+ self.drop = nn.Dropout(p_dropout)
199
+
200
+ if window_size is not None:
201
+ n_heads_rel = 1 if heads_share else n_heads
202
+ rel_stddev = self.k_channels**-0.5
203
+ self.emb_rel_k = nn.Parameter(
204
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
205
+ * rel_stddev
206
+ )
207
+ self.emb_rel_v = nn.Parameter(
208
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
209
+ * rel_stddev
210
+ )
211
+
212
+ nn.init.xavier_uniform_(self.conv_q.weight)
213
+ nn.init.xavier_uniform_(self.conv_k.weight)
214
+ nn.init.xavier_uniform_(self.conv_v.weight)
215
+ if proximal_init:
216
+ with torch.no_grad():
217
+ self.conv_k.weight.copy_(self.conv_q.weight)
218
+ self.conv_k.bias.copy_(self.conv_q.bias)
219
+
220
+ def forward(
221
+ self, x: torch.Tensor, c: torch.Tensor, attn_mask: Optional[torch.Tensor] = None
222
+ ):
223
+ q = self.conv_q(x)
224
+ k = self.conv_k(c)
225
+ v = self.conv_v(c)
226
+
227
+ x, _ = self.attention(q, k, v, mask=attn_mask)
228
+
229
+ x = self.conv_o(x)
230
+ return x
231
+
232
+ def attention(
233
+ self,
234
+ query: torch.Tensor,
235
+ key: torch.Tensor,
236
+ value: torch.Tensor,
237
+ mask: Optional[torch.Tensor] = None,
238
+ ):
239
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
240
+ b, d, t_s = key.size()
241
+ t_t = query.size(2)
242
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
243
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
244
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
245
+
246
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
247
+ if self.window_size is not None:
248
+ assert (
249
+ t_s == t_t
250
+ ), "Relative attention is only available for self-attention."
251
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
252
+ rel_logits = self._matmul_with_relative_keys(
253
+ query / math.sqrt(self.k_channels), key_relative_embeddings
254
+ )
255
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
256
+ scores = scores + scores_local
257
+ if self.proximal_bias:
258
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
259
+ scores = scores + self._attention_bias_proximal(t_s).to(
260
+ device=scores.device, dtype=scores.dtype
261
+ )
262
+ if mask is not None:
263
+ scores = scores.masked_fill(mask == 0, -1e4)
264
+ if self.block_length is not None:
265
+ assert (
266
+ t_s == t_t
267
+ ), "Local attention is only available for self-attention."
268
+ block_mask = (
269
+ torch.ones_like(scores)
270
+ .triu(-self.block_length)
271
+ .tril(self.block_length)
272
+ )
273
+ scores = scores.masked_fill(block_mask == 0, -1e4)
274
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
275
+ p_attn = self.drop(p_attn)
276
+ output = torch.matmul(p_attn, value)
277
+ if self.window_size is not None:
278
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
279
+ value_relative_embeddings = self._get_relative_embeddings(
280
+ self.emb_rel_v, t_s
281
+ )
282
+ output = output + self._matmul_with_relative_values(
283
+ relative_weights, value_relative_embeddings
284
+ )
285
+ output = (
286
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
287
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
288
+ return output, p_attn
289
+
290
+ def _matmul_with_relative_values(self, x, y):
291
+ """
292
+ x: [b, h, l, m]
293
+ y: [h or 1, m, d]
294
+ ret: [b, h, l, d]
295
+ """
296
+ ret = torch.matmul(x, y.unsqueeze(0))
297
+ return ret
298
+
299
+ def _matmul_with_relative_keys(self, x, y):
300
+ """
301
+ x: [b, h, l, d]
302
+ y: [h or 1, m, d]
303
+ ret: [b, h, l, m]
304
+ """
305
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
306
+ return ret
307
+
308
+ def _get_relative_embeddings(self, relative_embeddings, length: int):
309
+ max_relative_position = 2 * self.window_size + 1
310
+ # Pad first before slice to avoid using cond ops.
311
+ pad_length: int = max(length - (self.window_size + 1), 0)
312
+ slice_start_position = max((self.window_size + 1) - length, 0)
313
+ slice_end_position = slice_start_position + 2 * length - 1
314
+ if pad_length > 0:
315
+ padded_relative_embeddings = F.pad(
316
+ relative_embeddings,
317
+ # commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
318
+ [0, 0, pad_length, pad_length, 0, 0],
319
+ )
320
+ else:
321
+ padded_relative_embeddings = relative_embeddings
322
+ used_relative_embeddings = padded_relative_embeddings[
323
+ :, slice_start_position:slice_end_position
324
+ ]
325
+ return used_relative_embeddings
326
+
327
+ def _relative_position_to_absolute_position(self, x):
328
+ """
329
+ x: [b, h, l, 2*l-1]
330
+ ret: [b, h, l, l]
331
+ """
332
+ batch, heads, length, _ = x.size()
333
+ # Concat columns of pad to shift from relative to absolute indexing.
334
+ x = F.pad(
335
+ x,
336
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])
337
+ [0, 1, 0, 0, 0, 0, 0, 0],
338
+ )
339
+
340
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
341
+ x_flat = x.view([batch, heads, length * 2 * length])
342
+ x_flat = F.pad(
343
+ x_flat,
344
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, int(length) - 1]])
345
+ [0, int(length) - 1, 0, 0, 0, 0],
346
+ )
347
+
348
+ # Reshape and slice out the padded elements.
349
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
350
+ :, :, :length, length - 1 :
351
+ ]
352
+ return x_final
353
+
354
+ def _absolute_position_to_relative_position(self, x):
355
+ """
356
+ x: [b, h, l, l]
357
+ ret: [b, h, l, 2*l-1]
358
+ """
359
+ batch, heads, length, _ = x.size()
360
+ # padd along column
361
+ x = F.pad(
362
+ x,
363
+ # commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, int(length) - 1]])
364
+ [0, int(length) - 1, 0, 0, 0, 0, 0, 0],
365
+ )
366
+ x_flat = x.view([batch, heads, int(length**2) + int(length * (length - 1))])
367
+ # add 0's in the beginning that will skew the elements after reshape
368
+ x_flat = F.pad(
369
+ x_flat,
370
+ # commons.convert_pad_shape([[0, 0], [0, 0], [int(length), 0]])
371
+ [length, 0, 0, 0, 0, 0],
372
+ )
373
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
374
+ return x_final
375
+
376
+ def _attention_bias_proximal(self, length: int):
377
+ """Bias for self-attention to encourage attention to close positions.
378
+ Args:
379
+ length: an integer scalar.
380
+ Returns:
381
+ a Tensor with shape [1, 1, length, length]
382
+ """
383
+ r = torch.arange(length, dtype=torch.float32)
384
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
385
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
386
+
387
+
388
+ class FFN(nn.Module):
389
+ def __init__(
390
+ self,
391
+ in_channels,
392
+ out_channels,
393
+ filter_channels,
394
+ kernel_size,
395
+ p_dropout=0.0,
396
+ activation: str = None,
397
+ causal=False,
398
+ ):
399
+ super(FFN, self).__init__()
400
+ self.in_channels = in_channels
401
+ self.out_channels = out_channels
402
+ self.filter_channels = filter_channels
403
+ self.kernel_size = kernel_size
404
+ self.p_dropout = p_dropout
405
+ self.activation = activation
406
+ self.causal = causal
407
+ self.is_activation = True if activation == "gelu" else False
408
+ # if causal:
409
+ # self.padding = self._causal_padding
410
+ # else:
411
+ # self.padding = self._same_padding
412
+
413
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
414
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
415
+ self.drop = nn.Dropout(p_dropout)
416
+
417
+ def padding(self, x: torch.Tensor, x_mask: torch.Tensor) -> torch.Tensor:
418
+ if self.causal:
419
+ padding = self._causal_padding(x * x_mask)
420
+ else:
421
+ padding = self._same_padding(x * x_mask)
422
+ return padding
423
+
424
+ def forward(self, x: torch.Tensor, x_mask: torch.Tensor):
425
+ x = self.conv_1(self.padding(x, x_mask))
426
+ if self.is_activation:
427
+ x = x * torch.sigmoid(1.702 * x)
428
+ else:
429
+ x = torch.relu(x)
430
+ x = self.drop(x)
431
+
432
+ x = self.conv_2(self.padding(x, x_mask))
433
+ return x * x_mask
434
+
435
+ def _causal_padding(self, x):
436
+ if self.kernel_size == 1:
437
+ return x
438
+ pad_l: int = self.kernel_size - 1
439
+ pad_r: int = 0
440
+ # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
441
+ x = F.pad(
442
+ x,
443
+ # commons.convert_pad_shape(padding)
444
+ [pad_l, pad_r, 0, 0, 0, 0],
445
+ )
446
+ return x
447
+
448
+ def _same_padding(self, x):
449
+ if self.kernel_size == 1:
450
+ return x
451
+ pad_l: int = (self.kernel_size - 1) // 2
452
+ pad_r: int = self.kernel_size // 2
453
+ # padding = [[0, 0], [0, 0], [pad_l, pad_r]]
454
+ x = F.pad(
455
+ x,
456
+ # commons.convert_pad_shape(padding)
457
+ [pad_l, pad_r, 0, 0, 0, 0],
458
+ )
459
+ return x
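
`Encoder` stacks windowed relative-position self-attention and convolutional FFN blocks over `[batch, channels, frames]` tensors. The sketch below is illustrative only and not part of the commit; the hyperparameters are taken from the bundled v1/v2 JSON configs (hidden 192, filter 768, 2 heads, 6 layers):

```python
import torch

from rvc.lib.infer_pack.attentions import Encoder

enc = Encoder(hidden_channels=192, filter_channels=768, n_heads=2,
              n_layers=6, kernel_size=3, p_dropout=0.0)

x = torch.randn(1, 192, 100)     # [batch, hidden_channels, frames]
x_mask = torch.ones(1, 1, 100)   # 1 = valid frame, 0 = padding
y = enc(x, x_mask)               # output has the same shape as x
```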
rvc/lib/infer_pack/commons.py ADDED
@@ -0,0 +1,172 @@
1
+ import math
2
+ from typing import List, Optional
3
+
4
+ import numpy as np
5
+ import torch
6
+ from torch import nn
7
+ from torch.nn import functional as F
8
+
9
+
10
+ def init_weights(m, mean=0.0, std=0.01):
11
+ classname = m.__class__.__name__
12
+ if classname.find("Conv") != -1:
13
+ m.weight.data.normal_(mean, std)
14
+
15
+
16
+ def get_padding(kernel_size, dilation=1):
17
+ return int((kernel_size * dilation - dilation) / 2)
18
+
19
+
20
+ # def convert_pad_shape(pad_shape):
21
+ # l = pad_shape[::-1]
22
+ # pad_shape = [item for sublist in l for item in sublist]
23
+ # return pad_shape
24
+
25
+
26
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
27
+ """KL(P||Q)"""
28
+ kl = (logs_q - logs_p) - 0.5
29
+ kl += (
30
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
31
+ )
32
+ return kl
33
+
34
+
35
+ def rand_gumbel(shape):
36
+ """Sample from the Gumbel distribution, protect from overflows."""
37
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
38
+ return -torch.log(-torch.log(uniform_samples))
39
+
40
+
41
+ def rand_gumbel_like(x):
42
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
43
+ return g
44
+
45
+
46
+ def slice_segments(x, ids_str, segment_size=4):
47
+ ret = torch.zeros_like(x[:, :, :segment_size])
48
+ for i in range(x.size(0)):
49
+ idx_str = ids_str[i]
50
+ idx_end = idx_str + segment_size
51
+ ret[i] = x[i, :, idx_str:idx_end]
52
+ return ret
53
+
54
+
55
+ def slice_segments2(x, ids_str, segment_size=4):
56
+ ret = torch.zeros_like(x[:, :segment_size])
57
+ for i in range(x.size(0)):
58
+ idx_str = ids_str[i]
59
+ idx_end = idx_str + segment_size
60
+ ret[i] = x[i, idx_str:idx_end]
61
+ return ret
62
+
63
+
64
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
65
+ b, d, t = x.size()
66
+ if x_lengths is None:
67
+ x_lengths = t
68
+ ids_str_max = x_lengths - segment_size + 1
69
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
70
+ ret = slice_segments(x, ids_str, segment_size)
71
+ return ret, ids_str
72
+
73
+
74
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
75
+ position = torch.arange(length, dtype=torch.float)
76
+ num_timescales = channels // 2
77
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
78
+ num_timescales - 1
79
+ )
80
+ inv_timescales = min_timescale * torch.exp(
81
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
82
+ )
83
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
84
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
85
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
86
+ signal = signal.view(1, channels, length)
87
+ return signal
88
+
89
+
90
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
91
+ b, channels, length = x.size()
92
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
93
+ return x + signal.to(dtype=x.dtype, device=x.device)
94
+
95
+
96
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
97
+ b, channels, length = x.size()
98
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
99
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
100
+
101
+
102
+ def subsequent_mask(length):
103
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
104
+ return mask
105
+
106
+
107
+ @torch.jit.script
108
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
109
+ n_channels_int = n_channels[0]
110
+ in_act = input_a + input_b
111
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
112
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
113
+ acts = t_act * s_act
114
+ return acts
115
+
116
+
117
+ # def convert_pad_shape(pad_shape):
118
+ # l = pad_shape[::-1]
119
+ # pad_shape = [item for sublist in l for item in sublist]
120
+ # return pad_shape
121
+
122
+
123
+ def convert_pad_shape(pad_shape: List[List[int]]) -> List[int]:
124
+ return torch.tensor(pad_shape).flip(0).reshape(-1).int().tolist()
125
+
126
+
127
+ def shift_1d(x):
128
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
129
+ return x
130
+
131
+
132
+ def sequence_mask(length: torch.Tensor, max_length: Optional[int] = None):
133
+ if max_length is None:
134
+ max_length = length.max()
135
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
136
+ return x.unsqueeze(0) < length.unsqueeze(1)
137
+
138
+
139
+ def generate_path(duration, mask):
140
+ """
141
+ duration: [b, 1, t_x]
142
+ mask: [b, 1, t_y, t_x]
143
+ """
144
+ device = duration.device
145
+
146
+ b, _, t_y, t_x = mask.shape
147
+ cum_duration = torch.cumsum(duration, -1)
148
+
149
+ cum_duration_flat = cum_duration.view(b * t_x)
150
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
151
+ path = path.view(b, t_x, t_y)
152
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
153
+ path = path.unsqueeze(1).transpose(2, 3) * mask
154
+ return path
155
+
156
+
157
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
158
+ if isinstance(parameters, torch.Tensor):
159
+ parameters = [parameters]
160
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
161
+ norm_type = float(norm_type)
162
+ if clip_value is not None:
163
+ clip_value = float(clip_value)
164
+
165
+ total_norm = 0
166
+ for p in parameters:
167
+ param_norm = p.grad.data.norm(norm_type)
168
+ total_norm += param_norm.item() ** norm_type
169
+ if clip_value is not None:
170
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
171
+ total_norm = total_norm ** (1.0 / norm_type)
172
+ return total_norm
rvc/lib/infer_pack/models.py ADDED
@@ -0,0 +1,1426 @@
1
+ import logging
2
+ import math
3
+ from typing import Optional
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ import numpy as np
8
+ import torch
9
+ from torch import nn
10
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
11
+ from torch.nn import functional as F
12
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
13
+
14
+ from rvc.lib.infer_pack import attentions, commons, modules
15
+ from rvc.lib.infer_pack.commons import get_padding, init_weights
16
+
17
+ has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())
18
+
19
+
20
+ class TextEncoder256(nn.Module):
21
+ def __init__(
22
+ self,
23
+ out_channels,
24
+ hidden_channels,
25
+ filter_channels,
26
+ n_heads,
27
+ n_layers,
28
+ kernel_size,
29
+ p_dropout,
30
+ f0=True,
31
+ ):
32
+ super(TextEncoder256, self).__init__()
33
+ self.out_channels = out_channels
34
+ self.hidden_channels = hidden_channels
35
+ self.filter_channels = filter_channels
36
+ self.n_heads = n_heads
37
+ self.n_layers = n_layers
38
+ self.kernel_size = kernel_size
39
+ self.p_dropout = float(p_dropout)
40
+ self.emb_phone = nn.Linear(256, hidden_channels)
41
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
42
+ if f0 == True:
43
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
44
+ self.encoder = attentions.Encoder(
45
+ hidden_channels,
46
+ filter_channels,
47
+ n_heads,
48
+ n_layers,
49
+ kernel_size,
50
+ float(p_dropout),
51
+ )
52
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
53
+
54
+ def forward(
55
+ self, phone: torch.Tensor, pitch: Optional[torch.Tensor], lengths: torch.Tensor
56
+ ):
57
+ if pitch is None:
58
+ x = self.emb_phone(phone)
59
+ else:
60
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
61
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
62
+ x = self.lrelu(x)
63
+ x = torch.transpose(x, 1, -1) # [b, h, t]
64
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
65
+ x.dtype
66
+ )
67
+ x = self.encoder(x * x_mask, x_mask)
68
+ stats = self.proj(x) * x_mask
69
+
70
+ m, logs = torch.split(stats, self.out_channels, dim=1)
71
+ return m, logs, x_mask
72
+
73
+
74
+ class TextEncoder768(nn.Module):
75
+ def __init__(
76
+ self,
77
+ out_channels,
78
+ hidden_channels,
79
+ filter_channels,
80
+ n_heads,
81
+ n_layers,
82
+ kernel_size,
83
+ p_dropout,
84
+ f0=True,
85
+ ):
86
+ super(TextEncoder768, self).__init__()
87
+ self.out_channels = out_channels
88
+ self.hidden_channels = hidden_channels
89
+ self.filter_channels = filter_channels
90
+ self.n_heads = n_heads
91
+ self.n_layers = n_layers
92
+ self.kernel_size = kernel_size
93
+ self.p_dropout = float(p_dropout)
94
+ self.emb_phone = nn.Linear(768, hidden_channels)
95
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
96
+ if f0 == True:
97
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
98
+ self.encoder = attentions.Encoder(
99
+ hidden_channels,
100
+ filter_channels,
101
+ n_heads,
102
+ n_layers,
103
+ kernel_size,
104
+ float(p_dropout),
105
+ )
106
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
107
+
108
+ def forward(self, phone: torch.Tensor, pitch: torch.Tensor, lengths: torch.Tensor):
109
+ if pitch is None:
110
+ x = self.emb_phone(phone)
111
+ else:
112
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
113
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
114
+ x = self.lrelu(x)
115
+ x = torch.transpose(x, 1, -1) # [b, h, t]
116
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
117
+ x.dtype
118
+ )
119
+ x = self.encoder(x * x_mask, x_mask)
120
+ stats = self.proj(x) * x_mask
121
+
122
+ m, logs = torch.split(stats, self.out_channels, dim=1)
123
+ return m, logs, x_mask
124
+
125
+
126
+ class ResidualCouplingBlock(nn.Module):
127
+ def __init__(
128
+ self,
129
+ channels,
130
+ hidden_channels,
131
+ kernel_size,
132
+ dilation_rate,
133
+ n_layers,
134
+ n_flows=4,
135
+ gin_channels=0,
136
+ ):
137
+ super(ResidualCouplingBlock, self).__init__()
138
+ self.channels = channels
139
+ self.hidden_channels = hidden_channels
140
+ self.kernel_size = kernel_size
141
+ self.dilation_rate = dilation_rate
142
+ self.n_layers = n_layers
143
+ self.n_flows = n_flows
144
+ self.gin_channels = gin_channels
145
+
146
+ self.flows = nn.ModuleList()
147
+ for i in range(n_flows):
148
+ self.flows.append(
149
+ modules.ResidualCouplingLayer(
150
+ channels,
151
+ hidden_channels,
152
+ kernel_size,
153
+ dilation_rate,
154
+ n_layers,
155
+ gin_channels=gin_channels,
156
+ mean_only=True,
157
+ )
158
+ )
159
+ self.flows.append(modules.Flip())
160
+
161
+ def forward(
162
+ self,
163
+ x: torch.Tensor,
164
+ x_mask: torch.Tensor,
165
+ g: Optional[torch.Tensor] = None,
166
+ reverse: bool = False,
167
+ ):
168
+ if not reverse:
169
+ for flow in self.flows:
170
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
171
+ else:
172
+ for flow in self.flows[::-1]:
173
+ x, _ = flow.forward(x, x_mask, g=g, reverse=reverse)
174
+ return x
175
+
176
+ def remove_weight_norm(self):
177
+ for i in range(self.n_flows):
178
+ self.flows[i * 2].remove_weight_norm()
179
+
180
+ def __prepare_scriptable__(self):
181
+ for i in range(self.n_flows):
182
+ for hook in self.flows[i * 2]._forward_pre_hooks.values():
183
+ if (
184
+ hook.__module__ == "torch.nn.utils.weight_norm"
185
+ and hook.__class__.__name__ == "WeightNorm"
186
+ ):
187
+ torch.nn.utils.remove_weight_norm(self.flows[i * 2])
188
+
189
+ return self
190
+
191
+
192
+ class PosteriorEncoder(nn.Module):
193
+ def __init__(
194
+ self,
195
+ in_channels,
196
+ out_channels,
197
+ hidden_channels,
198
+ kernel_size,
199
+ dilation_rate,
200
+ n_layers,
201
+ gin_channels=0,
202
+ ):
203
+ super(PosteriorEncoder, self).__init__()
204
+ self.in_channels = in_channels
205
+ self.out_channels = out_channels
206
+ self.hidden_channels = hidden_channels
207
+ self.kernel_size = kernel_size
208
+ self.dilation_rate = dilation_rate
209
+ self.n_layers = n_layers
210
+ self.gin_channels = gin_channels
211
+
212
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
213
+ self.enc = modules.WN(
214
+ hidden_channels,
215
+ kernel_size,
216
+ dilation_rate,
217
+ n_layers,
218
+ gin_channels=gin_channels,
219
+ )
220
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
221
+
222
+ def forward(
223
+ self, x: torch.Tensor, x_lengths: torch.Tensor, g: Optional[torch.Tensor] = None
224
+ ):
225
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
226
+ x.dtype
227
+ )
228
+ x = self.pre(x) * x_mask
229
+ x = self.enc(x, x_mask, g=g)
230
+ stats = self.proj(x) * x_mask
231
+ m, logs = torch.split(stats, self.out_channels, dim=1)
232
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
233
+ return z, m, logs, x_mask
234
+
235
+ def remove_weight_norm(self):
236
+ self.enc.remove_weight_norm()
237
+
238
+ def __prepare_scriptable__(self):
239
+ for hook in self.enc._forward_pre_hooks.values():
240
+ if (
241
+ hook.__module__ == "torch.nn.utils.weight_norm"
242
+ and hook.__class__.__name__ == "WeightNorm"
243
+ ):
244
+ torch.nn.utils.remove_weight_norm(self.enc)
245
+ return self
246
+
247
+
248
+ class Generator(torch.nn.Module):
249
+ def __init__(
250
+ self,
251
+ initial_channel,
252
+ resblock,
253
+ resblock_kernel_sizes,
254
+ resblock_dilation_sizes,
255
+ upsample_rates,
256
+ upsample_initial_channel,
257
+ upsample_kernel_sizes,
258
+ gin_channels=0,
259
+ ):
260
+ super(Generator, self).__init__()
261
+ self.num_kernels = len(resblock_kernel_sizes)
262
+ self.num_upsamples = len(upsample_rates)
263
+ self.conv_pre = Conv1d(
264
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
265
+ )
266
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
267
+
268
+ self.ups = nn.ModuleList()
269
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
270
+ self.ups.append(
271
+ weight_norm(
272
+ ConvTranspose1d(
273
+ upsample_initial_channel // (2**i),
274
+ upsample_initial_channel // (2 ** (i + 1)),
275
+ k,
276
+ u,
277
+ padding=(k - u) // 2,
278
+ )
279
+ )
280
+ )
281
+
282
+ self.resblocks = nn.ModuleList()
283
+ for i in range(len(self.ups)):
284
+ ch = upsample_initial_channel // (2 ** (i + 1))
285
+ for j, (k, d) in enumerate(
286
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
287
+ ):
288
+ self.resblocks.append(resblock(ch, k, d))
289
+
290
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
291
+ self.ups.apply(init_weights)
292
+
293
+ if gin_channels != 0:
294
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
295
+
296
+ def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
297
+ x = self.conv_pre(x)
298
+ if g is not None:
299
+ x = x + self.cond(g)
300
+
301
+ for i in range(self.num_upsamples):
302
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
303
+ x = self.ups[i](x)
304
+ xs = None
305
+ for j in range(self.num_kernels):
306
+ if xs is None:
307
+ xs = self.resblocks[i * self.num_kernels + j](x)
308
+ else:
309
+ xs += self.resblocks[i * self.num_kernels + j](x)
310
+ x = xs / self.num_kernels
311
+ x = F.leaky_relu(x)
312
+ x = self.conv_post(x)
313
+ x = torch.tanh(x)
314
+
315
+ return x
316
+
317
+ def __prepare_scriptable__(self):
318
+ for l in self.ups:
319
+ for hook in l._forward_pre_hooks.values():
320
+ # The hook we want to remove is an instance of WeightNorm class, so
321
+ # normally we would do `if isinstance(...)` but this class is not accessible
322
+ # because of shadowing, so we check the module name directly.
323
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
324
+ if (
325
+ hook.__module__ == "torch.nn.utils.weight_norm"
326
+ and hook.__class__.__name__ == "WeightNorm"
327
+ ):
328
+ torch.nn.utils.remove_weight_norm(l)
329
+
330
+ for l in self.resblocks:
331
+ for hook in l._forward_pre_hooks.values():
332
+ if (
333
+ hook.__module__ == "torch.nn.utils.weight_norm"
334
+ and hook.__class__.__name__ == "WeightNorm"
335
+ ):
336
+ torch.nn.utils.remove_weight_norm(l)
337
+ return self
338
+
339
+ def remove_weight_norm(self):
340
+ for l in self.ups:
341
+ remove_weight_norm(l)
342
+ for l in self.resblocks:
343
+ l.remove_weight_norm()
344
+
345
+
346
+ class SineGen(torch.nn.Module):
347
+ """Definition of sine generator
348
+ SineGen(samp_rate, harmonic_num = 0,
349
+ sine_amp = 0.1, noise_std = 0.003,
350
+ voiced_threshold = 0,
351
+ flag_for_pulse=False)
352
+ samp_rate: sampling rate in Hz
353
+ harmonic_num: number of harmonic overtones (default 0)
354
+ sine_amp: amplitude of sine waveform (default 0.1)
355
+ noise_std: std of Gaussian noise (default 0.003)
356
+ voiced_threshold: F0 threshold for U/V classification (default 0)
357
+ flag_for_pulse: this SineGen is used inside PulseGen (default False)
358
+ Note: when flag_for_pulse is True, the first time step of a voiced
359
+ segment is always sin(torch.pi) or cos(0)
360
+ """
361
+
362
+ def __init__(
363
+ self,
364
+ samp_rate,
365
+ harmonic_num=0,
366
+ sine_amp=0.1,
367
+ noise_std=0.003,
368
+ voiced_threshold=0,
369
+ flag_for_pulse=False,
370
+ ):
371
+ super(SineGen, self).__init__()
372
+ self.sine_amp = sine_amp
373
+ self.noise_std = noise_std
374
+ self.harmonic_num = harmonic_num
375
+ self.dim = self.harmonic_num + 1
376
+ self.sampling_rate = samp_rate
377
+ self.voiced_threshold = voiced_threshold
378
+
379
+ def _f02uv(self, f0):
380
+ # generate uv signal
381
+ uv = torch.ones_like(f0)
382
+ uv = uv * (f0 > self.voiced_threshold)
383
+ if uv.device.type == "privateuseone": # for DirectML
384
+ uv = uv.float()
385
+ return uv
386
+
387
+ def forward(self, f0: torch.Tensor, upp: int):
388
+ """sine_tensor, uv = forward(f0)
389
+ input F0: tensor(batchsize=1, length, dim=1)
390
+ f0 for unvoiced steps should be 0
391
+ output sine_tensor: tensor(batchsize=1, length, dim)
392
+ output uv: tensor(batchsize=1, length, 1)
393
+ """
394
+ with torch.no_grad():
395
+ device = next(self.parameters(), None)
396
+ if device is not None:
397
+ device = device.device
398
+ else:
399
+ device = f0.device
400
+ align_corners = device.type != "xpu"
401
+ f0 = f0[:, None].transpose(1, 2)
402
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
403
+ # fundamental component
404
+ f0_buf[:, :, 0] = f0[:, :, 0]
405
+ for idx in range(self.harmonic_num):
406
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
407
+ idx + 2
408
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
409
+ rad_values = (f0_buf / self.sampling_rate) % 1  # taking % 1 here means the n_har products cannot be optimized in post-processing
410
+ rand_ini = torch.rand(
411
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
412
+ )
413
+ rand_ini[:, 0] = 0
414
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
415
+ tmp_over_one = torch.cumsum(rad_values, 1)  # % 1 (taking % 1 here would prevent the later cumsum from being optimized)
416
+ tmp_over_one *= upp
417
+ tmp_over_one = F.interpolate(
418
+ tmp_over_one.transpose(2, 1),
419
+ scale_factor=float(upp),
420
+ mode="linear",
421
+ align_corners=align_corners,
422
+ ).transpose(2, 1)
423
+ rad_values = F.interpolate(
424
+ rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
425
+ ).transpose(
426
+ 2, 1
427
+ ) #######
428
+ tmp_over_one %= 1
429
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
430
+ cumsum_shift = torch.zeros_like(rad_values)
431
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
432
+ sine_waves = torch.sin(
433
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
434
+ )
435
+ sine_waves = sine_waves * self.sine_amp
436
+ uv = self._f02uv(f0)
437
+ uv = F.interpolate(
438
+ uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
439
+ ).transpose(2, 1)
440
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
441
+ noise = noise_amp * torch.randn_like(sine_waves)
442
+ sine_waves = sine_waves * uv + noise
443
+ return sine_waves, uv, noise
444
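A rough shape sketch for SineGen, assuming the class is imported from this file (rvc.lib.infer_pack.models); the sampling rate, f0 contour and upp factor below are illustrative only:

import torch
from rvc.lib.infer_pack.models import SineGen

gen = SineGen(samp_rate=40000, harmonic_num=0)
f0 = torch.full((1, 100), 220.0)   # [batch, f0 frames], a constant 220 Hz contour
sine, uv, noise = gen(f0, 400)     # upp = 400 audio samples per f0 frame
print(sine.shape, uv.shape)        # both torch.Size([1, 40000, 1]) for harmonic_num = 0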
+
445
+
446
+ class SourceModuleHnNSF(torch.nn.Module):
447
+ """SourceModule for hn-nsf
448
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
449
+ add_noise_std=0.003, voiced_threshod=0)
450
+ sampling_rate: sampling_rate in Hz
451
+ harmonic_num: number of harmonic above F0 (default: 0)
452
+ sine_amp: amplitude of sine source signal (default: 0.1)
453
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
454
+ note that the amplitude of noise in unvoiced segments is determined
455
+ by sine_amp
456
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
457
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
458
+ F0_sampled (batchsize, length, 1)
459
+ Sine_source (batchsize, length, 1)
460
+ noise_source (batchsize, length 1)
461
+ uv (batchsize, length, 1)
462
+ """
463
+
464
+ def __init__(
465
+ self,
466
+ sampling_rate,
467
+ harmonic_num=0,
468
+ sine_amp=0.1,
469
+ add_noise_std=0.003,
470
+ voiced_threshod=0,
471
+ is_half=True,
472
+ ):
473
+ super(SourceModuleHnNSF, self).__init__()
474
+
475
+ self.sine_amp = sine_amp
476
+ self.noise_std = add_noise_std
477
+ self.is_half = is_half
478
+ # to produce sine waveforms
479
+ self.l_sin_gen = SineGen(
480
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
481
+ )
482
+
483
+ # to merge source harmonics into a single excitation
484
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
485
+ self.l_tanh = torch.nn.Tanh()
486
+ # self.ddtype:int = -1
487
+
488
+ def forward(self, x: torch.Tensor, upp: int = 1):
489
+ # if self.ddtype ==-1:
490
+ # self.ddtype = self.l_linear.weight.dtype
491
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
492
+ # print(x.dtype,sine_wavs.dtype,self.l_linear.weight.dtype)
493
+ # if self.is_half:
494
+ # sine_wavs = sine_wavs.half()
495
+ # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x)))
496
+ # print(sine_wavs.dtype,self.ddtype)
497
+ # if sine_wavs.dtype != self.l_linear.weight.dtype:
498
+ sine_wavs = sine_wavs.to(dtype=self.l_linear.weight.dtype)
499
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
500
+ return sine_merge, None, None # noise, uv
501
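SourceModuleHnNSF collapses the harmonic_num + 1 sine channels into a single excitation with a 1-unit linear layer and tanh; a hedged shape sketch under the same illustrative values as above:

import torch
from rvc.lib.infer_pack.models import SourceModuleHnNSF

src = SourceModuleHnNSF(sampling_rate=40000, harmonic_num=0)
sine_merge, _, _ = src(torch.full((1, 100), 220.0), upp=400)
print(sine_merge.shape)  # torch.Size([1, 40000, 1]) -- the excitation fed to GeneratorNSF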
+
502
+
503
+ class GeneratorNSF(torch.nn.Module):
504
+ def __init__(
505
+ self,
506
+ initial_channel,
507
+ resblock,
508
+ resblock_kernel_sizes,
509
+ resblock_dilation_sizes,
510
+ upsample_rates,
511
+ upsample_initial_channel,
512
+ upsample_kernel_sizes,
513
+ gin_channels,
514
+ sr,
515
+ is_half=False,
516
+ ):
517
+ super(GeneratorNSF, self).__init__()
518
+ self.num_kernels = len(resblock_kernel_sizes)
519
+ self.num_upsamples = len(upsample_rates)
520
+
521
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=math.prod(upsample_rates))
522
+ self.m_source = SourceModuleHnNSF(
523
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
524
+ )
525
+ self.noise_convs = nn.ModuleList()
526
+ self.conv_pre = Conv1d(
527
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
528
+ )
529
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
530
+
531
+ self.ups = nn.ModuleList()
532
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
533
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
534
+ self.ups.append(
535
+ weight_norm(
536
+ ConvTranspose1d(
537
+ upsample_initial_channel // (2**i),
538
+ upsample_initial_channel // (2 ** (i + 1)),
539
+ k,
540
+ u,
541
+ padding=(k - u) // 2,
542
+ )
543
+ )
544
+ )
545
+ if i + 1 < len(upsample_rates):
546
+ stride_f0 = math.prod(upsample_rates[i + 1 :])
547
+ self.noise_convs.append(
548
+ Conv1d(
549
+ 1,
550
+ c_cur,
551
+ kernel_size=stride_f0 * 2,
552
+ stride=stride_f0,
553
+ padding=stride_f0 // 2,
554
+ )
555
+ )
556
+ else:
557
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
558
+
559
+ self.resblocks = nn.ModuleList()
560
+ for i in range(len(self.ups)):
561
+ ch = upsample_initial_channel // (2 ** (i + 1))
562
+ for j, (k, d) in enumerate(
563
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
564
+ ):
565
+ self.resblocks.append(resblock(ch, k, d))
566
+
567
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
568
+ self.ups.apply(init_weights)
569
+
570
+ if gin_channels != 0:
571
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
572
+
573
+ self.upp = math.prod(upsample_rates)
574
+
575
+ self.lrelu_slope = modules.LRELU_SLOPE
576
+
577
+ def forward(self, x, f0, g: Optional[torch.Tensor] = None):
578
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
579
+ har_source = har_source.transpose(1, 2)
580
+ x = self.conv_pre(x)
581
+ if g is not None:
582
+ x = x + self.cond(g)
583
+ # torch.jit.script() does not support direct indexing of torch modules
584
+ # That's why I wrote this
585
+ for i, (ups, noise_convs) in enumerate(zip(self.ups, self.noise_convs)):
586
+ if i < self.num_upsamples:
587
+ x = F.leaky_relu(x, self.lrelu_slope)
588
+ x = ups(x)
589
+ x_source = noise_convs(har_source)
590
+ x = x + x_source
591
+ xs: Optional[torch.Tensor] = None
592
+ l = [i * self.num_kernels + j for j in range(self.num_kernels)]
593
+ for j, resblock in enumerate(self.resblocks):
594
+ if j in l:
595
+ if xs is None:
596
+ xs = resblock(x)
597
+ else:
598
+ xs += resblock(x)
599
+ # This assertion cannot be ignored!
600
+ # If ignored, it will cause torch.jit.script() compilation errors
601
+ assert isinstance(xs, torch.Tensor)
602
+ x = xs / self.num_kernels
603
+ x = F.leaky_relu(x)
604
+ x = self.conv_post(x)
605
+ x = torch.tanh(x)
606
+ return x
607
+
608
+ def remove_weight_norm(self):
609
+ for l in self.ups:
610
+ remove_weight_norm(l)
611
+ for l in self.resblocks:
612
+ l.remove_weight_norm()
613
+
614
+ def __prepare_scriptable__(self):
615
+ for l in self.ups:
616
+ for hook in l._forward_pre_hooks.values():
617
+ # The hook we want to remove is an instance of WeightNorm class, so
618
+ # normally we would do `if isinstance(...)` but this class is not accessible
619
+ # because of shadowing, so we check the module name directly.
620
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
621
+ if (
622
+ hook.__module__ == "torch.nn.utils.weight_norm"
623
+ and hook.__class__.__name__ == "WeightNorm"
624
+ ):
625
+ torch.nn.utils.remove_weight_norm(l)
626
+ for l in self.resblocks:
627
+ for hook in l._forward_pre_hooks.values():
628
+ if (
629
+ hook.__module__ == "torch.nn.utils.weight_norm"
630
+ and hook.__class__.__name__ == "WeightNorm"
631
+ ):
632
+ torch.nn.utils.remove_weight_norm(l)
633
+ return self
634
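For orientation, how the f0 path lines up with the upsampling stack in GeneratorNSF, as a small arithmetic sketch (the rates below match a typical 40k configuration and are only an assumption here):

import math

upsample_rates = [10, 10, 2, 2]
upp = math.prod(upsample_rates)  # 400: each f0 frame expands to 400 audio samples
# every noise_convs[i] except the last downsamples the audio-rate harmonic source back
# to the resolution of stage i's output, using the product of the remaining rates
strides = [math.prod(upsample_rates[i + 1:]) for i in range(len(upsample_rates) - 1)]
print(upp, strides)              # 400 [40, 4, 2]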
+
635
+
636
+ sr2sr = {
637
+ "32k": 32000,
638
+ "40k": 40000,
639
+ "48k": 48000,
640
+ }
641
+
642
+
643
+ class SynthesizerTrnMs256NSFsid(nn.Module):
644
+ def __init__(
645
+ self,
646
+ spec_channels,
647
+ segment_size,
648
+ inter_channels,
649
+ hidden_channels,
650
+ filter_channels,
651
+ n_heads,
652
+ n_layers,
653
+ kernel_size,
654
+ p_dropout,
655
+ resblock,
656
+ resblock_kernel_sizes,
657
+ resblock_dilation_sizes,
658
+ upsample_rates,
659
+ upsample_initial_channel,
660
+ upsample_kernel_sizes,
661
+ spk_embed_dim,
662
+ gin_channels,
663
+ sr,
664
+ **kwargs
665
+ ):
666
+ super(SynthesizerTrnMs256NSFsid, self).__init__()
667
+ if isinstance(sr, str):
668
+ sr = sr2sr[sr]
669
+ self.spec_channels = spec_channels
670
+ self.inter_channels = inter_channels
671
+ self.hidden_channels = hidden_channels
672
+ self.filter_channels = filter_channels
673
+ self.n_heads = n_heads
674
+ self.n_layers = n_layers
675
+ self.kernel_size = kernel_size
676
+ self.p_dropout = float(p_dropout)
677
+ self.resblock = resblock
678
+ self.resblock_kernel_sizes = resblock_kernel_sizes
679
+ self.resblock_dilation_sizes = resblock_dilation_sizes
680
+ self.upsample_rates = upsample_rates
681
+ self.upsample_initial_channel = upsample_initial_channel
682
+ self.upsample_kernel_sizes = upsample_kernel_sizes
683
+ self.segment_size = segment_size
684
+ self.gin_channels = gin_channels
685
+ # self.hop_length = hop_length#
686
+ self.spk_embed_dim = spk_embed_dim
687
+ self.enc_p = TextEncoder256(
688
+ inter_channels,
689
+ hidden_channels,
690
+ filter_channels,
691
+ n_heads,
692
+ n_layers,
693
+ kernel_size,
694
+ float(p_dropout),
695
+ )
696
+ self.dec = GeneratorNSF(
697
+ inter_channels,
698
+ resblock,
699
+ resblock_kernel_sizes,
700
+ resblock_dilation_sizes,
701
+ upsample_rates,
702
+ upsample_initial_channel,
703
+ upsample_kernel_sizes,
704
+ gin_channels=gin_channels,
705
+ sr=sr,
706
+ is_half=kwargs["is_half"],
707
+ )
708
+ self.enc_q = PosteriorEncoder(
709
+ spec_channels,
710
+ inter_channels,
711
+ hidden_channels,
712
+ 5,
713
+ 1,
714
+ 16,
715
+ gin_channels=gin_channels,
716
+ )
717
+ self.flow = ResidualCouplingBlock(
718
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
719
+ )
720
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
721
+ logger.debug(
722
+ "gin_channels: "
723
+ + str(gin_channels)
724
+ + ", self.spk_embed_dim: "
725
+ + str(self.spk_embed_dim)
726
+ )
727
+
728
+ def remove_weight_norm(self):
729
+ self.dec.remove_weight_norm()
730
+ self.flow.remove_weight_norm()
731
+ self.enc_q.remove_weight_norm()
732
+
733
+ def __prepare_scriptable__(self):
734
+ for hook in self.dec._forward_pre_hooks.values():
735
+ # The hook we want to remove is an instance of WeightNorm class, so
736
+ # normally we would do `if isinstance(...)` but this class is not accessible
737
+ # because of shadowing, so we check the module name directly.
738
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
739
+ if (
740
+ hook.__module__ == "torch.nn.utils.weight_norm"
741
+ and hook.__class__.__name__ == "WeightNorm"
742
+ ):
743
+ torch.nn.utils.remove_weight_norm(self.dec)
744
+ for hook in self.flow._forward_pre_hooks.values():
745
+ if (
746
+ hook.__module__ == "torch.nn.utils.weight_norm"
747
+ and hook.__class__.__name__ == "WeightNorm"
748
+ ):
749
+ torch.nn.utils.remove_weight_norm(self.flow)
750
+ if hasattr(self, "enc_q"):
751
+ for hook in self.enc_q._forward_pre_hooks.values():
752
+ if (
753
+ hook.__module__ == "torch.nn.utils.weight_norm"
754
+ and hook.__class__.__name__ == "WeightNorm"
755
+ ):
756
+ torch.nn.utils.remove_weight_norm(self.enc_q)
757
+ return self
758
+
759
+ @torch.jit.ignore
760
+ def forward(
761
+ self,
762
+ phone: torch.Tensor,
763
+ phone_lengths: torch.Tensor,
764
+ pitch: torch.Tensor,
765
+ pitchf: torch.Tensor,
766
+ y: torch.Tensor,
767
+ y_lengths: torch.Tensor,
768
+ ds: Optional[torch.Tensor] = None,
769
+ ): # ds is the speaker id, shape [bs, 1]
770
+ # print(1,pitch.shape)#[bs,t]
771
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis and is broadcast
772
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
773
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
774
+ z_p = self.flow(z, y_mask, g=g)
775
+ z_slice, ids_slice = commons.rand_slice_segments(
776
+ z, y_lengths, self.segment_size
777
+ )
778
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
779
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
780
+ # print(-2,pitchf.shape,z_slice.shape)
781
+ o = self.dec(z_slice, pitchf, g=g)
782
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
783
+
784
+ @torch.jit.export
785
+ def infer(
786
+ self,
787
+ phone: torch.Tensor,
788
+ phone_lengths: torch.Tensor,
789
+ pitch: torch.Tensor,
790
+ nsff0: torch.Tensor,
791
+ sid: torch.Tensor,
792
+ rate: Optional[torch.Tensor] = None,
793
+ ):
794
+ g = self.emb_g(sid).unsqueeze(-1)
795
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
796
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
797
+ if rate is not None:
798
+ assert isinstance(rate, torch.Tensor)
799
+ head = int(z_p.shape[2] * (1 - rate.item()))
800
+ z_p = z_p[:, :, head:]
801
+ x_mask = x_mask[:, :, head:]
802
+ nsff0 = nsff0[:, head:]
803
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
804
+ o = self.dec(z * x_mask, nsff0, g=g)
805
+ return o, x_mask, (z, z_p, m_p, logs_p)
806
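The optional rate argument to infer keeps only the trailing fraction of the latent (useful for partial / streaming inference); a small sketch of the arithmetic it performs (the numbers are illustrative):

import torch

rate = torch.tensor(0.25)
frames = 200
head = int(frames * (1 - rate.item()))  # 150: the first 150 frames are dropped
# z_p and x_mask are then sliced as [:, :, head:], nsff0 as [:, head:]
print(head)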
+
807
+
808
+ class SynthesizerTrnMs768NSFsid(nn.Module):
809
+ def __init__(
810
+ self,
811
+ spec_channels,
812
+ segment_size,
813
+ inter_channels,
814
+ hidden_channels,
815
+ filter_channels,
816
+ n_heads,
817
+ n_layers,
818
+ kernel_size,
819
+ p_dropout,
820
+ resblock,
821
+ resblock_kernel_sizes,
822
+ resblock_dilation_sizes,
823
+ upsample_rates,
824
+ upsample_initial_channel,
825
+ upsample_kernel_sizes,
826
+ spk_embed_dim,
827
+ gin_channels,
828
+ sr,
829
+ **kwargs
830
+ ):
831
+ super(SynthesizerTrnMs768NSFsid, self).__init__()
832
+ if isinstance(sr, str):
833
+ sr = sr2sr[sr]
834
+ self.spec_channels = spec_channels
835
+ self.inter_channels = inter_channels
836
+ self.hidden_channels = hidden_channels
837
+ self.filter_channels = filter_channels
838
+ self.n_heads = n_heads
839
+ self.n_layers = n_layers
840
+ self.kernel_size = kernel_size
841
+ self.p_dropout = float(p_dropout)
842
+ self.resblock = resblock
843
+ self.resblock_kernel_sizes = resblock_kernel_sizes
844
+ self.resblock_dilation_sizes = resblock_dilation_sizes
845
+ self.upsample_rates = upsample_rates
846
+ self.upsample_initial_channel = upsample_initial_channel
847
+ self.upsample_kernel_sizes = upsample_kernel_sizes
848
+ self.segment_size = segment_size
849
+ self.gin_channels = gin_channels
850
+ # self.hop_length = hop_length#
851
+ self.spk_embed_dim = spk_embed_dim
852
+ self.enc_p = TextEncoder768(
853
+ inter_channels,
854
+ hidden_channels,
855
+ filter_channels,
856
+ n_heads,
857
+ n_layers,
858
+ kernel_size,
859
+ float(p_dropout),
860
+ )
861
+ self.dec = GeneratorNSF(
862
+ inter_channels,
863
+ resblock,
864
+ resblock_kernel_sizes,
865
+ resblock_dilation_sizes,
866
+ upsample_rates,
867
+ upsample_initial_channel,
868
+ upsample_kernel_sizes,
869
+ gin_channels=gin_channels,
870
+ sr=sr,
871
+ is_half=kwargs["is_half"],
872
+ )
873
+ self.enc_q = PosteriorEncoder(
874
+ spec_channels,
875
+ inter_channels,
876
+ hidden_channels,
877
+ 5,
878
+ 1,
879
+ 16,
880
+ gin_channels=gin_channels,
881
+ )
882
+ self.flow = ResidualCouplingBlock(
883
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
884
+ )
885
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
886
+ logger.debug(
887
+ "gin_channels: "
888
+ + str(gin_channels)
889
+ + ", self.spk_embed_dim: "
890
+ + str(self.spk_embed_dim)
891
+ )
892
+
893
+ def remove_weight_norm(self):
894
+ self.dec.remove_weight_norm()
895
+ self.flow.remove_weight_norm()
896
+ self.enc_q.remove_weight_norm()
897
+
898
+ def __prepare_scriptable__(self):
899
+ for hook in self.dec._forward_pre_hooks.values():
900
+ # The hook we want to remove is an instance of WeightNorm class, so
901
+ # normally we would do `if isinstance(...)` but this class is not accessible
902
+ # because of shadowing, so we check the module name directly.
903
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
904
+ if (
905
+ hook.__module__ == "torch.nn.utils.weight_norm"
906
+ and hook.__class__.__name__ == "WeightNorm"
907
+ ):
908
+ torch.nn.utils.remove_weight_norm(self.dec)
909
+ for hook in self.flow._forward_pre_hooks.values():
910
+ if (
911
+ hook.__module__ == "torch.nn.utils.weight_norm"
912
+ and hook.__class__.__name__ == "WeightNorm"
913
+ ):
914
+ torch.nn.utils.remove_weight_norm(self.flow)
915
+ if hasattr(self, "enc_q"):
916
+ for hook in self.enc_q._forward_pre_hooks.values():
917
+ if (
918
+ hook.__module__ == "torch.nn.utils.weight_norm"
919
+ and hook.__class__.__name__ == "WeightNorm"
920
+ ):
921
+ torch.nn.utils.remove_weight_norm(self.enc_q)
922
+ return self
923
+
924
+ @torch.jit.ignore
925
+ def forward(
926
+ self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
927
+ ): # ds is the speaker id, shape [bs, 1]
928
+ # print(1,pitch.shape)#[bs,t]
929
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis and is broadcast
930
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
931
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
932
+ z_p = self.flow(z, y_mask, g=g)
933
+ z_slice, ids_slice = commons.rand_slice_segments(
934
+ z, y_lengths, self.segment_size
935
+ )
936
+ # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
937
+ pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
938
+ # print(-2,pitchf.shape,z_slice.shape)
939
+ o = self.dec(z_slice, pitchf, g=g)
940
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
941
+
942
+ @torch.jit.export
943
+ def infer(
944
+ self,
945
+ phone: torch.Tensor,
946
+ phone_lengths: torch.Tensor,
947
+ pitch: torch.Tensor,
948
+ nsff0: torch.Tensor,
949
+ sid: torch.Tensor,
950
+ rate: Optional[torch.Tensor] = None,
951
+ ):
952
+ g = self.emb_g(sid).unsqueeze(-1)
953
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
954
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
955
+ if rate is not None:
956
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
957
+ z_p = z_p[:, :, head:]
958
+ x_mask = x_mask[:, :, head:]
959
+ nsff0 = nsff0[:, head:]
960
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
961
+ o = self.dec(z * x_mask, nsff0, g=g)
962
+ return o, x_mask, (z, z_p, m_p, logs_p)
963
+
964
+
965
+ class SynthesizerTrnMs256NSFsid_nono(nn.Module):
966
+ def __init__(
967
+ self,
968
+ spec_channels,
969
+ segment_size,
970
+ inter_channels,
971
+ hidden_channels,
972
+ filter_channels,
973
+ n_heads,
974
+ n_layers,
975
+ kernel_size,
976
+ p_dropout,
977
+ resblock,
978
+ resblock_kernel_sizes,
979
+ resblock_dilation_sizes,
980
+ upsample_rates,
981
+ upsample_initial_channel,
982
+ upsample_kernel_sizes,
983
+ spk_embed_dim,
984
+ gin_channels,
985
+ sr=None,
986
+ **kwargs
987
+ ):
988
+ super(SynthesizerTrnMs256NSFsid_nono, self).__init__()
989
+ self.spec_channels = spec_channels
990
+ self.inter_channels = inter_channels
991
+ self.hidden_channels = hidden_channels
992
+ self.filter_channels = filter_channels
993
+ self.n_heads = n_heads
994
+ self.n_layers = n_layers
995
+ self.kernel_size = kernel_size
996
+ self.p_dropout = float(p_dropout)
997
+ self.resblock = resblock
998
+ self.resblock_kernel_sizes = resblock_kernel_sizes
999
+ self.resblock_dilation_sizes = resblock_dilation_sizes
1000
+ self.upsample_rates = upsample_rates
1001
+ self.upsample_initial_channel = upsample_initial_channel
1002
+ self.upsample_kernel_sizes = upsample_kernel_sizes
1003
+ self.segment_size = segment_size
1004
+ self.gin_channels = gin_channels
1005
+ # self.hop_length = hop_length#
1006
+ self.spk_embed_dim = spk_embed_dim
1007
+ self.enc_p = TextEncoder256(
1008
+ inter_channels,
1009
+ hidden_channels,
1010
+ filter_channels,
1011
+ n_heads,
1012
+ n_layers,
1013
+ kernel_size,
1014
+ float(p_dropout),
1015
+ f0=False,
1016
+ )
1017
+ self.dec = Generator(
1018
+ inter_channels,
1019
+ resblock,
1020
+ resblock_kernel_sizes,
1021
+ resblock_dilation_sizes,
1022
+ upsample_rates,
1023
+ upsample_initial_channel,
1024
+ upsample_kernel_sizes,
1025
+ gin_channels=gin_channels,
1026
+ )
1027
+ self.enc_q = PosteriorEncoder(
1028
+ spec_channels,
1029
+ inter_channels,
1030
+ hidden_channels,
1031
+ 5,
1032
+ 1,
1033
+ 16,
1034
+ gin_channels=gin_channels,
1035
+ )
1036
+ self.flow = ResidualCouplingBlock(
1037
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
1038
+ )
1039
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
1040
+ logger.debug(
1041
+ "gin_channels: "
1042
+ + str(gin_channels)
1043
+ + ", self.spk_embed_dim: "
1044
+ + str(self.spk_embed_dim)
1045
+ )
1046
+
1047
+ def remove_weight_norm(self):
1048
+ self.dec.remove_weight_norm()
1049
+ self.flow.remove_weight_norm()
1050
+ self.enc_q.remove_weight_norm()
1051
+
1052
+ def __prepare_scriptable__(self):
1053
+ for hook in self.dec._forward_pre_hooks.values():
1054
+ # The hook we want to remove is an instance of WeightNorm class, so
1055
+ # normally we would do `if isinstance(...)` but this class is not accessible
1056
+ # because of shadowing, so we check the module name directly.
1057
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1058
+ if (
1059
+ hook.__module__ == "torch.nn.utils.weight_norm"
1060
+ and hook.__class__.__name__ == "WeightNorm"
1061
+ ):
1062
+ torch.nn.utils.remove_weight_norm(self.dec)
1063
+ for hook in self.flow._forward_pre_hooks.values():
1064
+ if (
1065
+ hook.__module__ == "torch.nn.utils.weight_norm"
1066
+ and hook.__class__.__name__ == "WeightNorm"
1067
+ ):
1068
+ torch.nn.utils.remove_weight_norm(self.flow)
1069
+ if hasattr(self, "enc_q"):
1070
+ for hook in self.enc_q._forward_pre_hooks.values():
1071
+ if (
1072
+ hook.__module__ == "torch.nn.utils.weight_norm"
1073
+ and hook.__class__.__name__ == "WeightNorm"
1074
+ ):
1075
+ torch.nn.utils.remove_weight_norm(self.enc_q)
1076
+ return self
1077
+
1078
+ @torch.jit.ignore
1079
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # ds is the speaker id, shape [bs, 1]
1080
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis and is broadcast
1081
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
1082
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
1083
+ z_p = self.flow(z, y_mask, g=g)
1084
+ z_slice, ids_slice = commons.rand_slice_segments(
1085
+ z, y_lengths, self.segment_size
1086
+ )
1087
+ o = self.dec(z_slice, g=g)
1088
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
1089
+
1090
+ @torch.jit.export
1091
+ def infer(
1092
+ self,
1093
+ phone: torch.Tensor,
1094
+ phone_lengths: torch.Tensor,
1095
+ sid: torch.Tensor,
1096
+ rate: Optional[torch.Tensor] = None,
1097
+ ):
1098
+ g = self.emb_g(sid).unsqueeze(-1)
1099
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
1100
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
1101
+ if rate is not None:
1102
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
1103
+ z_p = z_p[:, :, head:]
1104
+ x_mask = x_mask[:, :, head:]
1105
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
1106
+ o = self.dec(z * x_mask, g=g)
1107
+ return o, x_mask, (z, z_p, m_p, logs_p)
1108
+
1109
+
1110
+ class SynthesizerTrnMs768NSFsid_nono(nn.Module):
1111
+ def __init__(
1112
+ self,
1113
+ spec_channels,
1114
+ segment_size,
1115
+ inter_channels,
1116
+ hidden_channels,
1117
+ filter_channels,
1118
+ n_heads,
1119
+ n_layers,
1120
+ kernel_size,
1121
+ p_dropout,
1122
+ resblock,
1123
+ resblock_kernel_sizes,
1124
+ resblock_dilation_sizes,
1125
+ upsample_rates,
1126
+ upsample_initial_channel,
1127
+ upsample_kernel_sizes,
1128
+ spk_embed_dim,
1129
+ gin_channels,
1130
+ sr=None,
1131
+ **kwargs
1132
+ ):
1133
+ super(SynthesizerTrnMs768NSFsid_nono, self).__init__()
1134
+ self.spec_channels = spec_channels
1135
+ self.inter_channels = inter_channels
1136
+ self.hidden_channels = hidden_channels
1137
+ self.filter_channels = filter_channels
1138
+ self.n_heads = n_heads
1139
+ self.n_layers = n_layers
1140
+ self.kernel_size = kernel_size
1141
+ self.p_dropout = float(p_dropout)
1142
+ self.resblock = resblock
1143
+ self.resblock_kernel_sizes = resblock_kernel_sizes
1144
+ self.resblock_dilation_sizes = resblock_dilation_sizes
1145
+ self.upsample_rates = upsample_rates
1146
+ self.upsample_initial_channel = upsample_initial_channel
1147
+ self.upsample_kernel_sizes = upsample_kernel_sizes
1148
+ self.segment_size = segment_size
1149
+ self.gin_channels = gin_channels
1150
+ # self.hop_length = hop_length#
1151
+ self.spk_embed_dim = spk_embed_dim
1152
+ self.enc_p = TextEncoder768(
1153
+ inter_channels,
1154
+ hidden_channels,
1155
+ filter_channels,
1156
+ n_heads,
1157
+ n_layers,
1158
+ kernel_size,
1159
+ float(p_dropout),
1160
+ f0=False,
1161
+ )
1162
+ self.dec = Generator(
1163
+ inter_channels,
1164
+ resblock,
1165
+ resblock_kernel_sizes,
1166
+ resblock_dilation_sizes,
1167
+ upsample_rates,
1168
+ upsample_initial_channel,
1169
+ upsample_kernel_sizes,
1170
+ gin_channels=gin_channels,
1171
+ )
1172
+ self.enc_q = PosteriorEncoder(
1173
+ spec_channels,
1174
+ inter_channels,
1175
+ hidden_channels,
1176
+ 5,
1177
+ 1,
1178
+ 16,
1179
+ gin_channels=gin_channels,
1180
+ )
1181
+ self.flow = ResidualCouplingBlock(
1182
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
1183
+ )
1184
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
1185
+ logger.debug(
1186
+ "gin_channels: "
1187
+ + str(gin_channels)
1188
+ + ", self.spk_embed_dim: "
1189
+ + str(self.spk_embed_dim)
1190
+ )
1191
+
1192
+ def remove_weight_norm(self):
1193
+ self.dec.remove_weight_norm()
1194
+ self.flow.remove_weight_norm()
1195
+ self.enc_q.remove_weight_norm()
1196
+
1197
+ def __prepare_scriptable__(self):
1198
+ for hook in self.dec._forward_pre_hooks.values():
1199
+ # The hook we want to remove is an instance of WeightNorm class, so
1200
+ # normally we would do `if isinstance(...)` but this class is not accessible
1201
+ # because of shadowing, so we check the module name directly.
1202
+ # https://github.com/pytorch/pytorch/blob/be0ca00c5ce260eb5bcec3237357f7a30cc08983/torch/nn/utils/__init__.py#L3
1203
+ if (
1204
+ hook.__module__ == "torch.nn.utils.weight_norm"
1205
+ and hook.__class__.__name__ == "WeightNorm"
1206
+ ):
1207
+ torch.nn.utils.remove_weight_norm(self.dec)
1208
+ for hook in self.flow._forward_pre_hooks.values():
1209
+ if (
1210
+ hook.__module__ == "torch.nn.utils.weight_norm"
1211
+ and hook.__class__.__name__ == "WeightNorm"
1212
+ ):
1213
+ torch.nn.utils.remove_weight_norm(self.flow)
1214
+ if hasattr(self, "enc_q"):
1215
+ for hook in self.enc_q._forward_pre_hooks.values():
1216
+ if (
1217
+ hook.__module__ == "torch.nn.utils.weight_norm"
1218
+ and hook.__class__.__name__ == "WeightNorm"
1219
+ ):
1220
+ torch.nn.utils.remove_weight_norm(self.enc_q)
1221
+ return self
1222
+
1223
+ @torch.jit.ignore
1224
+ def forward(self, phone, phone_lengths, y, y_lengths, ds): # ds is the speaker id, shape [bs, 1]
1225
+ g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is the time axis and is broadcast
1226
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
1227
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
1228
+ z_p = self.flow(z, y_mask, g=g)
1229
+ z_slice, ids_slice = commons.rand_slice_segments(
1230
+ z, y_lengths, self.segment_size
1231
+ )
1232
+ o = self.dec(z_slice, g=g)
1233
+ return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
1234
+
1235
+ @torch.jit.export
1236
+ def infer(
1237
+ self,
1238
+ phone: torch.Tensor,
1239
+ phone_lengths: torch.Tensor,
1240
+ sid: torch.Tensor,
1241
+ rate: Optional[torch.Tensor] = None,
1242
+ ):
1243
+ g = self.emb_g(sid).unsqueeze(-1)
1244
+ m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
1245
+ z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
1246
+ if rate is not None:
1247
+ head = int(z_p.shape[2] * (1.0 - rate.item()))
1248
+ z_p = z_p[:, :, head:]
1249
+ x_mask = x_mask[:, :, head:]
1250
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
1251
+ o = self.dec(z * x_mask, g=g)
1252
+ return o, x_mask, (z, z_p, m_p, logs_p)
1253
+
1254
+
1255
+ class MultiPeriodDiscriminator(torch.nn.Module):
1256
+ def __init__(self, use_spectral_norm=False):
1257
+ super(MultiPeriodDiscriminator, self).__init__()
1258
+ periods = [2, 3, 5, 7, 11, 17]
1259
+ # periods = [3, 5, 7, 11, 17, 23, 37]
1260
+
1261
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1262
+ discs = discs + [
1263
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1264
+ ]
1265
+ self.discriminators = nn.ModuleList(discs)
1266
+
1267
+ def forward(self, y, y_hat):
1268
+ y_d_rs = [] #
1269
+ y_d_gs = []
1270
+ fmap_rs = []
1271
+ fmap_gs = []
1272
+ for i, d in enumerate(self.discriminators):
1273
+ y_d_r, fmap_r = d(y)
1274
+ y_d_g, fmap_g = d(y_hat)
1275
+ # for j in range(len(fmap_r)):
1276
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1277
+ y_d_rs.append(y_d_r)
1278
+ y_d_gs.append(y_d_g)
1279
+ fmap_rs.append(fmap_r)
1280
+ fmap_gs.append(fmap_g)
1281
+
1282
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1283
+
1284
+
1285
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
1286
+ def __init__(self, use_spectral_norm=False):
1287
+ super(MultiPeriodDiscriminatorV2, self).__init__()
1288
+ # periods = [2, 3, 5, 7, 11, 17]
1289
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
1290
+
1291
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
1292
+ discs = discs + [
1293
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
1294
+ ]
1295
+ self.discriminators = nn.ModuleList(discs)
1296
+
1297
+ def forward(self, y, y_hat):
1298
+ y_d_rs = [] #
1299
+ y_d_gs = []
1300
+ fmap_rs = []
1301
+ fmap_gs = []
1302
+ for i, d in enumerate(self.discriminators):
1303
+ y_d_r, fmap_r = d(y)
1304
+ y_d_g, fmap_g = d(y_hat)
1305
+ # for j in range(len(fmap_r)):
1306
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
1307
+ y_d_rs.append(y_d_r)
1308
+ y_d_gs.append(y_d_g)
1309
+ fmap_rs.append(fmap_r)
1310
+ fmap_gs.append(fmap_g)
1311
+
1312
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
1313
+
1314
+
1315
+ class DiscriminatorS(torch.nn.Module):
1316
+ def __init__(self, use_spectral_norm=False):
1317
+ super(DiscriminatorS, self).__init__()
1318
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1319
+ self.convs = nn.ModuleList(
1320
+ [
1321
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
1322
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
1323
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
1324
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
1325
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
1326
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
1327
+ ]
1328
+ )
1329
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
1330
+
1331
+ def forward(self, x):
1332
+ fmap = []
1333
+
1334
+ for l in self.convs:
1335
+ x = l(x)
1336
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1337
+ fmap.append(x)
1338
+ x = self.conv_post(x)
1339
+ fmap.append(x)
1340
+ x = torch.flatten(x, 1, -1)
1341
+
1342
+ return x, fmap
1343
+
1344
+
1345
+ class DiscriminatorP(torch.nn.Module):
1346
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
1347
+ super(DiscriminatorP, self).__init__()
1348
+ self.period = period
1349
+ self.use_spectral_norm = use_spectral_norm
1350
+ norm_f = weight_norm if use_spectral_norm == False else spectral_norm
1351
+ self.convs = nn.ModuleList(
1352
+ [
1353
+ norm_f(
1354
+ Conv2d(
1355
+ 1,
1356
+ 32,
1357
+ (kernel_size, 1),
1358
+ (stride, 1),
1359
+ padding=(get_padding(kernel_size, 1), 0),
1360
+ )
1361
+ ),
1362
+ norm_f(
1363
+ Conv2d(
1364
+ 32,
1365
+ 128,
1366
+ (kernel_size, 1),
1367
+ (stride, 1),
1368
+ padding=(get_padding(kernel_size, 1), 0),
1369
+ )
1370
+ ),
1371
+ norm_f(
1372
+ Conv2d(
1373
+ 128,
1374
+ 512,
1375
+ (kernel_size, 1),
1376
+ (stride, 1),
1377
+ padding=(get_padding(kernel_size, 1), 0),
1378
+ )
1379
+ ),
1380
+ norm_f(
1381
+ Conv2d(
1382
+ 512,
1383
+ 1024,
1384
+ (kernel_size, 1),
1385
+ (stride, 1),
1386
+ padding=(get_padding(kernel_size, 1), 0),
1387
+ )
1388
+ ),
1389
+ norm_f(
1390
+ Conv2d(
1391
+ 1024,
1392
+ 1024,
1393
+ (kernel_size, 1),
1394
+ 1,
1395
+ padding=(get_padding(kernel_size, 1), 0),
1396
+ )
1397
+ ),
1398
+ ]
1399
+ )
1400
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
1401
+
1402
+ def forward(self, x):
1403
+ fmap = []
1404
+
1405
+ # 1d to 2d
1406
+ b, c, t = x.shape
1407
+ if t % self.period != 0: # pad first
1408
+ n_pad = self.period - (t % self.period)
1409
+ if has_xpu and x.dtype == torch.bfloat16:
1410
+ x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to(
1411
+ dtype=torch.bfloat16
1412
+ )
1413
+ else:
1414
+ x = F.pad(x, (0, n_pad), "reflect")
1415
+ t = t + n_pad
1416
+ x = x.view(b, c, t // self.period, self.period)
1417
+
1418
+ for l in self.convs:
1419
+ x = l(x)
1420
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
1421
+ fmap.append(x)
1422
+ x = self.conv_post(x)
1423
+ fmap.append(x)
1424
+ x = torch.flatten(x, 1, -1)
1425
+
1426
+ return x, fmap
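For reference, DiscriminatorP folds the 1-D waveform into a 2-D view before its convolutions; a minimal sketch of that reshape (the period 3 and the 100-sample length are arbitrary):

import torch
import torch.nn.functional as F

x = torch.randn(1, 1, 100)                 # [batch, channels, samples]
period = 3
n_pad = period - (x.shape[-1] % period)    # pad so the length divides evenly
x = F.pad(x, (0, n_pad), "reflect")
x = x.view(1, 1, x.shape[-1] // period, period)
print(x.shape)                             # torch.Size([1, 1, 34, 3])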
rvc/lib/infer_pack/models_onnx.py ADDED
@@ -0,0 +1,821 @@
1
+ import logging
2
+ import math
3
+
4
+ logger = logging.getLogger(__name__)
5
+
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
10
+ from torch.nn import functional as F
11
+ from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
12
+
13
+ from rvc.lib.infer_pack import attentions, commons, modules
14
+ from rvc.lib.infer_pack.commons import get_padding, init_weights
15
+
16
+
17
+ class TextEncoder256(nn.Module):
18
+ def __init__(
19
+ self,
20
+ out_channels,
21
+ hidden_channels,
22
+ filter_channels,
23
+ n_heads,
24
+ n_layers,
25
+ kernel_size,
26
+ p_dropout,
27
+ f0=True,
28
+ ):
29
+ super().__init__()
30
+ self.out_channels = out_channels
31
+ self.hidden_channels = hidden_channels
32
+ self.filter_channels = filter_channels
33
+ self.n_heads = n_heads
34
+ self.n_layers = n_layers
35
+ self.kernel_size = kernel_size
36
+ self.p_dropout = p_dropout
37
+ self.emb_phone = nn.Linear(256, hidden_channels)
38
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
39
+ if f0 == True:
40
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
41
+ self.encoder = attentions.Encoder(
42
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
43
+ )
44
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
45
+
46
+ def forward(self, phone, pitch, lengths):
47
+ if pitch is None:
48
+ x = self.emb_phone(phone)
49
+ else:
50
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
51
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
52
+ x = self.lrelu(x)
53
+ x = torch.transpose(x, 1, -1) # [b, h, t]
54
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
55
+ x.dtype
56
+ )
57
+ x = self.encoder(x * x_mask, x_mask)
58
+ stats = self.proj(x) * x_mask
59
+
60
+ m, logs = torch.split(stats, self.out_channels, dim=1)
61
+ return m, logs, x_mask
62
+
63
+
64
+ class TextEncoder768(nn.Module):
65
+ def __init__(
66
+ self,
67
+ out_channels,
68
+ hidden_channels,
69
+ filter_channels,
70
+ n_heads,
71
+ n_layers,
72
+ kernel_size,
73
+ p_dropout,
74
+ f0=True,
75
+ ):
76
+ super().__init__()
77
+ self.out_channels = out_channels
78
+ self.hidden_channels = hidden_channels
79
+ self.filter_channels = filter_channels
80
+ self.n_heads = n_heads
81
+ self.n_layers = n_layers
82
+ self.kernel_size = kernel_size
83
+ self.p_dropout = p_dropout
84
+ self.emb_phone = nn.Linear(768, hidden_channels)
85
+ self.lrelu = nn.LeakyReLU(0.1, inplace=True)
86
+ if f0 == True:
87
+ self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256
88
+ self.encoder = attentions.Encoder(
89
+ hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
90
+ )
91
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
92
+
93
+ def forward(self, phone, pitch, lengths):
94
+ if pitch is None:
95
+ x = self.emb_phone(phone)
96
+ else:
97
+ x = self.emb_phone(phone) + self.emb_pitch(pitch)
98
+ x = x * math.sqrt(self.hidden_channels) # [b, t, h]
99
+ x = self.lrelu(x)
100
+ x = torch.transpose(x, 1, -1) # [b, h, t]
101
+ x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
102
+ x.dtype
103
+ )
104
+ x = self.encoder(x * x_mask, x_mask)
105
+ stats = self.proj(x) * x_mask
106
+
107
+ m, logs = torch.split(stats, self.out_channels, dim=1)
108
+ return m, logs, x_mask
109
+
110
+
111
+ class ResidualCouplingBlock(nn.Module):
112
+ def __init__(
113
+ self,
114
+ channels,
115
+ hidden_channels,
116
+ kernel_size,
117
+ dilation_rate,
118
+ n_layers,
119
+ n_flows=4,
120
+ gin_channels=0,
121
+ ):
122
+ super().__init__()
123
+ self.channels = channels
124
+ self.hidden_channels = hidden_channels
125
+ self.kernel_size = kernel_size
126
+ self.dilation_rate = dilation_rate
127
+ self.n_layers = n_layers
128
+ self.n_flows = n_flows
129
+ self.gin_channels = gin_channels
130
+
131
+ self.flows = nn.ModuleList()
132
+ for i in range(n_flows):
133
+ self.flows.append(
134
+ modules.ResidualCouplingLayer(
135
+ channels,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=gin_channels,
141
+ mean_only=True,
142
+ )
143
+ )
144
+ self.flows.append(modules.Flip())
145
+
146
+ def forward(self, x, x_mask, g=None, reverse=False):
147
+ if not reverse:
148
+ for flow in self.flows:
149
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
150
+ else:
151
+ for flow in reversed(self.flows):
152
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
153
+ return x
154
+
155
+ def remove_weight_norm(self):
156
+ for i in range(self.n_flows):
157
+ self.flows[i * 2].remove_weight_norm()
158
+
159
+
160
+ class PosteriorEncoder(nn.Module):
161
+ def __init__(
162
+ self,
163
+ in_channels,
164
+ out_channels,
165
+ hidden_channels,
166
+ kernel_size,
167
+ dilation_rate,
168
+ n_layers,
169
+ gin_channels=0,
170
+ ):
171
+ super().__init__()
172
+ self.in_channels = in_channels
173
+ self.out_channels = out_channels
174
+ self.hidden_channels = hidden_channels
175
+ self.kernel_size = kernel_size
176
+ self.dilation_rate = dilation_rate
177
+ self.n_layers = n_layers
178
+ self.gin_channels = gin_channels
179
+
180
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
181
+ self.enc = modules.WN(
182
+ hidden_channels,
183
+ kernel_size,
184
+ dilation_rate,
185
+ n_layers,
186
+ gin_channels=gin_channels,
187
+ )
188
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
189
+
190
+ def forward(self, x, x_lengths, g=None):
191
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
192
+ x.dtype
193
+ )
194
+ x = self.pre(x) * x_mask
195
+ x = self.enc(x, x_mask, g=g)
196
+ stats = self.proj(x) * x_mask
197
+ m, logs = torch.split(stats, self.out_channels, dim=1)
198
+ z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
199
+ return z, m, logs, x_mask
200
+
201
+ def remove_weight_norm(self):
202
+ self.enc.remove_weight_norm()
203
+
204
+
205
+ class Generator(torch.nn.Module):
206
+ def __init__(
207
+ self,
208
+ initial_channel,
209
+ resblock,
210
+ resblock_kernel_sizes,
211
+ resblock_dilation_sizes,
212
+ upsample_rates,
213
+ upsample_initial_channel,
214
+ upsample_kernel_sizes,
215
+ gin_channels=0,
216
+ ):
217
+ super(Generator, self).__init__()
218
+ self.num_kernels = len(resblock_kernel_sizes)
219
+ self.num_upsamples = len(upsample_rates)
220
+ self.conv_pre = Conv1d(
221
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
222
+ )
223
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
224
+
225
+ self.ups = nn.ModuleList()
226
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
227
+ self.ups.append(
228
+ weight_norm(
229
+ ConvTranspose1d(
230
+ upsample_initial_channel // (2**i),
231
+ upsample_initial_channel // (2 ** (i + 1)),
232
+ k,
233
+ u,
234
+ padding=(k - u) // 2,
235
+ )
236
+ )
237
+ )
238
+
239
+ self.resblocks = nn.ModuleList()
240
+ for i in range(len(self.ups)):
241
+ ch = upsample_initial_channel // (2 ** (i + 1))
242
+ for j, (k, d) in enumerate(
243
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
244
+ ):
245
+ self.resblocks.append(resblock(ch, k, d))
246
+
247
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
248
+ self.ups.apply(init_weights)
249
+
250
+ if gin_channels != 0:
251
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
252
+
253
+ def forward(self, x, g=None):
254
+ x = self.conv_pre(x)
255
+ if g is not None:
256
+ x = x + self.cond(g)
257
+
258
+ for i in range(self.num_upsamples):
259
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
260
+ x = self.ups[i](x)
261
+ xs = None
262
+ for j in range(self.num_kernels):
263
+ if xs is None:
264
+ xs = self.resblocks[i * self.num_kernels + j](x)
265
+ else:
266
+ xs += self.resblocks[i * self.num_kernels + j](x)
267
+ x = xs / self.num_kernels
268
+ x = F.leaky_relu(x)
269
+ x = self.conv_post(x)
270
+ x = torch.tanh(x)
271
+
272
+ return x
273
+
274
+ def remove_weight_norm(self):
275
+ for l in self.ups:
276
+ remove_weight_norm(l)
277
+ for l in self.resblocks:
278
+ l.remove_weight_norm()
279
+
280
+
281
+ class SineGen(torch.nn.Module):
282
+ """Definition of sine generator
283
+ SineGen(samp_rate, harmonic_num = 0,
284
+ sine_amp = 0.1, noise_std = 0.003,
285
+ voiced_threshold = 0,
286
+ flag_for_pulse=False)
287
+ samp_rate: sampling rate in Hz
288
+ harmonic_num: number of harmonic overtones (default 0)
289
+ sine_amp: amplitude of sine waveform (default 0.1)
290
+ noise_std: std of Gaussian noise (default 0.003)
291
+ voiced_threshold: F0 threshold for U/V classification (default 0)
292
+ flag_for_pulse: whether this SineGen is used inside PulseGen (default False)
293
+ Note: when flag_for_pulse is True, the first time step of a voiced
294
+ segment is always sin(np.pi) or cos(0)
295
+ """
296
+
297
+ def __init__(
298
+ self,
299
+ samp_rate,
300
+ harmonic_num=0,
301
+ sine_amp=0.1,
302
+ noise_std=0.003,
303
+ voiced_threshold=0,
304
+ flag_for_pulse=False,
305
+ ):
306
+ super(SineGen, self).__init__()
307
+ self.sine_amp = sine_amp
308
+ self.noise_std = noise_std
309
+ self.harmonic_num = harmonic_num
310
+ self.dim = self.harmonic_num + 1
311
+ self.sampling_rate = samp_rate
312
+ self.voiced_threshold = voiced_threshold
313
+
314
+ def _f02uv(self, f0):
315
+ # generate uv signal
316
+ uv = torch.ones_like(f0)
317
+ uv = uv * (f0 > self.voiced_threshold)
318
+ return uv
319
+
320
+ def forward(self, f0, upp):
321
+ """sine_tensor, uv = forward(f0)
322
+ input F0: tensor(batchsize=1, length, dim=1)
323
+ f0 for unvoiced steps should be 0
324
+ output sine_tensor: tensor(batchsize=1, length, dim)
325
+ output uv: tensor(batchsize=1, length, 1)
326
+ """
327
+ with torch.no_grad():
328
+ f0 = f0[:, None].transpose(1, 2)
329
+ f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
330
+ # fundamental component
331
+ f0_buf[:, :, 0] = f0[:, :, 0]
332
+ for idx in np.arange(self.harmonic_num):
333
+ f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
334
+ idx + 2
335
+ ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
336
+ rad_values = (f0_buf / self.sampling_rate) % 1 # the % 1 means the per-harmonic products cannot be optimized in post-processing
337
+ rand_ini = torch.rand(
338
+ f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
339
+ )
340
+ rand_ini[:, 0] = 0
341
+ rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
342
+ tmp_over_one = torch.cumsum(rad_values, 1) # % 1 # taking % 1 here would mean the cumsum below can no longer be optimized
343
+ tmp_over_one *= upp
344
+ tmp_over_one = F.interpolate(
345
+ tmp_over_one.transpose(2, 1),
346
+ scale_factor=upp,
347
+ mode="linear",
348
+ align_corners=True,
349
+ ).transpose(2, 1)
350
+ rad_values = F.interpolate(
351
+ rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
352
+ ).transpose(
353
+ 2, 1
354
+ )
355
+ tmp_over_one %= 1
356
+ tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
357
+ cumsum_shift = torch.zeros_like(rad_values)
358
+ cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
359
+ sine_waves = torch.sin(
360
+ torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
361
+ )
362
+ sine_waves = sine_waves * self.sine_amp
363
+ uv = self._f02uv(f0)
364
+ uv = F.interpolate(
365
+ uv.transpose(2, 1), scale_factor=upp, mode="nearest"
366
+ ).transpose(2, 1)
367
+ noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
368
+ noise = noise_amp * torch.randn_like(sine_waves)
369
+ sine_waves = sine_waves * uv + noise
370
+ return sine_waves, uv, noise
371
+
372
+
373
+ class SourceModuleHnNSF(torch.nn.Module):
374
+ """SourceModule for hn-nsf
375
+ SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
376
+ add_noise_std=0.003, voiced_threshod=0)
377
+ sampling_rate: sampling_rate in Hz
378
+ harmonic_num: number of harmonics above F0 (default: 0)
379
+ sine_amp: amplitude of sine source signal (default: 0.1)
380
+ add_noise_std: std of additive Gaussian noise (default: 0.003)
381
+ note that amplitude of noise in unvoiced is decided
382
+ by sine_amp
383
+ voiced_threshold: threshold to set U/V given F0 (default: 0)
384
+ Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
385
+ F0_sampled (batchsize, length, 1)
386
+ Sine_source (batchsize, length, 1)
387
+ noise_source (batchsize, length, 1)
388
+ uv (batchsize, length, 1)
389
+ """
390
+
391
+ def __init__(
392
+ self,
393
+ sampling_rate,
394
+ harmonic_num=0,
395
+ sine_amp=0.1,
396
+ add_noise_std=0.003,
397
+ voiced_threshod=0,
398
+ is_half=True,
399
+ ):
400
+ super(SourceModuleHnNSF, self).__init__()
401
+
402
+ self.sine_amp = sine_amp
403
+ self.noise_std = add_noise_std
404
+ self.is_half = is_half
405
+ # to produce sine waveforms
406
+ self.l_sin_gen = SineGen(
407
+ sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
408
+ )
409
+
410
+ # to merge source harmonics into a single excitation
411
+ self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
412
+ self.l_tanh = torch.nn.Tanh()
413
+
414
+ def forward(self, x, upp=None):
415
+ sine_wavs, uv, _ = self.l_sin_gen(x, upp)
416
+ if self.is_half:
417
+ sine_wavs = sine_wavs.half()
418
+ sine_merge = self.l_tanh(self.l_linear(sine_wavs))
419
+ return sine_merge, None, None # noise, uv
420
+
421
+
422
+ class GeneratorNSF(torch.nn.Module):
423
+ def __init__(
424
+ self,
425
+ initial_channel,
426
+ resblock,
427
+ resblock_kernel_sizes,
428
+ resblock_dilation_sizes,
429
+ upsample_rates,
430
+ upsample_initial_channel,
431
+ upsample_kernel_sizes,
432
+ gin_channels,
433
+ sr,
434
+ is_half=False,
435
+ ):
436
+ super(GeneratorNSF, self).__init__()
437
+ self.num_kernels = len(resblock_kernel_sizes)
438
+ self.num_upsamples = len(upsample_rates)
439
+
440
+ self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
441
+ self.m_source = SourceModuleHnNSF(
442
+ sampling_rate=sr, harmonic_num=0, is_half=is_half
443
+ )
444
+ self.noise_convs = nn.ModuleList()
445
+ self.conv_pre = Conv1d(
446
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
447
+ )
448
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
449
+
450
+ self.ups = nn.ModuleList()
451
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
452
+ c_cur = upsample_initial_channel // (2 ** (i + 1))
453
+ self.ups.append(
454
+ weight_norm(
455
+ ConvTranspose1d(
456
+ upsample_initial_channel // (2**i),
457
+ upsample_initial_channel // (2 ** (i + 1)),
458
+ k,
459
+ u,
460
+ padding=(k - u) // 2,
461
+ )
462
+ )
463
+ )
464
+ if i + 1 < len(upsample_rates):
465
+ stride_f0 = np.prod(upsample_rates[i + 1 :])
466
+ self.noise_convs.append(
467
+ Conv1d(
468
+ 1,
469
+ c_cur,
470
+ kernel_size=stride_f0 * 2,
471
+ stride=stride_f0,
472
+ padding=stride_f0 // 2,
473
+ )
474
+ )
475
+ else:
476
+ self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
477
+
478
+ self.resblocks = nn.ModuleList()
479
+ for i in range(len(self.ups)):
480
+ ch = upsample_initial_channel // (2 ** (i + 1))
481
+ for j, (k, d) in enumerate(
482
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
483
+ ):
484
+ self.resblocks.append(resblock(ch, k, d))
485
+
486
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
487
+ self.ups.apply(init_weights)
488
+
489
+ if gin_channels != 0:
490
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
491
+
492
+ self.upp = np.prod(upsample_rates)
493
+
494
+ def forward(self, x, f0, g=None):
495
+ har_source, noi_source, uv = self.m_source(f0, self.upp)
496
+ har_source = har_source.transpose(1, 2)
497
+ x = self.conv_pre(x)
498
+ if g is not None:
499
+ x = x + self.cond(g)
500
+
501
+ for i in range(self.num_upsamples):
502
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
503
+ x = self.ups[i](x)
504
+ x_source = self.noise_convs[i](har_source)
505
+ x = x + x_source
506
+ xs = None
507
+ for j in range(self.num_kernels):
508
+ if xs is None:
509
+ xs = self.resblocks[i * self.num_kernels + j](x)
510
+ else:
511
+ xs += self.resblocks[i * self.num_kernels + j](x)
512
+ x = xs / self.num_kernels
513
+ x = F.leaky_relu(x)
514
+ x = self.conv_post(x)
515
+ x = torch.tanh(x)
516
+ return x
517
+
518
+ def remove_weight_norm(self):
519
+ for l in self.ups:
520
+ remove_weight_norm(l)
521
+ for l in self.resblocks:
522
+ l.remove_weight_norm()
523
+
524
+
525
+ sr2sr = {
526
+ "32k": 32000,
527
+ "40k": 40000,
528
+ "48k": 48000,
529
+ }
530
+
531
+
532
+ class SynthesizerTrnMsNSFsidM(nn.Module):
533
+ def __init__(
534
+ self,
535
+ spec_channels,
536
+ segment_size,
537
+ inter_channels,
538
+ hidden_channels,
539
+ filter_channels,
540
+ n_heads,
541
+ n_layers,
542
+ kernel_size,
543
+ p_dropout,
544
+ resblock,
545
+ resblock_kernel_sizes,
546
+ resblock_dilation_sizes,
547
+ upsample_rates,
548
+ upsample_initial_channel,
549
+ upsample_kernel_sizes,
550
+ spk_embed_dim,
551
+ gin_channels,
552
+ sr,
553
+ version,
554
+ **kwargs,
555
+ ):
556
+ super().__init__()
557
+ if isinstance(sr, str):
558
+ sr = sr2sr[sr]
559
+ self.spec_channels = spec_channels
560
+ self.inter_channels = inter_channels
561
+ self.hidden_channels = hidden_channels
562
+ self.filter_channels = filter_channels
563
+ self.n_heads = n_heads
564
+ self.n_layers = n_layers
565
+ self.kernel_size = kernel_size
566
+ self.p_dropout = p_dropout
567
+ self.resblock = resblock
568
+ self.resblock_kernel_sizes = resblock_kernel_sizes
569
+ self.resblock_dilation_sizes = resblock_dilation_sizes
570
+ self.upsample_rates = upsample_rates
571
+ self.upsample_initial_channel = upsample_initial_channel
572
+ self.upsample_kernel_sizes = upsample_kernel_sizes
573
+ self.segment_size = segment_size
574
+ self.gin_channels = gin_channels
575
+ # self.hop_length = hop_length#
576
+ self.spk_embed_dim = spk_embed_dim
577
+ if version == "v1":
578
+ self.enc_p = TextEncoder256(
579
+ inter_channels,
580
+ hidden_channels,
581
+ filter_channels,
582
+ n_heads,
583
+ n_layers,
584
+ kernel_size,
585
+ p_dropout,
586
+ )
587
+ else:
588
+ self.enc_p = TextEncoder768(
589
+ inter_channels,
590
+ hidden_channels,
591
+ filter_channels,
592
+ n_heads,
593
+ n_layers,
594
+ kernel_size,
595
+ p_dropout,
596
+ )
597
+ self.dec = GeneratorNSF(
598
+ inter_channels,
599
+ resblock,
600
+ resblock_kernel_sizes,
601
+ resblock_dilation_sizes,
602
+ upsample_rates,
603
+ upsample_initial_channel,
604
+ upsample_kernel_sizes,
605
+ gin_channels=gin_channels,
606
+ sr=sr,
607
+ is_half=kwargs["is_half"],
608
+ )
609
+ self.enc_q = PosteriorEncoder(
610
+ spec_channels,
611
+ inter_channels,
612
+ hidden_channels,
613
+ 5,
614
+ 1,
615
+ 16,
616
+ gin_channels=gin_channels,
617
+ )
618
+ self.flow = ResidualCouplingBlock(
619
+ inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
620
+ )
621
+ self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
622
+ self.speaker_map = None
623
+ logger.debug(
624
+ f"gin_channels: {gin_channels}, self.spk_embed_dim: {self.spk_embed_dim}"
625
+ )
626
+
627
+ def remove_weight_norm(self):
628
+ self.dec.remove_weight_norm()
629
+ self.flow.remove_weight_norm()
630
+ self.enc_q.remove_weight_norm()
631
+
632
+ def construct_spkmixmap(self, n_speaker):
633
+ self.speaker_map = torch.zeros((n_speaker, 1, 1, self.gin_channels))
634
+ for i in range(n_speaker):
635
+ self.speaker_map[i] = self.emb_g(torch.LongTensor([[i]]))
636
+ self.speaker_map = self.speaker_map.unsqueeze(0)
637
+
638
+ def forward(self, phone, phone_lengths, pitch, nsff0, g, rnd, max_len=None):
639
+ if self.speaker_map is not None: # [N, S] * [S, B, 1, H]
640
+ g = g.reshape((g.shape[0], g.shape[1], 1, 1, 1)) # [N, S, B, 1, 1]
641
+ g = g * self.speaker_map # [N, S, B, 1, H]
642
+ g = torch.sum(g, dim=1) # [N, 1, B, 1, H]
643
+ g = g.transpose(0, -1).transpose(0, -2).squeeze(0) # [B, H, N]
644
+ else:
645
+ g = g.unsqueeze(0)
646
+ g = self.emb_g(g).transpose(1, 2)
647
+
648
+ m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
649
+ z_p = (m_p + torch.exp(logs_p) * rnd) * x_mask
650
+ z = self.flow(z_p, x_mask, g=g, reverse=True)
651
+ o = self.dec((z * x_mask)[:, :, :max_len], nsff0, g=g)
652
+ return o
653
+
654
+
655
+ class MultiPeriodDiscriminator(torch.nn.Module):
656
+ def __init__(self, use_spectral_norm=False):
657
+ super(MultiPeriodDiscriminator, self).__init__()
658
+ periods = [2, 3, 5, 7, 11, 17]
659
+ # periods = [3, 5, 7, 11, 17, 23, 37]
660
+
661
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
662
+ discs = discs + [
663
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
664
+ ]
665
+ self.discriminators = nn.ModuleList(discs)
666
+
667
+ def forward(self, y, y_hat):
668
+ y_d_rs = [] #
669
+ y_d_gs = []
670
+ fmap_rs = []
671
+ fmap_gs = []
672
+ for i, d in enumerate(self.discriminators):
673
+ y_d_r, fmap_r = d(y)
674
+ y_d_g, fmap_g = d(y_hat)
675
+ # for j in range(len(fmap_r)):
676
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
677
+ y_d_rs.append(y_d_r)
678
+ y_d_gs.append(y_d_g)
679
+ fmap_rs.append(fmap_r)
680
+ fmap_gs.append(fmap_g)
681
+
682
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
683
+
684
+
685
+ class MultiPeriodDiscriminatorV2(torch.nn.Module):
686
+ def __init__(self, use_spectral_norm=False):
687
+ super(MultiPeriodDiscriminatorV2, self).__init__()
688
+ # periods = [2, 3, 5, 7, 11, 17]
689
+ periods = [2, 3, 5, 7, 11, 17, 23, 37]
690
+
691
+ discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
692
+ discs = discs + [
693
+ DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
694
+ ]
695
+ self.discriminators = nn.ModuleList(discs)
696
+
697
+ def forward(self, y, y_hat):
698
+ y_d_rs = [] #
699
+ y_d_gs = []
700
+ fmap_rs = []
701
+ fmap_gs = []
702
+ for i, d in enumerate(self.discriminators):
703
+ y_d_r, fmap_r = d(y)
704
+ y_d_g, fmap_g = d(y_hat)
705
+ # for j in range(len(fmap_r)):
706
+ # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape)
707
+ y_d_rs.append(y_d_r)
708
+ y_d_gs.append(y_d_g)
709
+ fmap_rs.append(fmap_r)
710
+ fmap_gs.append(fmap_g)
711
+
712
+ return y_d_rs, y_d_gs, fmap_rs, fmap_gs
713
+
714
+
715
+ class DiscriminatorS(torch.nn.Module):
716
+ def __init__(self, use_spectral_norm=False):
717
+ super(DiscriminatorS, self).__init__()
718
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
719
+ self.convs = nn.ModuleList(
720
+ [
721
+ norm_f(Conv1d(1, 16, 15, 1, padding=7)),
722
+ norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
723
+ norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
724
+ norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
725
+ norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
726
+ norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
727
+ ]
728
+ )
729
+ self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
730
+
731
+ def forward(self, x):
732
+ fmap = []
733
+
734
+ for l in self.convs:
735
+ x = l(x)
736
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
737
+ fmap.append(x)
738
+ x = self.conv_post(x)
739
+ fmap.append(x)
740
+ x = torch.flatten(x, 1, -1)
741
+
742
+ return x, fmap
743
+
744
+
745
+ class DiscriminatorP(torch.nn.Module):
746
+ def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
747
+ super(DiscriminatorP, self).__init__()
748
+ self.period = period
749
+ self.use_spectral_norm = use_spectral_norm
750
+ norm_f = spectral_norm if use_spectral_norm else weight_norm
751
+ self.convs = nn.ModuleList(
752
+ [
753
+ norm_f(
754
+ Conv2d(
755
+ 1,
756
+ 32,
757
+ (kernel_size, 1),
758
+ (stride, 1),
759
+ padding=(get_padding(kernel_size, 1), 0),
760
+ )
761
+ ),
762
+ norm_f(
763
+ Conv2d(
764
+ 32,
765
+ 128,
766
+ (kernel_size, 1),
767
+ (stride, 1),
768
+ padding=(get_padding(kernel_size, 1), 0),
769
+ )
770
+ ),
771
+ norm_f(
772
+ Conv2d(
773
+ 128,
774
+ 512,
775
+ (kernel_size, 1),
776
+ (stride, 1),
777
+ padding=(get_padding(kernel_size, 1), 0),
778
+ )
779
+ ),
780
+ norm_f(
781
+ Conv2d(
782
+ 512,
783
+ 1024,
784
+ (kernel_size, 1),
785
+ (stride, 1),
786
+ padding=(get_padding(kernel_size, 1), 0),
787
+ )
788
+ ),
789
+ norm_f(
790
+ Conv2d(
791
+ 1024,
792
+ 1024,
793
+ (kernel_size, 1),
794
+ 1,
795
+ padding=(get_padding(kernel_size, 1), 0),
796
+ )
797
+ ),
798
+ ]
799
+ )
800
+ self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
801
+
802
+ def forward(self, x):
803
+ fmap = []
804
+
805
+ # 1d to 2d
806
+ b, c, t = x.shape
807
+ if t % self.period != 0: # pad first
808
+ n_pad = self.period - (t % self.period)
809
+ x = F.pad(x, (0, n_pad), "reflect")
810
+ t = t + n_pad
811
+ x = x.view(b, c, t // self.period, self.period)
812
+
813
+ for l in self.convs:
814
+ x = l(x)
815
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
816
+ fmap.append(x)
817
+ x = self.conv_post(x)
818
+ fmap.append(x)
819
+ x = torch.flatten(x, 1, -1)
820
+
821
+ return x, fmap
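Note on SineGen above: it maps an F0 contour to per-sample phase increments (f0 / sampling_rate), accumulates them with a cumulative sum, and takes the sine of the running phase, adding noise on unvoiced frames. A minimal NumPy sketch of that core idea, with an assumed sample rate and a constant illustrative pitch (not the module itself):

import numpy as np

sr = 16000                              # assumed sample rate
f0 = np.full(sr, 220.0)                 # one second of a constant 220 Hz contour (illustrative)
rad_per_sample = f0 / sr                # normalized phase increment per sample, as in SineGen
phase = np.cumsum(rad_per_sample) % 1.0
sine = 0.1 * np.sin(2 * np.pi * phase)  # sine_amp = 0.1, SineGen's default amplitude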
rvc/lib/infer_pack/modules.py ADDED
@@ -0,0 +1,615 @@
1
+ import copy
2
+ import math
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ import scipy
7
+ import torch
8
+ from torch import nn
9
+ from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d
10
+ from torch.nn import functional as F
11
+ from torch.nn.utils import remove_weight_norm, weight_norm
12
+
13
+ from rvc.lib.infer_pack import commons
14
+ from rvc.lib.infer_pack.commons import get_padding, init_weights
15
+ from rvc.lib.infer_pack.transforms import piecewise_rational_quadratic_transform
16
+
17
+ LRELU_SLOPE = 0.1
18
+
19
+
20
+ class LayerNorm(nn.Module):
21
+ def __init__(self, channels, eps=1e-5):
22
+ super(LayerNorm, self).__init__()
23
+ self.channels = channels
24
+ self.eps = eps
25
+
26
+ self.gamma = nn.Parameter(torch.ones(channels))
27
+ self.beta = nn.Parameter(torch.zeros(channels))
28
+
29
+ def forward(self, x):
30
+ x = x.transpose(1, -1)
31
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
32
+ return x.transpose(1, -1)
33
+
34
+
35
+ class ConvReluNorm(nn.Module):
36
+ def __init__(
37
+ self,
38
+ in_channels,
39
+ hidden_channels,
40
+ out_channels,
41
+ kernel_size,
42
+ n_layers,
43
+ p_dropout,
44
+ ):
45
+ super(ConvReluNorm, self).__init__()
46
+ self.in_channels = in_channels
47
+ self.hidden_channels = hidden_channels
48
+ self.out_channels = out_channels
49
+ self.kernel_size = kernel_size
50
+ self.n_layers = n_layers
51
+ self.p_dropout = float(p_dropout)
52
+ assert n_layers > 1, "Number of layers should be larger than 1."
53
+
54
+ self.conv_layers = nn.ModuleList()
55
+ self.norm_layers = nn.ModuleList()
56
+ self.conv_layers.append(
57
+ nn.Conv1d(
58
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
59
+ )
60
+ )
61
+ self.norm_layers.append(LayerNorm(hidden_channels))
62
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(float(p_dropout)))
63
+ for _ in range(n_layers - 1):
64
+ self.conv_layers.append(
65
+ nn.Conv1d(
66
+ hidden_channels,
67
+ hidden_channels,
68
+ kernel_size,
69
+ padding=kernel_size // 2,
70
+ )
71
+ )
72
+ self.norm_layers.append(LayerNorm(hidden_channels))
73
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
74
+ self.proj.weight.data.zero_()
75
+ self.proj.bias.data.zero_()
76
+
77
+ def forward(self, x, x_mask):
78
+ x_org = x
79
+ for i in range(self.n_layers):
80
+ x = self.conv_layers[i](x * x_mask)
81
+ x = self.norm_layers[i](x)
82
+ x = self.relu_drop(x)
83
+ x = x_org + self.proj(x)
84
+ return x * x_mask
85
+
86
+
87
+ class DDSConv(nn.Module):
88
+ """
89
+ Dilated and Depth-Separable Convolution
90
+ """
91
+
92
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
93
+ super(DDSConv, self).__init__()
94
+ self.channels = channels
95
+ self.kernel_size = kernel_size
96
+ self.n_layers = n_layers
97
+ self.p_dropout = float(p_dropout)
98
+
99
+ self.drop = nn.Dropout(float(p_dropout))
100
+ self.convs_sep = nn.ModuleList()
101
+ self.convs_1x1 = nn.ModuleList()
102
+ self.norms_1 = nn.ModuleList()
103
+ self.norms_2 = nn.ModuleList()
104
+ for i in range(n_layers):
105
+ dilation = kernel_size**i
106
+ padding = (kernel_size * dilation - dilation) // 2
107
+ self.convs_sep.append(
108
+ nn.Conv1d(
109
+ channels,
110
+ channels,
111
+ kernel_size,
112
+ groups=channels,
113
+ dilation=dilation,
114
+ padding=padding,
115
+ )
116
+ )
117
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
118
+ self.norms_1.append(LayerNorm(channels))
119
+ self.norms_2.append(LayerNorm(channels))
120
+
121
+ def forward(self, x, x_mask, g: Optional[torch.Tensor] = None):
122
+ if g is not None:
123
+ x = x + g
124
+ for i in range(self.n_layers):
125
+ y = self.convs_sep[i](x * x_mask)
126
+ y = self.norms_1[i](y)
127
+ y = F.gelu(y)
128
+ y = self.convs_1x1[i](y)
129
+ y = self.norms_2[i](y)
130
+ y = F.gelu(y)
131
+ y = self.drop(y)
132
+ x = x + y
133
+ return x * x_mask
134
+
135
+
136
+ class WN(torch.nn.Module):
137
+ def __init__(
138
+ self,
139
+ hidden_channels,
140
+ kernel_size,
141
+ dilation_rate,
142
+ n_layers,
143
+ gin_channels=0,
144
+ p_dropout=0,
145
+ ):
146
+ super(WN, self).__init__()
147
+ assert kernel_size % 2 == 1
148
+ self.hidden_channels = hidden_channels
149
+ self.kernel_size = (kernel_size,)
150
+ self.dilation_rate = dilation_rate
151
+ self.n_layers = n_layers
152
+ self.gin_channels = gin_channels
153
+ self.p_dropout = float(p_dropout)
154
+
155
+ self.in_layers = torch.nn.ModuleList()
156
+ self.res_skip_layers = torch.nn.ModuleList()
157
+ self.drop = nn.Dropout(float(p_dropout))
158
+
159
+ if gin_channels != 0:
160
+ cond_layer = torch.nn.Conv1d(
161
+ gin_channels, 2 * hidden_channels * n_layers, 1
162
+ )
163
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
164
+
165
+ for i in range(n_layers):
166
+ dilation = dilation_rate**i
167
+ padding = int((kernel_size * dilation - dilation) / 2)
168
+ in_layer = torch.nn.Conv1d(
169
+ hidden_channels,
170
+ 2 * hidden_channels,
171
+ kernel_size,
172
+ dilation=dilation,
173
+ padding=padding,
174
+ )
175
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
176
+ self.in_layers.append(in_layer)
177
+
178
+ # last one is not necessary
179
+ if i < n_layers - 1:
180
+ res_skip_channels = 2 * hidden_channels
181
+ else:
182
+ res_skip_channels = hidden_channels
183
+
184
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
185
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
186
+ self.res_skip_layers.append(res_skip_layer)
187
+
188
+ def forward(
189
+ self, x: torch.Tensor, x_mask: torch.Tensor, g: Optional[torch.Tensor] = None
190
+ ):
191
+ output = torch.zeros_like(x)
192
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
193
+
194
+ if g is not None:
195
+ g = self.cond_layer(g)
196
+
197
+ for i, (in_layer, res_skip_layer) in enumerate(
198
+ zip(self.in_layers, self.res_skip_layers)
199
+ ):
200
+ x_in = in_layer(x)
201
+ if g is not None:
202
+ cond_offset = i * 2 * self.hidden_channels
203
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
204
+ else:
205
+ g_l = torch.zeros_like(x_in)
206
+
207
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
208
+ acts = self.drop(acts)
209
+
210
+ res_skip_acts = res_skip_layer(acts)
211
+ if i < self.n_layers - 1:
212
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
213
+ x = (x + res_acts) * x_mask
214
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
215
+ else:
216
+ output = output + res_skip_acts
217
+ return output * x_mask
218
+
219
+ def remove_weight_norm(self):
220
+ if self.gin_channels != 0:
221
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
222
+ for l in self.in_layers:
223
+ torch.nn.utils.remove_weight_norm(l)
224
+ for l in self.res_skip_layers:
225
+ torch.nn.utils.remove_weight_norm(l)
226
+
227
+ def __prepare_scriptable__(self):
228
+ if self.gin_channels != 0:
229
+ for hook in self.cond_layer._forward_pre_hooks.values():
230
+ if (
231
+ hook.__module__ == "torch.nn.utils.weight_norm"
232
+ and hook.__class__.__name__ == "WeightNorm"
233
+ ):
234
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
235
+ for l in self.in_layers:
236
+ for hook in l._forward_pre_hooks.values():
237
+ if (
238
+ hook.__module__ == "torch.nn.utils.weight_norm"
239
+ and hook.__class__.__name__ == "WeightNorm"
240
+ ):
241
+ torch.nn.utils.remove_weight_norm(l)
242
+ for l in self.res_skip_layers:
243
+ for hook in l._forward_pre_hooks.values():
244
+ if (
245
+ hook.__module__ == "torch.nn.utils.weight_norm"
246
+ and hook.__class__.__name__ == "WeightNorm"
247
+ ):
248
+ torch.nn.utils.remove_weight_norm(l)
249
+ return self
250
+
251
+
252
+ class ResBlock1(torch.nn.Module):
253
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
254
+ super(ResBlock1, self).__init__()
255
+ self.convs1 = nn.ModuleList(
256
+ [
257
+ weight_norm(
258
+ Conv1d(
259
+ channels,
260
+ channels,
261
+ kernel_size,
262
+ 1,
263
+ dilation=dilation[0],
264
+ padding=get_padding(kernel_size, dilation[0]),
265
+ )
266
+ ),
267
+ weight_norm(
268
+ Conv1d(
269
+ channels,
270
+ channels,
271
+ kernel_size,
272
+ 1,
273
+ dilation=dilation[1],
274
+ padding=get_padding(kernel_size, dilation[1]),
275
+ )
276
+ ),
277
+ weight_norm(
278
+ Conv1d(
279
+ channels,
280
+ channels,
281
+ kernel_size,
282
+ 1,
283
+ dilation=dilation[2],
284
+ padding=get_padding(kernel_size, dilation[2]),
285
+ )
286
+ ),
287
+ ]
288
+ )
289
+ self.convs1.apply(init_weights)
290
+
291
+ self.convs2 = nn.ModuleList(
292
+ [
293
+ weight_norm(
294
+ Conv1d(
295
+ channels,
296
+ channels,
297
+ kernel_size,
298
+ 1,
299
+ dilation=1,
300
+ padding=get_padding(kernel_size, 1),
301
+ )
302
+ ),
303
+ weight_norm(
304
+ Conv1d(
305
+ channels,
306
+ channels,
307
+ kernel_size,
308
+ 1,
309
+ dilation=1,
310
+ padding=get_padding(kernel_size, 1),
311
+ )
312
+ ),
313
+ weight_norm(
314
+ Conv1d(
315
+ channels,
316
+ channels,
317
+ kernel_size,
318
+ 1,
319
+ dilation=1,
320
+ padding=get_padding(kernel_size, 1),
321
+ )
322
+ ),
323
+ ]
324
+ )
325
+ self.convs2.apply(init_weights)
326
+ self.lrelu_slope = LRELU_SLOPE
327
+
328
+ def forward(self, x: torch.Tensor, x_mask: Optional[torch.Tensor] = None):
329
+ for c1, c2 in zip(self.convs1, self.convs2):
330
+ xt = F.leaky_relu(x, self.lrelu_slope)
331
+ if x_mask is not None:
332
+ xt = xt * x_mask
333
+ xt = c1(xt)
334
+ xt = F.leaky_relu(xt, self.lrelu_slope)
335
+ if x_mask is not None:
336
+ xt = xt * x_mask
337
+ xt = c2(xt)
338
+ x = xt + x
339
+ if x_mask is not None:
340
+ x = x * x_mask
341
+ return x
342
+
343
+ def remove_weight_norm(self):
344
+ for l in self.convs1:
345
+ remove_weight_norm(l)
346
+ for l in self.convs2:
347
+ remove_weight_norm(l)
348
+
349
+ def __prepare_scriptable__(self):
350
+ for l in self.convs1:
351
+ for hook in l._forward_pre_hooks.values():
352
+ if (
353
+ hook.__module__ == "torch.nn.utils.weight_norm"
354
+ and hook.__class__.__name__ == "WeightNorm"
355
+ ):
356
+ torch.nn.utils.remove_weight_norm(l)
357
+ for l in self.convs2:
358
+ for hook in l._forward_pre_hooks.values():
359
+ if (
360
+ hook.__module__ == "torch.nn.utils.weight_norm"
361
+ and hook.__class__.__name__ == "WeightNorm"
362
+ ):
363
+ torch.nn.utils.remove_weight_norm(l)
364
+ return self
365
+
366
+
367
+ class ResBlock2(torch.nn.Module):
368
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
369
+ super(ResBlock2, self).__init__()
370
+ self.convs = nn.ModuleList(
371
+ [
372
+ weight_norm(
373
+ Conv1d(
374
+ channels,
375
+ channels,
376
+ kernel_size,
377
+ 1,
378
+ dilation=dilation[0],
379
+ padding=get_padding(kernel_size, dilation[0]),
380
+ )
381
+ ),
382
+ weight_norm(
383
+ Conv1d(
384
+ channels,
385
+ channels,
386
+ kernel_size,
387
+ 1,
388
+ dilation=dilation[1],
389
+ padding=get_padding(kernel_size, dilation[1]),
390
+ )
391
+ ),
392
+ ]
393
+ )
394
+ self.convs.apply(init_weights)
395
+ self.lrelu_slope = LRELU_SLOPE
396
+
397
+ def forward(self, x, x_mask: Optional[torch.Tensor] = None):
398
+ for c in self.convs:
399
+ xt = F.leaky_relu(x, self.lrelu_slope)
400
+ if x_mask is not None:
401
+ xt = xt * x_mask
402
+ xt = c(xt)
403
+ x = xt + x
404
+ if x_mask is not None:
405
+ x = x * x_mask
406
+ return x
407
+
408
+ def remove_weight_norm(self):
409
+ for l in self.convs:
410
+ remove_weight_norm(l)
411
+
412
+ def __prepare_scriptable__(self):
413
+ for l in self.convs:
414
+ for hook in l._forward_pre_hooks.values():
415
+ if (
416
+ hook.__module__ == "torch.nn.utils.weight_norm"
417
+ and hook.__class__.__name__ == "WeightNorm"
418
+ ):
419
+ torch.nn.utils.remove_weight_norm(l)
420
+ return self
421
+
422
+
423
+ class Log(nn.Module):
424
+ def forward(
425
+ self,
426
+ x: torch.Tensor,
427
+ x_mask: torch.Tensor,
428
+ g: Optional[torch.Tensor] = None,
429
+ reverse: bool = False,
430
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
431
+ if not reverse:
432
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
433
+ logdet = torch.sum(-y, [1, 2])
434
+ return y, logdet
435
+ else:
436
+ x = torch.exp(x) * x_mask
437
+ return x
438
+
439
+
440
+ class Flip(nn.Module):
441
+ # torch.jit.script() Compiled functions \
442
+ # can't take variable number of arguments or \
443
+ # use keyword-only arguments with defaults
444
+ def forward(
445
+ self,
446
+ x: torch.Tensor,
447
+ x_mask: torch.Tensor,
448
+ g: Optional[torch.Tensor] = None,
449
+ reverse: bool = False,
450
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
451
+ x = torch.flip(x, [1])
452
+ if not reverse:
453
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
454
+ return x, logdet
455
+ else:
456
+ return x, torch.zeros([1], device=x.device)
457
+
458
+
459
+ class ElementwiseAffine(nn.Module):
460
+ def __init__(self, channels):
461
+ super(ElementwiseAffine, self).__init__()
462
+ self.channels = channels
463
+ self.m = nn.Parameter(torch.zeros(channels, 1))
464
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
465
+
466
+ def forward(self, x, x_mask, reverse=False, **kwargs):
467
+ if not reverse:
468
+ y = self.m + torch.exp(self.logs) * x
469
+ y = y * x_mask
470
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
471
+ return y, logdet
472
+ else:
473
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
474
+ return x
475
+
476
+
477
+ class ResidualCouplingLayer(nn.Module):
478
+ def __init__(
479
+ self,
480
+ channels,
481
+ hidden_channels,
482
+ kernel_size,
483
+ dilation_rate,
484
+ n_layers,
485
+ p_dropout=0,
486
+ gin_channels=0,
487
+ mean_only=False,
488
+ ):
489
+ assert channels % 2 == 0, "channels should be divisible by 2"
490
+ super(ResidualCouplingLayer, self).__init__()
491
+ self.channels = channels
492
+ self.hidden_channels = hidden_channels
493
+ self.kernel_size = kernel_size
494
+ self.dilation_rate = dilation_rate
495
+ self.n_layers = n_layers
496
+ self.half_channels = channels // 2
497
+ self.mean_only = mean_only
498
+
499
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
500
+ self.enc = WN(
501
+ hidden_channels,
502
+ kernel_size,
503
+ dilation_rate,
504
+ n_layers,
505
+ p_dropout=float(p_dropout),
506
+ gin_channels=gin_channels,
507
+ )
508
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
509
+ self.post.weight.data.zero_()
510
+ self.post.bias.data.zero_()
511
+
512
+ def forward(
513
+ self,
514
+ x: torch.Tensor,
515
+ x_mask: torch.Tensor,
516
+ g: Optional[torch.Tensor] = None,
517
+ reverse: bool = False,
518
+ ):
519
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
520
+ h = self.pre(x0) * x_mask
521
+ h = self.enc(h, x_mask, g=g)
522
+ stats = self.post(h) * x_mask
523
+ if not self.mean_only:
524
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
525
+ else:
526
+ m = stats
527
+ logs = torch.zeros_like(m)
528
+
529
+ if not reverse:
530
+ x1 = m + x1 * torch.exp(logs) * x_mask
531
+ x = torch.cat([x0, x1], 1)
532
+ logdet = torch.sum(logs, [1, 2])
533
+ return x, logdet
534
+ else:
535
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
536
+ x = torch.cat([x0, x1], 1)
537
+ return x, torch.zeros([1])
538
+
539
+ def remove_weight_norm(self):
540
+ self.enc.remove_weight_norm()
541
+
542
+ def __prepare_scriptable__(self):
543
+ for hook in self.enc._forward_pre_hooks.values():
544
+ if (
545
+ hook.__module__ == "torch.nn.utils.weight_norm"
546
+ and hook.__class__.__name__ == "WeightNorm"
547
+ ):
548
+ torch.nn.utils.remove_weight_norm(self.enc)
549
+ return self
550
+
551
+
552
+ class ConvFlow(nn.Module):
553
+ def __init__(
554
+ self,
555
+ in_channels,
556
+ filter_channels,
557
+ kernel_size,
558
+ n_layers,
559
+ num_bins=10,
560
+ tail_bound=5.0,
561
+ ):
562
+ super(ConvFlow, self).__init__()
563
+ self.in_channels = in_channels
564
+ self.filter_channels = filter_channels
565
+ self.kernel_size = kernel_size
566
+ self.n_layers = n_layers
567
+ self.num_bins = num_bins
568
+ self.tail_bound = tail_bound
569
+ self.half_channels = in_channels // 2
570
+
571
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
572
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
573
+ self.proj = nn.Conv1d(
574
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
575
+ )
576
+ self.proj.weight.data.zero_()
577
+ self.proj.bias.data.zero_()
578
+
579
+ def forward(
580
+ self,
581
+ x: torch.Tensor,
582
+ x_mask: torch.Tensor,
583
+ g: Optional[torch.Tensor] = None,
584
+ reverse=False,
585
+ ):
586
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
587
+ h = self.pre(x0)
588
+ h = self.convs(h, x_mask, g=g)
589
+ h = self.proj(h) * x_mask
590
+
591
+ b, c, t = x0.shape
592
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
593
+
594
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
595
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
596
+ self.filter_channels
597
+ )
598
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
599
+
600
+ x1, logabsdet = piecewise_rational_quadratic_transform(
601
+ x1,
602
+ unnormalized_widths,
603
+ unnormalized_heights,
604
+ unnormalized_derivatives,
605
+ inverse=reverse,
606
+ tails="linear",
607
+ tail_bound=self.tail_bound,
608
+ )
609
+
610
+ x = torch.cat([x0, x1], 1) * x_mask
611
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
612
+ if not reverse:
613
+ return x, logdet
614
+ else:
615
+ return x
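Note on ResidualCouplingLayer above: it is an affine coupling flow, so half of the channels pass through unchanged and parameterize a shift (and optionally a log-scale) applied to the other half, which makes the reverse pass an exact inverse. A toy sketch of the mean_only case with made-up tensors (the real shift comes from the WN network):

import torch

x0 = torch.randn(1, 2, 4)           # half that is left unchanged
x1 = torch.randn(1, 2, 4)           # half that gets transformed
m = x0.mean(dim=1, keepdim=True)    # stand-in for the shift predicted from x0
y1 = x1 + m                         # forward direction (reverse=False, logs == 0)
x1_rec = y1 - m                     # reverse direction recovers the input exactly
assert torch.allclose(x1, x1_rec)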
rvc/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py ADDED
@@ -0,0 +1,91 @@
1
+ import numpy as np
2
+ import pyworld
3
+
4
+ from rvc.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
+ class DioF0Predictor(F0Predictor):
8
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9
+ self.hop_length = hop_length
10
+ self.f0_min = f0_min
11
+ self.f0_max = f0_max
12
+ self.sampling_rate = sampling_rate
13
+
14
+ def interpolate_f0(self, f0):
15
+ """
16
+ Interpolate the F0 contour (fill in unvoiced frames) and return a voiced/unvoiced vector.
17
+ """
18
+
19
+ data = np.reshape(f0, (f0.size, 1))
20
+
21
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22
+ vuv_vector[data > 0.0] = 1.0
23
+ vuv_vector[data <= 0.0] = 0.0
24
+
25
+ ip_data = data
26
+
27
+ frame_number = data.size
28
+ last_value = 0.0
29
+ for i in range(frame_number):
30
+ if data[i] <= 0.0:
31
+ j = i + 1
32
+ for j in range(i + 1, frame_number):
33
+ if data[j] > 0.0:
34
+ break
35
+ if j < frame_number - 1:
36
+ if last_value > 0.0:
37
+ step = (data[j] - data[i - 1]) / float(j - i)
38
+ for k in range(i, j):
39
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
40
+ else:
41
+ for k in range(i, j):
42
+ ip_data[k] = data[j]
43
+ else:
44
+ for k in range(i, frame_number):
45
+ ip_data[k] = last_value
46
+ else:
47
+ ip_data[i] = data[i] # this may be an unnecessary copy
48
+ last_value = data[i]
49
+
50
+ return ip_data[:, 0], vuv_vector[:, 0]
51
+
52
+ def resize_f0(self, x, target_len):
53
+ source = np.array(x)
54
+ source[source < 0.001] = np.nan
55
+ target = np.interp(
56
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
57
+ np.arange(0, len(source)),
58
+ source,
59
+ )
60
+ res = np.nan_to_num(target)
61
+ return res
62
+
63
+ def compute_f0(self, wav, p_len=None):
64
+ if p_len is None:
65
+ p_len = wav.shape[0] // self.hop_length
66
+ f0, t = pyworld.dio(
67
+ wav.astype(np.double),
68
+ fs=self.sampling_rate,
69
+ f0_floor=self.f0_min,
70
+ f0_ceil=self.f0_max,
71
+ frame_period=1000 * self.hop_length / self.sampling_rate,
72
+ )
73
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
74
+ for index, pitch in enumerate(f0):
75
+ f0[index] = round(pitch, 1)
76
+ return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
77
+
78
+ def compute_f0_uv(self, wav, p_len=None):
79
+ if p_len is None:
80
+ p_len = wav.shape[0] // self.hop_length
81
+ f0, t = pyworld.dio(
82
+ wav.astype(np.double),
83
+ fs=self.sampling_rate,
84
+ f0_floor=self.f0_min,
85
+ f0_ceil=self.f0_max,
86
+ frame_period=1000 * self.hop_length / self.sampling_rate,
87
+ )
88
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
89
+ for index, pitch in enumerate(f0):
90
+ f0[index] = round(pitch, 1)
91
+ return self.interpolate_f0(self.resize_f0(f0, p_len))
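Note: a hedged usage sketch for DioF0Predictor above; the sample rate, hop length, and the random buffer standing in for real speech are placeholders, and pyworld must be installed:

import numpy as np

from rvc.lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor

sr = 16000
wav = np.random.randn(sr).astype(np.float32)  # stand-in for a real mono recording

predictor = DioF0Predictor(hop_length=160, f0_min=50, f0_max=1100, sampling_rate=sr)
f0 = predictor.compute_f0(wav)                # one F0 value per hop
f0, uv = predictor.compute_f0_uv(wav)         # F0 plus a voiced/unvoiced vector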
rvc/lib/infer_pack/modules/F0Predictor/F0Predictor.py ADDED
@@ -0,0 +1,16 @@
1
+ class F0Predictor(object):
2
+ def compute_f0(self, wav, p_len):
3
+ """
4
+ input: wav:[signal_length]
5
+ p_len:int
6
+ output: f0:[signal_length//hop_length]
7
+ """
8
+ pass
9
+
10
+ def compute_f0_uv(self, wav, p_len):
11
+ """
12
+ input: wav:[signal_length]
13
+ p_len:int
14
+ output: f0:[signal_length//hop_length],uv:[signal_length//hop_length]
15
+ """
16
+ pass
rvc/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py ADDED
@@ -0,0 +1,87 @@
1
+ import numpy as np
2
+ import pyworld
3
+
4
+ from rvc.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
+ class HarvestF0Predictor(F0Predictor):
8
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9
+ self.hop_length = hop_length
10
+ self.f0_min = f0_min
11
+ self.f0_max = f0_max
12
+ self.sampling_rate = sampling_rate
13
+
14
+ def interpolate_f0(self, f0):
15
+ """
16
+ Interpolate the F0 contour (fill in unvoiced frames) and return a voiced/unvoiced vector.
17
+ """
18
+
19
+ data = np.reshape(f0, (f0.size, 1))
20
+
21
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22
+ vuv_vector[data > 0.0] = 1.0
23
+ vuv_vector[data <= 0.0] = 0.0
24
+
25
+ ip_data = data
26
+
27
+ frame_number = data.size
28
+ last_value = 0.0
29
+ for i in range(frame_number):
30
+ if data[i] <= 0.0:
31
+ j = i + 1
32
+ for j in range(i + 1, frame_number):
33
+ if data[j] > 0.0:
34
+ break
35
+ if j < frame_number - 1:
36
+ if last_value > 0.0:
37
+ step = (data[j] - data[i - 1]) / float(j - i)
38
+ for k in range(i, j):
39
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
40
+ else:
41
+ for k in range(i, j):
42
+ ip_data[k] = data[j]
43
+ else:
44
+ for k in range(i, frame_number):
45
+ ip_data[k] = last_value
46
+ else:
47
+ ip_data[i] = data[i] # this may be an unnecessary copy
48
+ last_value = data[i]
49
+
50
+ return ip_data[:, 0], vuv_vector[:, 0]
51
+
52
+ def resize_f0(self, x, target_len):
53
+ source = np.array(x)
54
+ source[source < 0.001] = np.nan
55
+ target = np.interp(
56
+ np.arange(0, len(source) * target_len, len(source)) / target_len,
57
+ np.arange(0, len(source)),
58
+ source,
59
+ )
60
+ res = np.nan_to_num(target)
61
+ return res
62
+
63
+ def compute_f0(self, wav, p_len=None):
64
+ if p_len is None:
65
+ p_len = wav.shape[0] // self.hop_length
66
+ f0, t = pyworld.harvest(
67
+ wav.astype(np.double),
68
+ fs=self.sampling_rate,
69
+ f0_ceil=self.f0_max,
70
+ f0_floor=self.f0_min,
71
+ frame_period=1000 * self.hop_length / self.sampling_rate,
72
+ )
73
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
74
+ return self.interpolate_f0(self.resize_f0(f0, p_len))[0]
75
+
76
+ def compute_f0_uv(self, wav, p_len=None):
77
+ if p_len is None:
78
+ p_len = wav.shape[0] // self.hop_length
79
+ f0, t = pyworld.harvest(
80
+ wav.astype(np.double),
81
+ fs=self.sampling_rate,
82
+ f0_floor=self.f0_min,
83
+ f0_ceil=self.f0_max,
84
+ frame_period=1000 * self.hop_length / self.sampling_rate,
85
+ )
86
+ f0 = pyworld.stonemask(wav.astype(np.double), f0, t, self.sampling_rate)
87
+ return self.interpolate_f0(self.resize_f0(f0, p_len))
rvc/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py ADDED
@@ -0,0 +1,98 @@
1
+ import numpy as np
2
+ import parselmouth
3
+
4
+ from rvc.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor
5
+
6
+
7
+ class PMF0Predictor(F0Predictor):
8
+ def __init__(self, hop_length=512, f0_min=50, f0_max=1100, sampling_rate=44100):
9
+ self.hop_length = hop_length
10
+ self.f0_min = f0_min
11
+ self.f0_max = f0_max
12
+ self.sampling_rate = sampling_rate
13
+
14
+ def interpolate_f0(self, f0):
15
+ """
16
+ Interpolate the F0 contour (fill in unvoiced frames) and return a voiced/unvoiced vector.
17
+ """
18
+
19
+ data = np.reshape(f0, (f0.size, 1))
20
+
21
+ vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
22
+ vuv_vector[data > 0.0] = 1.0
23
+ vuv_vector[data <= 0.0] = 0.0
24
+
25
+ ip_data = data
26
+
27
+ frame_number = data.size
28
+ last_value = 0.0
29
+ for i in range(frame_number):
30
+ if data[i] <= 0.0:
31
+ j = i + 1
32
+ for j in range(i + 1, frame_number):
33
+ if data[j] > 0.0:
34
+ break
35
+ if j < frame_number - 1:
36
+ if last_value > 0.0:
37
+ step = (data[j] - data[i - 1]) / float(j - i)
38
+ for k in range(i, j):
39
+ ip_data[k] = data[i - 1] + step * (k - i + 1)
40
+ else:
41
+ for k in range(i, j):
42
+ ip_data[k] = data[j]
43
+ else:
44
+ for k in range(i, frame_number):
45
+ ip_data[k] = last_value
46
+ else:
47
+ ip_data[i] = data[i] # this may be an unnecessary copy
48
+ last_value = data[i]
49
+
50
+ return ip_data[:, 0], vuv_vector[:, 0]
51
+
52
+ def compute_f0(self, wav, p_len=None):
53
+ x = wav
54
+ if p_len is None:
55
+ p_len = x.shape[0] // self.hop_length
56
+ else:
57
+ assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
58
+ time_step = self.hop_length / self.sampling_rate * 1000
59
+ f0 = (
60
+ parselmouth.Sound(x, self.sampling_rate)
61
+ .to_pitch_ac(
62
+ time_step=time_step / 1000,
63
+ voicing_threshold=0.6,
64
+ pitch_floor=self.f0_min,
65
+ pitch_ceiling=self.f0_max,
66
+ )
67
+ .selected_array["frequency"]
68
+ )
69
+
70
+ pad_size = (p_len - len(f0) + 1) // 2
71
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
72
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
73
+ f0, uv = self.interpolate_f0(f0)
74
+ return f0
75
+
76
+ def compute_f0_uv(self, wav, p_len=None):
77
+ x = wav
78
+ if p_len is None:
79
+ p_len = x.shape[0] // self.hop_length
80
+ else:
81
+ assert abs(p_len - x.shape[0] // self.hop_length) < 4, "pad length error"
82
+ time_step = self.hop_length / self.sampling_rate * 1000
83
+ f0 = (
84
+ parselmouth.Sound(x, self.sampling_rate)
85
+ .to_pitch_ac(
86
+ time_step=time_step / 1000,
87
+ voicing_threshold=0.6,
88
+ pitch_floor=self.f0_min,
89
+ pitch_ceiling=self.f0_max,
90
+ )
91
+ .selected_array["frequency"]
92
+ )
93
+
94
+ pad_size = (p_len - len(f0) + 1) // 2
95
+ if pad_size > 0 or p_len - len(f0) - pad_size > 0:
96
+ f0 = np.pad(f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant")
97
+ f0, uv = self.interpolate_f0(f0)
98
+ return f0, uv
rvc/lib/infer_pack/modules/F0Predictor/__init__.py ADDED
File without changes
rvc/lib/infer_pack/onnx_inference.py ADDED
@@ -0,0 +1,149 @@
1
+ import logging
2
+
3
+ import librosa
4
+ import numpy as np
5
+ import onnxruntime
6
+ import soundfile
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ class ContentVec:
12
+ def __init__(self, vec_path="pretrained/vec-768-layer-12.onnx", device=None):
13
+ logger.info("Load model(s) from {}".format(vec_path))
14
+ if device == "cpu" or device is None:
15
+ providers = ["CPUExecutionProvider"]
16
+ elif device == "cuda":
17
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
18
+ elif device == "dml":
19
+ providers = ["DmlExecutionProvider"]
20
+ else:
21
+ raise RuntimeError("Unsupported device")
22
+ self.model = onnxruntime.InferenceSession(vec_path, providers=providers)
23
+
24
+ def __call__(self, wav):
25
+ return self.forward(wav)
26
+
27
+ def forward(self, wav):
28
+ feats = wav
29
+ if feats.ndim == 2: # double channels
30
+ feats = feats.mean(-1)
31
+ assert feats.ndim == 1, feats.ndim
32
+ feats = np.expand_dims(np.expand_dims(feats, 0), 0)
33
+ onnx_input = {self.model.get_inputs()[0].name: feats}
34
+ logits = self.model.run(None, onnx_input)[0]
35
+ return logits.transpose(0, 2, 1)
36
+
37
+
38
+ def get_f0_predictor(f0_predictor, hop_length, sampling_rate, **kargs):
39
+ if f0_predictor == "pm":
40
+ from rvc.lib.infer_pack.modules.F0Predictor.PMF0Predictor import PMF0Predictor
41
+
42
+ f0_predictor_object = PMF0Predictor(
43
+ hop_length=hop_length, sampling_rate=sampling_rate
44
+ )
45
+ elif f0_predictor == "harvest":
46
+ from rvc.lib.infer_pack.modules.F0Predictor.HarvestF0Predictor import (
47
+ HarvestF0Predictor,
48
+ )
49
+
50
+ f0_predictor_object = HarvestF0Predictor(
51
+ hop_length=hop_length, sampling_rate=sampling_rate
52
+ )
53
+ elif f0_predictor == "dio":
54
+ from rvc.lib.infer_pack.modules.F0Predictor.DioF0Predictor import DioF0Predictor
55
+
56
+ f0_predictor_object = DioF0Predictor(
57
+ hop_length=hop_length, sampling_rate=sampling_rate
58
+ )
59
+ else:
60
+ raise Exception("Unknown f0 predictor")
61
+ return f0_predictor_object
62
+
63
+
64
+ class OnnxRVC:
65
+ def __init__(
66
+ self,
67
+ model_path,
68
+ sr=40000,
69
+ hop_size=512,
70
+ vec_path="vec-768-layer-12",
71
+ device="cpu",
72
+ ):
73
+ vec_path = f"pretrained/{vec_path}.onnx"
74
+ self.vec_model = ContentVec(vec_path, device)
75
+ if device == "cpu" or device is None:
76
+ providers = ["CPUExecutionProvider"]
77
+ elif device == "cuda":
78
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
79
+ elif device == "dml":
80
+ providers = ["DmlExecutionProvider"]
81
+ else:
82
+ raise RuntimeError("Unsupported device")
83
+ self.model = onnxruntime.InferenceSession(model_path, providers=providers)
84
+ self.sampling_rate = sr
85
+ self.hop_size = hop_size
86
+
87
+ def forward(self, hubert, hubert_length, pitch, pitchf, ds, rnd):
88
+ onnx_input = {
89
+ self.model.get_inputs()[0].name: hubert,
90
+ self.model.get_inputs()[1].name: hubert_length,
91
+ self.model.get_inputs()[2].name: pitch,
92
+ self.model.get_inputs()[3].name: pitchf,
93
+ self.model.get_inputs()[4].name: ds,
94
+ self.model.get_inputs()[5].name: rnd,
95
+ }
96
+ return (self.model.run(None, onnx_input)[0] * 32767).astype(np.int16)
97
+
98
+ def inference(
99
+ self,
100
+ raw_path,
101
+ sid,
102
+ f0_method="dio",
103
+ f0_up_key=0,
104
+ pad_time=0.5,
105
+ cr_threshold=0.02,
106
+ ):
107
+ f0_min = 50
108
+ f0_max = 1100
109
+ f0_mel_min = 1127 * np.log(1 + f0_min / 700)
110
+ f0_mel_max = 1127 * np.log(1 + f0_max / 700)
111
+ f0_predictor = get_f0_predictor(
112
+ f0_method,
113
+ hop_length=self.hop_size,
114
+ sampling_rate=self.sampling_rate,
115
+ threshold=cr_threshold,
116
+ )
117
+ wav, sr = librosa.load(raw_path, sr=self.sampling_rate)
118
+ org_length = len(wav)
119
+ if org_length / sr > 50.0:
120
+ raise RuntimeError("Reached Max Length")
121
+
122
+ wav16k = librosa.resample(wav, orig_sr=self.sampling_rate, target_sr=16000)
123
+ wav16k = wav16k
124
+
125
+ hubert = self.vec_model(wav16k)
126
+ hubert = np.repeat(hubert, 2, axis=2).transpose(0, 2, 1).astype(np.float32)
127
+ hubert_length = hubert.shape[1]
128
+
129
+ pitchf = f0_predictor.compute_f0(wav, hubert_length)
130
+ pitchf = pitchf * 2 ** (f0_up_key / 12)
131
+ pitch = pitchf.copy()
132
+ f0_mel = 1127 * np.log(1 + pitch / 700)
133
+ f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / (
134
+ f0_mel_max - f0_mel_min
135
+ ) + 1
136
+ f0_mel[f0_mel <= 1] = 1
137
+ f0_mel[f0_mel > 255] = 255
138
+ pitch = np.rint(f0_mel).astype(np.int64)
139
+
140
+ pitchf = pitchf.reshape(1, len(pitchf)).astype(np.float32)
141
+ pitch = pitch.reshape(1, len(pitch))
142
+ ds = np.array([sid]).astype(np.int64)
143
+
144
+ rnd = np.random.randn(1, 192, hubert_length).astype(np.float32)
145
+ hubert_length = np.array([hubert_length]).astype(np.int64)
146
+
147
+ out_wav = self.forward(hubert, hubert_length, pitch, pitchf, ds, rnd).squeeze()
148
+ out_wav = np.pad(out_wav, (0, 2 * self.hop_size), "constant")
149
+ return out_wav[0:org_length]
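Note: a hedged usage sketch for the OnnxRVC wrapper above; the exported model path, input/output file names, and speaker id are placeholders, and onnxruntime plus the ContentVec ONNX model under pretrained/ must be available:

import soundfile

from rvc.lib.infer_pack.onnx_inference import OnnxRVC

model = OnnxRVC(
    "exported/my_voice.onnx",      # hypothetical exported RVC model
    sr=40000,
    hop_size=512,
    vec_path="vec-768-layer-12",   # resolved to pretrained/vec-768-layer-12.onnx
    device="cpu",
)
audio = model.inference("input.wav", sid=0, f0_method="dio", f0_up_key=0)
soundfile.write("output.wav", audio, 40000)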
rvc/lib/infer_pack/transforms.py ADDED
@@ -0,0 +1,207 @@
1
+ import numpy as np
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
6
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
7
+ DEFAULT_MIN_DERIVATIVE = 1e-3
8
+
9
+
10
+ def piecewise_rational_quadratic_transform(
11
+ inputs,
12
+ unnormalized_widths,
13
+ unnormalized_heights,
14
+ unnormalized_derivatives,
15
+ inverse=False,
16
+ tails=None,
17
+ tail_bound=1.0,
18
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
19
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
20
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
21
+ ):
22
+ if tails is None:
23
+ spline_fn = rational_quadratic_spline
24
+ spline_kwargs = {}
25
+ else:
26
+ spline_fn = unconstrained_rational_quadratic_spline
27
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
28
+
29
+ outputs, logabsdet = spline_fn(
30
+ inputs=inputs,
31
+ unnormalized_widths=unnormalized_widths,
32
+ unnormalized_heights=unnormalized_heights,
33
+ unnormalized_derivatives=unnormalized_derivatives,
34
+ inverse=inverse,
35
+ min_bin_width=min_bin_width,
36
+ min_bin_height=min_bin_height,
37
+ min_derivative=min_derivative,
38
+ **spline_kwargs
39
+ )
40
+ return outputs, logabsdet
41
+
42
+
43
+ def searchsorted(bin_locations, inputs, eps=1e-6):
44
+ bin_locations[..., -1] += eps
45
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
46
+
47
+
48
+ def unconstrained_rational_quadratic_spline(
49
+ inputs,
50
+ unnormalized_widths,
51
+ unnormalized_heights,
52
+ unnormalized_derivatives,
53
+ inverse=False,
54
+ tails="linear",
55
+ tail_bound=1.0,
56
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
57
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
58
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
59
+ ):
60
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
61
+ outside_interval_mask = ~inside_interval_mask
62
+
63
+ outputs = torch.zeros_like(inputs)
64
+ logabsdet = torch.zeros_like(inputs)
65
+
66
+ if tails == "linear":
67
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
68
+ constant = np.log(np.exp(1 - min_derivative) - 1)
69
+ unnormalized_derivatives[..., 0] = constant
70
+ unnormalized_derivatives[..., -1] = constant
71
+
72
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
73
+ logabsdet[outside_interval_mask] = 0
74
+ else:
75
+ raise RuntimeError("{} tails are not implemented.".format(tails))
76
+
77
+ (
78
+ outputs[inside_interval_mask],
79
+ logabsdet[inside_interval_mask],
80
+ ) = rational_quadratic_spline(
81
+ inputs=inputs[inside_interval_mask],
82
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
83
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
84
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
85
+ inverse=inverse,
86
+ left=-tail_bound,
87
+ right=tail_bound,
88
+ bottom=-tail_bound,
89
+ top=tail_bound,
90
+ min_bin_width=min_bin_width,
91
+ min_bin_height=min_bin_height,
92
+ min_derivative=min_derivative,
93
+ )
94
+
95
+ return outputs, logabsdet
96
+
97
+
98
+ def rational_quadratic_spline(
99
+ inputs,
100
+ unnormalized_widths,
101
+ unnormalized_heights,
102
+ unnormalized_derivatives,
103
+ inverse=False,
104
+ left=0.0,
105
+ right=1.0,
106
+ bottom=0.0,
107
+ top=1.0,
108
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
109
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
110
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
111
+ ):
112
+ if torch.min(inputs) < left or torch.max(inputs) > right:
113
+ raise ValueError("Input to a transform is not within its domain")
114
+
115
+ num_bins = unnormalized_widths.shape[-1]
116
+
117
+ if min_bin_width * num_bins > 1.0:
118
+ raise ValueError("Minimal bin width too large for the number of bins")
119
+ if min_bin_height * num_bins > 1.0:
120
+ raise ValueError("Minimal bin height too large for the number of bins")
121
+
122
+ widths = F.softmax(unnormalized_widths, dim=-1)
123
+ widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
124
+ cumwidths = torch.cumsum(widths, dim=-1)
125
+ cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
126
+ cumwidths = (right - left) * cumwidths + left
127
+ cumwidths[..., 0] = left
128
+ cumwidths[..., -1] = right
129
+ widths = cumwidths[..., 1:] - cumwidths[..., :-1]
130
+
131
+ derivatives = min_derivative + F.softplus(unnormalized_derivatives)
132
+
133
+ heights = F.softmax(unnormalized_heights, dim=-1)
134
+ heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
135
+ cumheights = torch.cumsum(heights, dim=-1)
136
+ cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
137
+ cumheights = (top - bottom) * cumheights + bottom
138
+ cumheights[..., 0] = bottom
139
+ cumheights[..., -1] = top
140
+ heights = cumheights[..., 1:] - cumheights[..., :-1]
141
+
142
+ if inverse:
143
+ bin_idx = searchsorted(cumheights, inputs)[..., None]
144
+ else:
145
+ bin_idx = searchsorted(cumwidths, inputs)[..., None]
146
+
147
+ input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
148
+ input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
149
+
150
+ input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
151
+ delta = heights / widths
152
+ input_delta = delta.gather(-1, bin_idx)[..., 0]
153
+
154
+ input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
155
+ input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
156
+
157
+ input_heights = heights.gather(-1, bin_idx)[..., 0]
158
+
159
+ if inverse:
160
+ a = (inputs - input_cumheights) * (
161
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
162
+ ) + input_heights * (input_delta - input_derivatives)
163
+ b = input_heights * input_derivatives - (inputs - input_cumheights) * (
164
+ input_derivatives + input_derivatives_plus_one - 2 * input_delta
165
+ )
166
+ c = -input_delta * (inputs - input_cumheights)
167
+
168
+ discriminant = b.pow(2) - 4 * a * c
169
+ assert (discriminant >= 0).all()
170
+
171
+ root = (2 * c) / (-b - torch.sqrt(discriminant))
172
+ outputs = root * input_bin_widths + input_cumwidths
173
+
174
+ theta_one_minus_theta = root * (1 - root)
175
+ denominator = input_delta + (
176
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
177
+ * theta_one_minus_theta
178
+ )
179
+ derivative_numerator = input_delta.pow(2) * (
180
+ input_derivatives_plus_one * root.pow(2)
181
+ + 2 * input_delta * theta_one_minus_theta
182
+ + input_derivatives * (1 - root).pow(2)
183
+ )
184
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
185
+
186
+ return outputs, -logabsdet
187
+ else:
188
+ theta = (inputs - input_cumwidths) / input_bin_widths
189
+ theta_one_minus_theta = theta * (1 - theta)
190
+
191
+ numerator = input_heights * (
192
+ input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
193
+ )
194
+ denominator = input_delta + (
195
+ (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
196
+ * theta_one_minus_theta
197
+ )
198
+ outputs = input_cumheights + numerator / denominator
199
+
200
+ derivative_numerator = input_delta.pow(2) * (
201
+ input_derivatives_plus_one * theta.pow(2)
202
+ + 2 * input_delta * theta_one_minus_theta
203
+ + input_derivatives * (1 - theta).pow(2)
204
+ )
205
+ logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
206
+
207
+ return outputs, logabsdet
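Note (not part of the commit): the functions above implement the piecewise rational-quadratic spline used by the flow coupling layers; the wrapper at the top of this hunk (named piecewise_rational_quadratic_transform in the upstream VITS/RVC source) dispatches to the linear-tails variant and returns the transformed values together with log|det J|. A minimal standalone sketch, assuming the module is importable as rvc.lib.infer_pack.transforms and using arbitrary tensor shapes:

import torch
from rvc.lib.infer_pack.transforms import piecewise_rational_quadratic_transform

num_bins = 10
x = torch.randn(4, 64)                    # values outside the tail bound pass through unchanged
w = torch.randn(4, 64, num_bins)          # unnormalized bin widths
h = torch.randn(4, 64, num_bins)          # unnormalized bin heights
d = torch.randn(4, 64, num_bins - 1)      # unnormalized derivatives at the interior knots

y, logabsdet = piecewise_rational_quadratic_transform(
    x, w, h, d, inverse=False, tails="linear", tail_bound=5.0
)
x_back, _ = piecewise_rational_quadratic_transform(
    y, w, h, d, inverse=True, tails="linear", tail_bound=5.0
)  # recovers x up to numerical error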
rvc/lib/ipex/__init__.py ADDED
@@ -0,0 +1,182 @@
1
+ import contextlib
2
+ import os
3
+ import sys
4
+
5
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
6
+ import torch
7
+
8
+ from .attention import attention_init
9
+ from .hijacks import ipex_hijacks
10
+
11
+
12
+ def ipex_init():
13
+ try:
14
+ # Replace cuda with xpu:
15
+ torch.cuda.current_device = torch.xpu.current_device
16
+ torch.cuda.current_stream = torch.xpu.current_stream
17
+ torch.cuda.device = torch.xpu.device
18
+ torch.cuda.device_count = torch.xpu.device_count
19
+ torch.cuda.device_of = torch.xpu.device_of
20
+ torch.cuda.get_device_name = torch.xpu.get_device_name
21
+ torch.cuda.get_device_properties = torch.xpu.get_device_properties
22
+ torch.cuda.init = torch.xpu.init
23
+ torch.cuda.is_available = torch.xpu.is_available
24
+ torch.cuda.is_initialized = torch.xpu.is_initialized
25
+ torch.cuda.is_current_stream_capturing = lambda: False
26
+ torch.cuda.set_device = torch.xpu.set_device
27
+ torch.cuda.stream = torch.xpu.stream
28
+ torch.cuda.synchronize = torch.xpu.synchronize
29
+ torch.cuda.Event = torch.xpu.Event
30
+ torch.cuda.Stream = torch.xpu.Stream
31
+ torch.cuda.FloatTensor = torch.xpu.FloatTensor
32
+ torch.Tensor.cuda = torch.Tensor.xpu
33
+ torch.Tensor.is_cuda = torch.Tensor.is_xpu
34
+ torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock
35
+ torch.cuda._initialized = torch.xpu.lazy_init._initialized
36
+ torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker
37
+ torch.cuda._queued_calls = torch.xpu.lazy_init._queued_calls
38
+ torch.cuda._tls = torch.xpu.lazy_init._tls
39
+ torch.cuda.threading = torch.xpu.lazy_init.threading
40
+ torch.cuda.traceback = torch.xpu.lazy_init.traceback
41
+ torch.cuda.Optional = torch.xpu.Optional
42
+ torch.cuda.__cached__ = torch.xpu.__cached__
43
+ torch.cuda.__loader__ = torch.xpu.__loader__
44
+ torch.cuda.ComplexFloatStorage = torch.xpu.ComplexFloatStorage
45
+ torch.cuda.Tuple = torch.xpu.Tuple
46
+ torch.cuda.streams = torch.xpu.streams
47
+ torch.cuda._lazy_new = torch.xpu._lazy_new
48
+ torch.cuda.FloatStorage = torch.xpu.FloatStorage
49
+ torch.cuda.Any = torch.xpu.Any
50
+ torch.cuda.__doc__ = torch.xpu.__doc__
51
+ torch.cuda.default_generators = torch.xpu.default_generators
52
+ torch.cuda.HalfTensor = torch.xpu.HalfTensor
53
+ torch.cuda._get_device_index = torch.xpu._get_device_index
54
+ torch.cuda.__path__ = torch.xpu.__path__
55
+ torch.cuda.Device = torch.xpu.Device
56
+ torch.cuda.IntTensor = torch.xpu.IntTensor
57
+ torch.cuda.ByteStorage = torch.xpu.ByteStorage
58
+ torch.cuda.set_stream = torch.xpu.set_stream
59
+ torch.cuda.BoolStorage = torch.xpu.BoolStorage
60
+ torch.cuda.os = torch.xpu.os
61
+ torch.cuda.torch = torch.xpu.torch
62
+ torch.cuda.BFloat16Storage = torch.xpu.BFloat16Storage
63
+ torch.cuda.Union = torch.xpu.Union
64
+ torch.cuda.DoubleTensor = torch.xpu.DoubleTensor
65
+ torch.cuda.ShortTensor = torch.xpu.ShortTensor
66
+ torch.cuda.LongTensor = torch.xpu.LongTensor
67
+ torch.cuda.IntStorage = torch.xpu.IntStorage
68
+ torch.cuda.LongStorage = torch.xpu.LongStorage
69
+ torch.cuda.__annotations__ = torch.xpu.__annotations__
70
+ torch.cuda.__package__ = torch.xpu.__package__
71
+ torch.cuda.__builtins__ = torch.xpu.__builtins__
72
+ torch.cuda.CharTensor = torch.xpu.CharTensor
73
+ torch.cuda.List = torch.xpu.List
74
+ torch.cuda._lazy_init = torch.xpu._lazy_init
75
+ torch.cuda.BFloat16Tensor = torch.xpu.BFloat16Tensor
76
+ torch.cuda.DoubleStorage = torch.xpu.DoubleStorage
77
+ torch.cuda.ByteTensor = torch.xpu.ByteTensor
78
+ torch.cuda.StreamContext = torch.xpu.StreamContext
79
+ torch.cuda.ComplexDoubleStorage = torch.xpu.ComplexDoubleStorage
80
+ torch.cuda.ShortStorage = torch.xpu.ShortStorage
81
+ torch.cuda._lazy_call = torch.xpu._lazy_call
82
+ torch.cuda.HalfStorage = torch.xpu.HalfStorage
83
+ torch.cuda.random = torch.xpu.random
84
+ torch.cuda._device = torch.xpu._device
85
+ torch.cuda.classproperty = torch.xpu.classproperty
86
+ torch.cuda.__name__ = torch.xpu.__name__
87
+ torch.cuda._device_t = torch.xpu._device_t
88
+ torch.cuda.warnings = torch.xpu.warnings
89
+ torch.cuda.__spec__ = torch.xpu.__spec__
90
+ torch.cuda.BoolTensor = torch.xpu.BoolTensor
91
+ torch.cuda.CharStorage = torch.xpu.CharStorage
92
+ torch.cuda.__file__ = torch.xpu.__file__
93
+ torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork
94
+ # torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing
95
+
96
+ # Memory:
97
+ torch.cuda.memory = torch.xpu.memory
98
+ if "linux" in sys.platform and "WSL2" in os.popen("uname -a").read():
99
+ torch.xpu.empty_cache = lambda: None
100
+ torch.cuda.empty_cache = torch.xpu.empty_cache
101
+ torch.cuda.memory_stats = torch.xpu.memory_stats
102
+ torch.cuda.memory_summary = torch.xpu.memory_summary
103
+ torch.cuda.memory_snapshot = torch.xpu.memory_snapshot
104
+ torch.cuda.memory_allocated = torch.xpu.memory_allocated
105
+ torch.cuda.max_memory_allocated = torch.xpu.max_memory_allocated
106
+ torch.cuda.memory_reserved = torch.xpu.memory_reserved
107
+ torch.cuda.memory_cached = torch.xpu.memory_reserved
108
+ torch.cuda.max_memory_reserved = torch.xpu.max_memory_reserved
109
+ torch.cuda.max_memory_cached = torch.xpu.max_memory_reserved
110
+ torch.cuda.reset_peak_memory_stats = torch.xpu.reset_peak_memory_stats
111
+ torch.cuda.reset_max_memory_cached = torch.xpu.reset_peak_memory_stats
112
+ torch.cuda.reset_max_memory_allocated = torch.xpu.reset_peak_memory_stats
113
+ torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict
114
+ torch.cuda.reset_accumulated_memory_stats = (
115
+ torch.xpu.reset_accumulated_memory_stats
116
+ )
117
+
118
+ # RNG:
119
+ torch.cuda.get_rng_state = torch.xpu.get_rng_state
120
+ torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all
121
+ torch.cuda.set_rng_state = torch.xpu.set_rng_state
122
+ torch.cuda.set_rng_state_all = torch.xpu.set_rng_state_all
123
+ torch.cuda.manual_seed = torch.xpu.manual_seed
124
+ torch.cuda.manual_seed_all = torch.xpu.manual_seed_all
125
+ torch.cuda.seed = torch.xpu.seed
126
+ torch.cuda.seed_all = torch.xpu.seed_all
127
+ torch.cuda.initial_seed = torch.xpu.initial_seed
128
+
129
+ # AMP:
130
+ torch.cuda.amp = torch.xpu.amp
131
+ if not hasattr(torch.cuda.amp, "common"):
132
+ torch.cuda.amp.common = contextlib.nullcontext()
133
+ torch.cuda.amp.common.amp_definitely_not_available = lambda: False
134
+ try:
135
+ torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
136
+ except Exception:
137
+ try:
138
+ from .gradscaler import gradscaler_init
139
+
140
+ gradscaler_init()
141
+ torch.cuda.amp.GradScaler = torch.xpu.amp.GradScaler
142
+ except Exception:
143
+ torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
144
+
145
+ # C
146
+ torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream
147
+ ipex._C._DeviceProperties.major = 2023
148
+ ipex._C._DeviceProperties.minor = 2
149
+
150
+ # Fix functions with ipex:
151
+ torch.cuda.mem_get_info = lambda device=None: [
152
+ (
153
+ torch.xpu.get_device_properties(device).total_memory
154
+ - torch.xpu.memory_allocated(device)
155
+ ),
156
+ torch.xpu.get_device_properties(device).total_memory,
157
+ ]
158
+ torch._utils._get_available_device_type = lambda: "xpu"
159
+ torch.has_cuda = True
160
+ torch.cuda.has_half = True
161
+ torch.cuda.is_bf16_supported = lambda *args, **kwargs: True
162
+ torch.cuda.is_fp16_supported = lambda *args, **kwargs: True
163
+ torch.version.cuda = "11.7"
164
+ torch.cuda.get_device_capability = lambda *args, **kwargs: [11, 7]
165
+ torch.cuda.get_device_properties.major = 11
166
+ torch.cuda.get_device_properties.minor = 7
167
+ torch.cuda.ipc_collect = lambda *args, **kwargs: None
168
+ torch.cuda.utilization = lambda *args, **kwargs: 0
169
+ if hasattr(torch.xpu, "getDeviceIdListForCard"):
170
+ torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
171
+ torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
172
+ else:
173
+ torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
174
+ torch.cuda.get_device_id_list_per_card = (
175
+ torch.xpu.get_device_id_list_per_card
176
+ )
177
+
178
+ ipex_hijacks()
179
+ attention_init()
180
+ except Exception as e:
181
+ return False, e
182
+ return True, None
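Note (not part of the commit): ipex_init() rebinds essentially the whole torch.cuda namespace to torch.xpu so that CUDA-only code paths run unchanged on Intel GPUs, and it returns an (ok, error) pair instead of raising. A hedged usage sketch (import path assumed):

import torch
from rvc.lib.ipex import ipex_init

ok, err = ipex_init()
if not ok:
    print(f"IPEX init failed, staying on the default backend: {err}")
else:
    # torch.cuda.* now resolves to torch.xpu.* under the hood
    print(torch.cuda.is_available(), torch.cuda.get_device_name(0))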
rvc/lib/ipex/attention.py ADDED
@@ -0,0 +1,206 @@
1
+ import intel_extension_for_pytorch as ipex
2
+ import torch
3
+
4
+ original_torch_bmm = torch.bmm
5
+
6
+
7
+ def torch_bmm(input, mat2, *, out=None):
8
+ if input.dtype != mat2.dtype:
9
+ mat2 = mat2.to(input.dtype)
10
+
11
+ # ARC GPUs can't allocate more than 4GB to a single block, Slice it:
12
+ batch_size_attention, input_tokens, mat2_shape = (
13
+ input.shape[0],
14
+ input.shape[1],
15
+ mat2.shape[2],
16
+ )
17
+ block_multiply = input.element_size()
18
+ slice_block_size = input_tokens * mat2_shape / 1024 / 1024 * block_multiply
19
+ block_size = batch_size_attention * slice_block_size
20
+
21
+ split_slice_size = batch_size_attention
22
+ if block_size > 4:
23
+ do_split = True
24
+ # Find something divisible with the input_tokens
25
+ while (split_slice_size * slice_block_size) > 4:
26
+ split_slice_size = split_slice_size // 2
27
+ if split_slice_size <= 1:
28
+ split_slice_size = 1
29
+ break
30
+ else:
31
+ do_split = False
32
+
33
+ split_2_slice_size = input_tokens
34
+ if split_slice_size * slice_block_size > 4:
35
+ slice_block_size2 = split_slice_size * mat2_shape / 1024 / 1024 * block_multiply
36
+ do_split_2 = True
37
+ # Find something divisible with the input_tokens
38
+ while (split_2_slice_size * slice_block_size2) > 4:
39
+ split_2_slice_size = split_2_slice_size // 2
40
+ if split_2_slice_size <= 1:
41
+ split_2_slice_size = 1
42
+ break
43
+ else:
44
+ do_split_2 = False
45
+
46
+ if do_split:
47
+ hidden_states = torch.zeros(
48
+ input.shape[0],
49
+ input.shape[1],
50
+ mat2.shape[2],
51
+ device=input.device,
52
+ dtype=input.dtype,
53
+ )
54
+ for i in range(batch_size_attention // split_slice_size):
55
+ start_idx = i * split_slice_size
56
+ end_idx = (i + 1) * split_slice_size
57
+ if do_split_2:
58
+ for i2 in range(input_tokens // split_2_slice_size):
59
+ start_idx_2 = i2 * split_2_slice_size
60
+ end_idx_2 = (i2 + 1) * split_2_slice_size
61
+ hidden_states[
62
+ start_idx:end_idx, start_idx_2:end_idx_2
63
+ ] = original_torch_bmm(
64
+ input[start_idx:end_idx, start_idx_2:end_idx_2],
65
+ mat2[start_idx:end_idx, start_idx_2:end_idx_2],
66
+ out=out,
67
+ )
68
+ else:
69
+ hidden_states[start_idx:end_idx] = original_torch_bmm(
70
+ input[start_idx:end_idx], mat2[start_idx:end_idx], out=out
71
+ )
72
+ else:
73
+ return original_torch_bmm(input, mat2, out=out)
74
+ return hidden_states
75
+
76
+
77
+ original_scaled_dot_product_attention = torch.nn.functional.scaled_dot_product_attention
78
+
79
+
80
+ def scaled_dot_product_attention(
81
+ query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False
82
+ ):
83
+ # ARC GPUs can't allocate more than 4GB to a single block, Slice it:
84
+ if len(query.shape) == 3:
85
+ batch_size_attention, query_tokens, shape_four = query.shape
86
+ shape_one = 1
87
+ no_shape_one = True
88
+ else:
89
+ shape_one, batch_size_attention, query_tokens, shape_four = query.shape
90
+ no_shape_one = False
91
+
92
+ block_multiply = query.element_size()
93
+ slice_block_size = (
94
+ shape_one * query_tokens * shape_four / 1024 / 1024 * block_multiply
95
+ )
96
+ block_size = batch_size_attention * slice_block_size
97
+
98
+ split_slice_size = batch_size_attention
99
+ if block_size > 4:
100
+ do_split = True
101
+ # Find something divisible with the shape_one
102
+ while (split_slice_size * slice_block_size) > 4:
103
+ split_slice_size = split_slice_size // 2
104
+ if split_slice_size <= 1:
105
+ split_slice_size = 1
106
+ break
107
+ else:
108
+ do_split = False
109
+
110
+ split_2_slice_size = query_tokens
111
+ if split_slice_size * slice_block_size > 4:
112
+ slice_block_size2 = (
113
+ shape_one * split_slice_size * shape_four / 1024 / 1024 * block_multiply
114
+ )
115
+ do_split_2 = True
116
+ # Find something divisible with the batch_size_attention
117
+ while (split_2_slice_size * slice_block_size2) > 4:
118
+ split_2_slice_size = split_2_slice_size // 2
119
+ if split_2_slice_size <= 1:
120
+ split_2_slice_size = 1
121
+ break
122
+ else:
123
+ do_split_2 = False
124
+
125
+ if do_split:
126
+ hidden_states = torch.zeros(query.shape, device=query.device, dtype=query.dtype)
127
+ for i in range(batch_size_attention // split_slice_size):
128
+ start_idx = i * split_slice_size
129
+ end_idx = (i + 1) * split_slice_size
130
+ if do_split_2:
131
+ for i2 in range(query_tokens // split_2_slice_size):
132
+ start_idx_2 = i2 * split_2_slice_size
133
+ end_idx_2 = (i2 + 1) * split_2_slice_size
134
+ if no_shape_one:
135
+ hidden_states[
136
+ start_idx:end_idx, start_idx_2:end_idx_2
137
+ ] = original_scaled_dot_product_attention(
138
+ query[start_idx:end_idx, start_idx_2:end_idx_2],
139
+ key[start_idx:end_idx, start_idx_2:end_idx_2],
140
+ value[start_idx:end_idx, start_idx_2:end_idx_2],
141
+ attn_mask=attn_mask[
142
+ start_idx:end_idx, start_idx_2:end_idx_2
143
+ ]
144
+ if attn_mask is not None
145
+ else attn_mask,
146
+ dropout_p=dropout_p,
147
+ is_causal=is_causal,
148
+ )
149
+ else:
150
+ hidden_states[
151
+ :, start_idx:end_idx, start_idx_2:end_idx_2
152
+ ] = original_scaled_dot_product_attention(
153
+ query[:, start_idx:end_idx, start_idx_2:end_idx_2],
154
+ key[:, start_idx:end_idx, start_idx_2:end_idx_2],
155
+ value[:, start_idx:end_idx, start_idx_2:end_idx_2],
156
+ attn_mask=attn_mask[
157
+ :, start_idx:end_idx, start_idx_2:end_idx_2
158
+ ]
159
+ if attn_mask is not None
160
+ else attn_mask,
161
+ dropout_p=dropout_p,
162
+ is_causal=is_causal,
163
+ )
164
+ else:
165
+ if no_shape_one:
166
+ hidden_states[
167
+ start_idx:end_idx
168
+ ] = original_scaled_dot_product_attention(
169
+ query[start_idx:end_idx],
170
+ key[start_idx:end_idx],
171
+ value[start_idx:end_idx],
172
+ attn_mask=attn_mask[start_idx:end_idx]
173
+ if attn_mask is not None
174
+ else attn_mask,
175
+ dropout_p=dropout_p,
176
+ is_causal=is_causal,
177
+ )
178
+ else:
179
+ hidden_states[
180
+ :, start_idx:end_idx
181
+ ] = original_scaled_dot_product_attention(
182
+ query[:, start_idx:end_idx],
183
+ key[:, start_idx:end_idx],
184
+ value[:, start_idx:end_idx],
185
+ attn_mask=attn_mask[:, start_idx:end_idx]
186
+ if attn_mask is not None
187
+ else attn_mask,
188
+ dropout_p=dropout_p,
189
+ is_causal=is_causal,
190
+ )
191
+ else:
192
+ return original_scaled_dot_product_attention(
193
+ query,
194
+ key,
195
+ value,
196
+ attn_mask=attn_mask,
197
+ dropout_p=dropout_p,
198
+ is_causal=is_causal,
199
+ )
200
+ return hidden_states
201
+
202
+
203
+ def attention_init():
204
+ # ARC GPUs can't allocate more than 4GB to a single block:
205
+ torch.bmm = torch_bmm
206
+ torch.nn.functional.scaled_dot_product_attention = scaled_dot_product_attention
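Note (not part of the commit): attention_init() swaps torch.bmm and torch.nn.functional.scaled_dot_product_attention for the sliced versions above, so a single matmul/attention call never asks the Arc allocator for more than roughly 4 GB; callers need no changes. A hedged sketch, assuming the import path and an available XPU device:

import torch
from rvc.lib.ipex.attention import attention_init

attention_init()  # from here on, oversized bmm/attention calls are chunked transparently

q = torch.randn(8, 4096, 64, device="xpu", dtype=torch.float16)
k = torch.randn(8, 4096, 64, device="xpu", dtype=torch.float16)
v = torch.randn(8, 4096, 64, device="xpu", dtype=torch.float16)
out = torch.nn.functional.scaled_dot_product_attention(q, k, v)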
rvc/lib/ipex/gradscaler.py ADDED
@@ -0,0 +1,184 @@
1
+ from collections import defaultdict
2
+
3
+ import intel_extension_for_pytorch as ipex
4
+ import intel_extension_for_pytorch._C as core
5
+ import torch
6
+
7
+ OptState = ipex.cpu.autocast._grad_scaler.OptState
8
+ _MultiDeviceReplicator = ipex.cpu.autocast._grad_scaler._MultiDeviceReplicator
9
+ _refresh_per_optimizer_state = (
10
+ ipex.cpu.autocast._grad_scaler._refresh_per_optimizer_state
11
+ )
12
+
13
+
14
+ def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
15
+ per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
16
+ per_device_found_inf = _MultiDeviceReplicator(found_inf)
17
+
18
+ # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
19
+ # There could be hundreds of grads, so we'd like to iterate through them just once.
20
+ # However, we don't know their devices or dtypes in advance.
21
+
22
+ # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
23
+ # Google says mypy struggles with defaultdicts type annotations.
24
+ per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list)) # type: ignore[var-annotated]
25
+ # sync grad to master weight
26
+ if hasattr(optimizer, "sync_grad"):
27
+ optimizer.sync_grad()
28
+ with torch.no_grad():
29
+ for group in optimizer.param_groups:
30
+ for param in group["params"]:
31
+ if param.grad is None:
32
+ continue
33
+ if (not allow_fp16) and param.grad.dtype == torch.float16:
34
+ raise ValueError("Attempting to unscale FP16 gradients.")
35
+ if param.grad.is_sparse:
36
+ # is_coalesced() == False means the sparse grad has values with duplicate indices.
37
+ # coalesce() deduplicates indices and adds all values that have the same index.
38
+ # For scaled fp16 values, there's a good chance coalescing will cause overflow,
39
+ # so we should check the coalesced _values().
40
+ if param.grad.dtype is torch.float16:
41
+ param.grad = param.grad.coalesce()
42
+ to_unscale = param.grad._values()
43
+ else:
44
+ to_unscale = param.grad
45
+
46
+ # -: is there a way to split by device and dtype without appending in the inner loop?
47
+ to_unscale = to_unscale.to("cpu")
48
+ per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(
49
+ to_unscale
50
+ )
51
+
52
+ for _, per_dtype_grads in per_device_and_dtype_grads.items():
53
+ for grads in per_dtype_grads.values():
54
+ core._amp_foreach_non_finite_check_and_unscale_(
55
+ grads,
56
+ per_device_found_inf.get("cpu"),
57
+ per_device_inv_scale.get("cpu"),
58
+ )
59
+
60
+ return per_device_found_inf._per_device_tensors
61
+
62
+
63
+ def unscale_(self, optimizer):
64
+ """
65
+ Divides ("unscales") the optimizer's gradient tensors by the scale factor.
66
+ :meth:`unscale_` is optional, serving cases where you need to
67
+ :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
68
+ between the backward pass(es) and :meth:`step`.
69
+ If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
70
+ Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
71
+ ...
72
+ scaler.scale(loss).backward()
73
+ scaler.unscale_(optimizer)
74
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
75
+ scaler.step(optimizer)
76
+ scaler.update()
77
+ Args:
78
+ optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
79
+ .. warning::
80
+ :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
81
+ and only after all gradients for that optimizer's assigned parameters have been accumulated.
82
+ Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
83
+ .. warning::
84
+ :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
85
+ """
86
+ if not self._enabled:
87
+ return
88
+
89
+ self._check_scale_growth_tracker("unscale_")
90
+
91
+ optimizer_state = self._per_optimizer_states[id(optimizer)]
92
+
93
+ if optimizer_state["stage"] is OptState.UNSCALED:
94
+ raise RuntimeError(
95
+ "unscale_() has already been called on this optimizer since the last update()."
96
+ )
97
+ elif optimizer_state["stage"] is OptState.STEPPED:
98
+ raise RuntimeError("unscale_() is being called after step().")
99
+
100
+ # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
101
+ assert self._scale is not None
102
+ inv_scale = (
103
+ self._scale.to("cpu").double().reciprocal().float().to(self._scale.device)
104
+ )
105
+ found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=self._scale.device)
106
+
107
+ optimizer_state["found_inf_per_device"] = self._unscale_grads_(
108
+ optimizer, inv_scale, found_inf, False
109
+ )
110
+ optimizer_state["stage"] = OptState.UNSCALED
111
+
112
+
113
+ def update(self, new_scale=None):
114
+ """
115
+ Updates the scale factor.
116
+ If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
117
+ to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
118
+ the scale is multiplied by ``growth_factor`` to increase it.
119
+ Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
120
+ used directly, it's used to fill GradScaler's internal scale tensor. So if
121
+ ``new_scale`` was a tensor, later in-place changes to that tensor will not further
122
+ affect the scale GradScaler uses internally.)
123
+ Args:
124
+ new_scale (float or :class:`torch.FloatTensor`, optional, default=None): New scale factor.
125
+ .. warning::
126
+ :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
127
+ been invoked for all optimizers used this iteration.
128
+ """
129
+ if not self._enabled:
130
+ return
131
+
132
+ _scale, _growth_tracker = self._check_scale_growth_tracker("update")
133
+
134
+ if new_scale is not None:
135
+ # Accept a new user-defined scale.
136
+ if isinstance(new_scale, float):
137
+ self._scale.fill_(new_scale) # type: ignore[union-attr]
138
+ else:
139
+ reason = "new_scale should be a float or a 1-element torch.FloatTensor with requires_grad=False."
140
+ assert isinstance(new_scale, torch.FloatTensor), reason # type: ignore[attr-defined]
141
+ assert new_scale.numel() == 1, reason
142
+ assert new_scale.requires_grad is False, reason
143
+ self._scale.copy_(new_scale) # type: ignore[union-attr]
144
+ else:
145
+ # Consume shared inf/nan data collected from optimizers to update the scale.
146
+ # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
147
+ found_infs = [
148
+ found_inf.to(device="cpu", non_blocking=True)
149
+ for state in self._per_optimizer_states.values()
150
+ for found_inf in state["found_inf_per_device"].values()
151
+ ]
152
+
153
+ assert len(found_infs) > 0, "No inf checks were recorded prior to update."
154
+
155
+ found_inf_combined = found_infs[0]
156
+ if len(found_infs) > 1:
157
+ for i in range(1, len(found_infs)):
158
+ found_inf_combined += found_infs[i]
159
+
160
+ to_device = _scale.device
161
+ _scale = _scale.to("cpu")
162
+ _growth_tracker = _growth_tracker.to("cpu")
163
+
164
+ core._amp_update_scale_(
165
+ _scale,
166
+ _growth_tracker,
167
+ found_inf_combined,
168
+ self._growth_factor,
169
+ self._backoff_factor,
170
+ self._growth_interval,
171
+ )
172
+
173
+ _scale = _scale.to(to_device)
174
+ _growth_tracker = _growth_tracker.to(to_device)
175
+ # To prepare for next iteration, clear the data collected from optimizers this iteration.
176
+ self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
177
+
178
+
179
+ def gradscaler_init():
180
+ torch.xpu.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler
181
+ torch.xpu.amp.GradScaler._unscale_grads_ = _unscale_grads_
182
+ torch.xpu.amp.GradScaler.unscale_ = unscale_
183
+ torch.xpu.amp.GradScaler.update = update
184
+ return torch.xpu.amp.GradScaler
rvc/lib/ipex/hijacks.py ADDED
@@ -0,0 +1,352 @@
1
+ import contextlib
2
+ import importlib
3
+
4
+ import intel_extension_for_pytorch as ipex
5
+ import torch
6
+
7
+
8
+ class CondFunc:
9
+ def __new__(cls, orig_func, sub_func, cond_func):
10
+ self = super(CondFunc, cls).__new__(cls)
11
+ if isinstance(orig_func, str):
12
+ func_path = orig_func.split(".")
13
+ for i in range(len(func_path) - 1, -1, -1):
14
+ try:
15
+ resolved_obj = importlib.import_module(".".join(func_path[:i]))
16
+ break
17
+ except ImportError:
18
+ pass
19
+ for attr_name in func_path[i:-1]:
20
+ resolved_obj = getattr(resolved_obj, attr_name)
21
+ orig_func = getattr(resolved_obj, func_path[-1])
22
+ setattr(
23
+ resolved_obj,
24
+ func_path[-1],
25
+ lambda *args, **kwargs: self(*args, **kwargs),
26
+ )
27
+ self.__init__(orig_func, sub_func, cond_func)
28
+ return lambda *args, **kwargs: self(*args, **kwargs)
29
+
30
+ def __init__(self, orig_func, sub_func, cond_func):
31
+ self.__orig_func = orig_func
32
+ self.__sub_func = sub_func
33
+ self.__cond_func = cond_func
34
+
35
+ def __call__(self, *args, **kwargs):
36
+ if not self.__cond_func or self.__cond_func(self.__orig_func, *args, **kwargs):
37
+ return self.__sub_func(self.__orig_func, *args, **kwargs)
38
+ else:
39
+ return self.__orig_func(*args, **kwargs)
40
+
41
+
42
+ _utils = torch.utils.data._utils
43
+
44
+
45
+ def _shutdown_workers(self):
46
+ if (
47
+ torch.utils.data._utils is None
48
+ or torch.utils.data._utils.python_exit_status is True
49
+ or torch.utils.data._utils.python_exit_status is None
50
+ ):
51
+ return
52
+ if hasattr(self, "_shutdown") and not self._shutdown:
53
+ self._shutdown = True
54
+ try:
55
+ if hasattr(self, "_pin_memory_thread"):
56
+ self._pin_memory_thread_done_event.set()
57
+ self._worker_result_queue.put((None, None))
58
+ self._pin_memory_thread.join()
59
+ self._worker_result_queue.cancel_join_thread()
60
+ self._worker_result_queue.close()
61
+ self._workers_done_event.set()
62
+ for worker_id in range(len(self._workers)):
63
+ if self._persistent_workers or self._workers_status[worker_id]:
64
+ self._mark_worker_as_unavailable(worker_id, shutdown=True)
65
+ for w in self._workers:
66
+ w.join(timeout=torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL)
67
+ for q in self._index_queues:
68
+ q.cancel_join_thread()
69
+ q.close()
70
+ finally:
71
+ if self._worker_pids_set:
72
+ torch.utils.data._utils.signal_handling._remove_worker_pids(id(self))
73
+ self._worker_pids_set = False
74
+ for w in self._workers:
75
+ if w.is_alive():
76
+ w.terminate()
77
+
78
+
79
+ class DummyDataParallel(torch.nn.Module):
80
+ def __new__(cls, module, device_ids=None, output_device=None, dim=0):
81
+ if isinstance(device_ids, list) and len(device_ids) > 1:
82
+ print("IPEX backend doesn't support DataParallel on multiple XPU devices")
83
+ return module.to("xpu")
84
+
85
+
86
+ def return_null_context(*args, **kwargs):
87
+ return contextlib.nullcontext()
88
+
89
+
90
+ def check_device(device):
91
+ return bool(
92
+ (isinstance(device, torch.device) and device.type == "cuda")
93
+ or (isinstance(device, str) and "cuda" in device)
94
+ or isinstance(device, int)
95
+ )
96
+
97
+
98
+ def return_xpu(device):
99
+ return (
100
+ f"xpu:{device[-1]}"
101
+ if isinstance(device, str) and ":" in device
102
+ else f"xpu:{device}"
103
+ if isinstance(device, int)
104
+ else torch.device("xpu")
105
+ if isinstance(device, torch.device)
106
+ else "xpu"
107
+ )
108
+
109
+
110
+ def ipex_no_cuda(orig_func, *args, **kwargs):
111
+ torch.cuda.is_available = lambda: False
112
+ orig_func(*args, **kwargs)
113
+ torch.cuda.is_available = torch.xpu.is_available
114
+
115
+
116
+ original_autocast = torch.autocast
117
+
118
+
119
+ def ipex_autocast(*args, **kwargs):
120
+ if len(args) > 0 and args[0] == "cuda":
121
+ return original_autocast("xpu", *args[1:], **kwargs)
122
+ else:
123
+ return original_autocast(*args, **kwargs)
124
+
125
+
126
+ original_torch_cat = torch.cat
127
+
128
+
129
+ def torch_cat(tensor, *args, **kwargs):
130
+ if len(tensor) == 3 and (
131
+ tensor[0].dtype != tensor[1].dtype or tensor[2].dtype != tensor[1].dtype
132
+ ):
133
+ return original_torch_cat(
134
+ [tensor[0].to(tensor[1].dtype), tensor[1], tensor[2].to(tensor[1].dtype)],
135
+ *args,
136
+ **kwargs,
137
+ )
138
+ else:
139
+ return original_torch_cat(tensor, *args, **kwargs)
140
+
141
+
142
+ original_interpolate = torch.nn.functional.interpolate
143
+
144
+
145
+ def interpolate(
146
+ tensor,
147
+ size=None,
148
+ scale_factor=None,
149
+ mode="nearest",
150
+ align_corners=None,
151
+ recompute_scale_factor=None,
152
+ antialias=False,
153
+ ):
154
+ if antialias or align_corners is not None:
155
+ return_device = tensor.device
156
+ return_dtype = tensor.dtype
157
+ return original_interpolate(
158
+ tensor.to("cpu", dtype=torch.float32),
159
+ size=size,
160
+ scale_factor=scale_factor,
161
+ mode=mode,
162
+ align_corners=align_corners,
163
+ recompute_scale_factor=recompute_scale_factor,
164
+ antialias=antialias,
165
+ ).to(return_device, dtype=return_dtype)
166
+ else:
167
+ return original_interpolate(
168
+ tensor,
169
+ size=size,
170
+ scale_factor=scale_factor,
171
+ mode=mode,
172
+ align_corners=align_corners,
173
+ recompute_scale_factor=recompute_scale_factor,
174
+ antialias=antialias,
175
+ )
176
+
177
+
178
+ original_linalg_solve = torch.linalg.solve
179
+
180
+
181
+ def linalg_solve(A, B, *args, **kwargs):
182
+ if A.device != torch.device("cpu") or B.device != torch.device("cpu"):
183
+ return_device = A.device
184
+ return original_linalg_solve(A.to("cpu"), B.to("cpu"), *args, **kwargs).to(
185
+ return_device
186
+ )
187
+ else:
188
+ return original_linalg_solve(A, B, *args, **kwargs)
189
+
190
+
191
+ def ipex_hijacks():
192
+ CondFunc(
193
+ "torch.Tensor.to",
194
+ lambda orig_func, self, device=None, *args, **kwargs: orig_func(
195
+ self, return_xpu(device), *args, **kwargs
196
+ ),
197
+ lambda orig_func, self, device=None, *args, **kwargs: check_device(device),
198
+ )
199
+ CondFunc(
200
+ "torch.Tensor.cuda",
201
+ lambda orig_func, self, device=None, *args, **kwargs: orig_func(
202
+ self, return_xpu(device), *args, **kwargs
203
+ ),
204
+ lambda orig_func, self, device=None, *args, **kwargs: check_device(device),
205
+ )
206
+ CondFunc(
207
+ "torch.empty",
208
+ lambda orig_func, *args, device=None, **kwargs: orig_func(
209
+ *args, device=return_xpu(device), **kwargs
210
+ ),
211
+ lambda orig_func, *args, device=None, **kwargs: check_device(device),
212
+ )
213
+ CondFunc(
214
+ "torch.load",
215
+ lambda orig_func, *args, map_location=None, **kwargs: orig_func(
216
+ *args, return_xpu(map_location), **kwargs
217
+ ),
218
+ lambda orig_func, *args, map_location=None, **kwargs: map_location is None
219
+ or check_device(map_location),
220
+ )
221
+ CondFunc(
222
+ "torch.randn",
223
+ lambda orig_func, *args, device=None, **kwargs: orig_func(
224
+ *args, device=return_xpu(device), **kwargs
225
+ ),
226
+ lambda orig_func, *args, device=None, **kwargs: check_device(device),
227
+ )
228
+ CondFunc(
229
+ "torch.ones",
230
+ lambda orig_func, *args, device=None, **kwargs: orig_func(
231
+ *args, device=return_xpu(device), **kwargs
232
+ ),
233
+ lambda orig_func, *args, device=None, **kwargs: check_device(device),
234
+ )
235
+ CondFunc(
236
+ "torch.zeros",
237
+ lambda orig_func, *args, device=None, **kwargs: orig_func(
238
+ *args, device=return_xpu(device), **kwargs
239
+ ),
240
+ lambda orig_func, *args, device=None, **kwargs: check_device(device),
241
+ )
242
+ CondFunc(
243
+ "torch.tensor",
244
+ lambda orig_func, *args, device=None, **kwargs: orig_func(
245
+ *args, device=return_xpu(device), **kwargs
246
+ ),
247
+ lambda orig_func, *args, device=None, **kwargs: check_device(device),
248
+ )
249
+ CondFunc(
250
+ "torch.linspace",
251
+ lambda orig_func, *args, device=None, **kwargs: orig_func(
252
+ *args, device=return_xpu(device), **kwargs
253
+ ),
254
+ lambda orig_func, *args, device=None, **kwargs: check_device(device),
255
+ )
256
+
257
+ CondFunc(
258
+ "torch.Generator",
259
+ lambda orig_func, device=None: torch.xpu.Generator(device),
260
+ lambda orig_func, device=None: device is not None
261
+ and device != torch.device("cpu")
262
+ and device != "cpu",
263
+ )
264
+
265
+ CondFunc(
266
+ "torch.batch_norm",
267
+ lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(
268
+ input,
269
+ weight
270
+ if weight is not None
271
+ else torch.ones(input.size()[1], device=input.device),
272
+ bias
273
+ if bias is not None
274
+ else torch.zeros(input.size()[1], device=input.device),
275
+ *args,
276
+ **kwargs,
277
+ ),
278
+ lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"),
279
+ )
280
+ CondFunc(
281
+ "torch.instance_norm",
282
+ lambda orig_func, input, weight, bias, *args, **kwargs: orig_func(
283
+ input,
284
+ weight
285
+ if weight is not None
286
+ else torch.ones(input.size()[1], device=input.device),
287
+ bias
288
+ if bias is not None
289
+ else torch.zeros(input.size()[1], device=input.device),
290
+ *args,
291
+ **kwargs,
292
+ ),
293
+ lambda orig_func, input, *args, **kwargs: input.device != torch.device("cpu"),
294
+ )
295
+
296
+ # Functions with dtype errors:
297
+ CondFunc(
298
+ "torch.nn.modules.GroupNorm.forward",
299
+ lambda orig_func, self, input: orig_func(
300
+ self, input.to(self.weight.data.dtype)
301
+ ),
302
+ lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
303
+ )
304
+ CondFunc(
305
+ "torch.nn.modules.linear.Linear.forward",
306
+ lambda orig_func, self, input: orig_func(
307
+ self, input.to(self.weight.data.dtype)
308
+ ),
309
+ lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
310
+ )
311
+ CondFunc(
312
+ "torch.nn.modules.conv.Conv2d.forward",
313
+ lambda orig_func, self, input: orig_func(
314
+ self, input.to(self.weight.data.dtype)
315
+ ),
316
+ lambda orig_func, self, input: input.dtype != self.weight.data.dtype,
317
+ )
318
+ CondFunc(
319
+ "torch.nn.functional.layer_norm",
320
+ lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: orig_func(
321
+ input.to(weight.data.dtype), normalized_shape, weight, *args, **kwargs
322
+ ),
323
+ lambda orig_func, input, normalized_shape=None, weight=None, *args, **kwargs: weight
324
+ is not None
325
+ and input.dtype != weight.data.dtype,
326
+ )
327
+
328
+ # Diffusers Float64 (ARC GPUs doesn't support double or Float64):
329
+ if not torch.xpu.has_fp64_dtype():
330
+ CondFunc(
331
+ "torch.from_numpy",
332
+ lambda orig_func, ndarray: orig_func(ndarray.astype("float32")),
333
+ lambda orig_func, ndarray: ndarray.dtype == float,
334
+ )
335
+
336
+ # Broken functions when torch.cuda.is_available is True:
337
+ CondFunc(
338
+ "torch.utils.data.dataloader._BaseDataLoaderIter.__init__",
339
+ lambda orig_func, *args, **kwargs: ipex_no_cuda(orig_func, *args, **kwargs),
340
+ lambda orig_func, *args, **kwargs: True,
341
+ )
342
+
343
+ # Functions that make compile mad with CondFunc:
344
+ torch.utils.data.dataloader._MultiProcessingDataLoaderIter._shutdown_workers = (
345
+ _shutdown_workers
346
+ )
347
+ torch.nn.DataParallel = DummyDataParallel
348
+ torch.autocast = ipex_autocast
349
+ torch.cat = torch_cat
350
+ torch.linalg.solve = linalg_solve
351
+ torch.nn.functional.interpolate = interpolate
352
+ torch.backends.cuda.sdp_kernel = return_null_context
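Note (not part of the commit): CondFunc is the generic conditional monkey-patch helper behind the hijacks above: it resolves a dotted attribute path, replaces the attribute with a wrapper, and only routes a call through sub_func when cond_func returns True. A standalone sketch of the same pattern on a hypothetical target (not one this module patches):

import torch
from rvc.lib.ipex.hijacks import CondFunc  # assumed import path

# Redirect torch.arange(..., device="cuda...") to the CPU; every other call hits the original.
CondFunc(
    "torch.arange",
    lambda orig_func, *args, device=None, **kwargs: orig_func(*args, device="cpu", **kwargs),
    lambda orig_func, *args, device=None, **kwargs: isinstance(device, str) and "cuda" in device,
)
print(torch.arange(3, device="cuda").device)  # -> cpu, without ever touching the CUDA runtime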
rvc/lib/jit/__init__.py ADDED
@@ -0,0 +1,164 @@
1
+ import pickle
2
+ import time
3
+ from collections import OrderedDict
4
+ from io import BytesIO
5
+
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+
10
+ def load_inputs(path, device, is_half=False):
11
+ parm = torch.load(path, map_location=torch.device("cpu"))
12
+ for key in parm.keys():
13
+ parm[key] = parm[key].to(device)
14
+ if is_half and parm[key].dtype == torch.float32:
15
+ parm[key] = parm[key].half()
16
+ elif not is_half and parm[key].dtype == torch.float16:
17
+ parm[key] = parm[key].float()
18
+ return parm
19
+
20
+
21
+ def benchmark(
22
+ model, inputs_path, device=torch.device("cpu"), epoch=1000, is_half=False
23
+ ):
24
+ parm = load_inputs(inputs_path, device, is_half)
25
+ total_ts = 0.0
26
+ bar = tqdm(range(epoch))
27
+ for i in bar:
28
+ start_time = time.perf_counter()
29
+ o = model(**parm)
30
+ total_ts += time.perf_counter() - start_time
31
+ print(f"num_epoch: {epoch} | avg time(ms): {(total_ts*1000)/epoch}")
32
+
33
+
34
+ def jit_warm_up(model, inputs_path, device=torch.device("cpu"), epoch=5, is_half=False):
35
+ benchmark(model, inputs_path, device, epoch=epoch, is_half=is_half)
36
+
37
+
38
+ def to_jit_model(
39
+ model_path,
40
+ model_type: str,
41
+ mode: str = "trace",
42
+ inputs_path: str = None,
43
+ device=torch.device("cpu"),
44
+ is_half=False,
45
+ ):
46
+ model = None
47
+ if model_type.lower() == "synthesizer":
48
+ from .get_synthesizer import get_synthesizer
49
+
50
+ model, _ = get_synthesizer(model_path, device)
51
+ model.forward = model.infer
52
+ elif model_type.lower() == "rmvpe":
53
+ from .get_rmvpe import get_rmvpe
54
+
55
+ model = get_rmvpe(model_path, device)
56
+ elif model_type.lower() == "hubert":
57
+ from .get_hubert import get_hubert_model
58
+
59
+ model = get_hubert_model(model_path, device)
60
+ model.forward = model.infer
61
+ else:
62
+ raise ValueError(f"No model type named {model_type}")
63
+ model = model.eval()
64
+ model = model.half() if is_half else model.float()
65
+ if mode == "trace":
66
+ assert inputs_path is not None  # example inputs are required for tracing
67
+ inputs = load_inputs(inputs_path, device, is_half)
68
+ model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
69
+ elif mode == "script":
70
+ model_jit = torch.jit.script(model)
71
+ model_jit.to(device)
72
+ model_jit = model_jit.half() if is_half else model_jit.float()
73
+ # model = model.half() if is_half else model.float()
74
+ return (model, model_jit)
75
+
76
+
77
+ def export(
78
+ model: torch.nn.Module,
79
+ mode: str = "trace",
80
+ inputs: dict = None,
81
+ device=torch.device("cpu"),
82
+ is_half: bool = False,
83
+ ) -> dict:
84
+ model = model.half() if is_half else model.float()
85
+ model.eval()
86
+ if mode == "trace":
87
+ assert inputs is not None
88
+ model_jit = torch.jit.trace(model, example_kwarg_inputs=inputs)
89
+ elif mode == "script":
90
+ model_jit = torch.jit.script(model)
91
+ model_jit.to(device)
92
+ model_jit = model_jit.half() if is_half else model_jit.float()
93
+ buffer = BytesIO()
94
+ # model_jit=model_jit.cpu()
95
+ torch.jit.save(model_jit, buffer)
96
+ del model_jit
97
+ cpt = OrderedDict()
98
+ cpt["model"] = buffer.getvalue()
99
+ cpt["is_half"] = is_half
100
+ return cpt
101
+
102
+
103
+ def load(path: str):
104
+ with open(path, "rb") as f:
105
+ return pickle.load(f)
106
+
107
+
108
+ def save(ckpt: dict, save_path: str):
109
+ with open(save_path, "wb") as f:
110
+ pickle.dump(ckpt, f)
111
+
112
+
113
+ def rmvpe_jit_export(
114
+ model_path: str,
115
+ mode: str = "script",
116
+ inputs_path: str = None,
117
+ save_path: str = None,
118
+ device=torch.device("cpu"),
119
+ is_half=False,
120
+ ):
121
+ if not save_path:
122
+ save_path = model_path[:-4] if model_path.endswith(".pth") else model_path
123
+ save_path += ".half.jit" if is_half else ".jit"
124
+ if "cuda" in str(device) and ":" not in str(device):
125
+ device = torch.device("cuda:0")
126
+ from .get_rmvpe import get_rmvpe
127
+
128
+ model = get_rmvpe(model_path, device)
129
+ inputs = None
130
+ if mode == "trace":
131
+ inputs = load_inputs(inputs_path, device, is_half)
132
+ ckpt = export(model, mode, inputs, device, is_half)
133
+ ckpt["device"] = str(device)
134
+ save(ckpt, save_path)
135
+ return ckpt
136
+
137
+
138
+ def synthesizer_jit_export(
139
+ model_path: str,
140
+ mode: str = "script",
141
+ inputs_path: str = None,
142
+ save_path: str = None,
143
+ device=torch.device("cpu"),
144
+ is_half=False,
145
+ ):
146
+ if not save_path:
147
+ save_path = model_path[:-4] if model_path.endswith(".pth") else model_path
148
+ save_path += ".half.jit" if is_half else ".jit"
149
+ if "cuda" in str(device) and ":" not in str(device):
150
+ device = torch.device("cuda:0")
151
+ from .get_synthesizer import get_synthesizer
152
+
153
+ model, cpt = get_synthesizer(model_path, device)
154
+ assert isinstance(cpt, dict)
155
+ model.forward = model.infer
156
+ inputs = None
157
+ if mode == "trace":
158
+ inputs = load_inputs(inputs_path, device, is_half)
159
+ ckpt = export(model, mode, inputs, device, is_half)
160
+ cpt.pop("weight")
161
+ cpt["model"] = ckpt["model"]
162
+ cpt["device"] = device
163
+ save(cpt, save_path)
164
+ return cpt
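Note (not part of the commit): export() serializes a TorchScript module into a plain dict (the JIT program bytes under "model"), and rmvpe_jit_export()/synthesizer_jit_export() pickle that dict next to the original .pth. A hedged round-trip sketch (file names hypothetical, import path assumed):

from io import BytesIO

import torch
from rvc.lib.jit import load, synthesizer_jit_export

# Script-mode export writes "<model>.jit" (or ".half.jit") next to the checkpoint.
cpt = synthesizer_jit_export("weights/my_voice.pth", mode="script", device=torch.device("cpu"))

ckpt = load("weights/my_voice.jit")                # plain pickled dict written by save()
scripted = torch.jit.load(BytesIO(ckpt["model"]))  # recover the TorchScript synthesizer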
rvc/lib/jit/get_hubert.py ADDED
@@ -0,0 +1,343 @@
1
+ import math
2
+ import random
3
+ from typing import Optional, Tuple
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from fairseq.checkpoint_utils import load_model_ensemble_and_task
9
+
10
+ # from fairseq.data.data_utils import compute_mask_indices
11
+ from fairseq.utils import index_put
12
+
13
+
14
+ # @torch.jit.script
15
+ def pad_to_multiple(x, multiple, dim=-1, value=0):
16
+ # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
17
+ if x is None:
18
+ return None, 0
19
+ tsz = x.size(dim)
20
+ m = tsz / multiple
21
+ remainder = math.ceil(m) * multiple - tsz
22
+ if int(tsz % multiple) == 0:
23
+ return x, 0
24
+ pad_offset = (0,) * (-1 - dim) * 2
25
+
26
+ return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder
27
+
28
+
29
+ def extract_features(
30
+ self,
31
+ x,
32
+ padding_mask=None,
33
+ tgt_layer=None,
34
+ min_layer=0,
35
+ ):
36
+ if padding_mask is not None:
37
+ x = index_put(x, padding_mask, 0)
38
+
39
+ x_conv = self.pos_conv(x.transpose(1, 2))
40
+ x_conv = x_conv.transpose(1, 2)
41
+ x = x + x_conv
42
+
43
+ if not self.layer_norm_first:
44
+ x = self.layer_norm(x)
45
+
46
+ # pad to the sequence length dimension
47
+ x, pad_length = pad_to_multiple(x, self.required_seq_len_multiple, dim=-2, value=0)
48
+ if pad_length > 0 and padding_mask is None:
49
+ padding_mask = x.new_zeros((x.size(0), x.size(1)), dtype=torch.bool)
50
+ padding_mask[:, -pad_length:] = True
51
+ else:
52
+ padding_mask, _ = pad_to_multiple(
53
+ padding_mask, self.required_seq_len_multiple, dim=-1, value=True
54
+ )
55
+ x = F.dropout(x, p=self.dropout, training=self.training)
56
+
57
+ # B x T x C -> T x B x C
58
+ x = x.transpose(0, 1)
59
+
60
+ layer_results = []
61
+ r = None
62
+ for i, layer in enumerate(self.layers):
63
+ dropout_probability = np.random.random() if self.layerdrop > 0 else 1
64
+ if not self.training or (dropout_probability > self.layerdrop):
65
+ x, (z, lr) = layer(
66
+ x, self_attn_padding_mask=padding_mask, need_weights=False
67
+ )
68
+ if i >= min_layer:
69
+ layer_results.append((x, z, lr))
70
+ if i == tgt_layer:
71
+ r = x
72
+ break
73
+
74
+ if r is not None:
75
+ x = r
76
+
77
+ # T x B x C -> B x T x C
78
+ x = x.transpose(0, 1)
79
+
80
+ # undo padding
81
+ if pad_length > 0:
82
+ x = x[:, :-pad_length]
83
+
84
+ def undo_pad(a, b, c):
85
+ return (
86
+ a[:-pad_length],
87
+ b[:-pad_length] if b is not None else b,
88
+ c[:-pad_length],
89
+ )
90
+
91
+ layer_results = [undo_pad(*u) for u in layer_results]
92
+
93
+ return x, layer_results
94
+
95
+
96
+ def compute_mask_indices(
97
+ shape: Tuple[int, int],
98
+ padding_mask: Optional[torch.Tensor],
99
+ mask_prob: float,
100
+ mask_length: int,
101
+ mask_type: str = "static",
102
+ mask_other: float = 0.0,
103
+ min_masks: int = 0,
104
+ no_overlap: bool = False,
105
+ min_space: int = 0,
106
+ require_same_masks: bool = True,
107
+ mask_dropout: float = 0.0,
108
+ ) -> torch.Tensor:
109
+ """
110
+ Computes random mask spans for a given shape
111
+
112
+ Args:
113
+ shape: the shape for which to compute masks.
114
+ should be of size 2 where first element is batch size and 2nd is timesteps
115
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
116
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
117
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
118
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
119
+ mask_type: how to compute mask lengths
120
+ static = fixed size
121
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
122
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
123
+ poisson = sample from Poisson distribution with lambda = mask length
124
+ min_masks: minimum number of masked spans
125
+ no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
126
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
127
+ require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
128
+ mask_dropout: randomly dropout this percentage of masks in each example
129
+ """
130
+
131
+ bsz, all_sz = shape
132
+ mask = torch.full((bsz, all_sz), False)
133
+
134
+ all_num_mask = int(
135
+ # add a random number for probabilistic rounding
136
+ mask_prob * all_sz / float(mask_length)
137
+ + torch.rand([1]).item()
138
+ )
139
+
140
+ all_num_mask = max(min_masks, all_num_mask)
141
+
142
+ mask_idcs = []
143
+ for i in range(bsz):
144
+ if padding_mask is not None:
145
+ sz = all_sz - padding_mask[i].long().sum().item()
146
+ num_mask = int(mask_prob * sz / float(mask_length) + np.random.rand())
147
+ num_mask = max(min_masks, num_mask)
148
+ else:
149
+ sz = all_sz
150
+ num_mask = all_num_mask
151
+
152
+ if mask_type == "static":
153
+ lengths = torch.full([num_mask], mask_length)
154
+ elif mask_type == "uniform":
155
+ lengths = torch.randint(mask_other, mask_length * 2 + 1, size=[num_mask])
156
+ elif mask_type == "normal":
157
+ lengths = torch.normal(mask_length, mask_other, size=[num_mask])
158
+ lengths = [max(1, int(round(x))) for x in lengths]
159
+ else:
160
+ raise Exception("unknown mask selection " + mask_type)
161
+
162
+ if sum(lengths) == 0:
163
+ lengths[0] = min(mask_length, sz - 1)
164
+
165
+ if no_overlap:
166
+ mask_idc = []
167
+
168
+ def arrange(s, e, length, keep_length):
169
+ span_start = torch.randint(low=s, high=e - length, size=[1]).item()
170
+ mask_idc.extend(span_start + i for i in range(length))
171
+
172
+ new_parts = []
173
+ if span_start - s - min_space >= keep_length:
174
+ new_parts.append((s, span_start - min_space + 1))
175
+ if e - span_start - length - min_space > keep_length:
176
+ new_parts.append((span_start + length + min_space, e))
177
+ return new_parts
178
+
179
+ parts = [(0, sz)]
180
+ min_length = min(lengths)
181
+ for length in sorted(lengths, reverse=True):
182
+ t = [e - s if e - s >= length + min_space else 0 for s, e in parts]
183
+ lens = torch.asarray(t, dtype=torch.int)
184
+ l_sum = torch.sum(lens)
185
+ if l_sum == 0:
186
+ break
187
+ probs = lens / torch.sum(lens)
188
+ c = torch.multinomial(probs.float(), len(parts)).item()
189
+ s, e = parts.pop(c)
190
+ parts.extend(arrange(s, e, length, min_length))
191
+ mask_idc = torch.asarray(mask_idc)
192
+ else:
193
+ min_len = min(lengths)
194
+ if sz - min_len <= num_mask:
195
+ min_len = sz - num_mask - 1
196
+ mask_idc = torch.asarray(
197
+ random.sample([i for i in range(sz - min_len)], num_mask)
198
+ )
199
+ mask_idc = torch.asarray(
200
+ [
201
+ mask_idc[j] + offset
202
+ for j in range(len(mask_idc))
203
+ for offset in range(lengths[j])
204
+ ]
205
+ )
206
+
207
+ mask_idcs.append(torch.unique(mask_idc[mask_idc < sz]))
208
+
209
+ min_len = min([len(m) for m in mask_idcs])
210
+ for i, mask_idc in enumerate(mask_idcs):
211
+ if isinstance(mask_idc, torch.Tensor):
212
+ mask_idc = torch.asarray(mask_idc, dtype=torch.float)
213
+ if len(mask_idc) > min_len and require_same_masks:
214
+ mask_idc = torch.asarray(
215
+ random.sample([i for i in range(len(mask_idc))], min_len)
216
+ )
217
+ if mask_dropout > 0:
218
+ num_holes = int(round(len(mask_idc) * mask_dropout))
219
+ mask_idc = torch.asarray(
220
+ random.sample([i for i in range(len(mask_idc))], len(mask_idc) - num_holes)
221
+ )
222
+
223
+ mask[i, mask_idc.int()] = True
224
+
225
+ return mask
226
+
227
+
228
+ def apply_mask(self, x, padding_mask, target_list):
229
+ B, T, C = x.shape
230
+ torch.zeros_like(x)
231
+ if self.mask_prob > 0:
232
+ mask_indices = compute_mask_indices(
233
+ (B, T),
234
+ padding_mask,
235
+ self.mask_prob,
236
+ self.mask_length,
237
+ self.mask_selection,
238
+ self.mask_other,
239
+ min_masks=2,
240
+ no_overlap=self.no_mask_overlap,
241
+ min_space=self.mask_min_space,
242
+ )
243
+ mask_indices = mask_indices.to(x.device)
244
+ x[mask_indices] = self.mask_emb
245
+ else:
246
+ mask_indices = None
247
+
248
+ if self.mask_channel_prob > 0:
249
+ mask_channel_indices = compute_mask_indices(
250
+ (B, C),
251
+ None,
252
+ self.mask_channel_prob,
253
+ self.mask_channel_length,
254
+ self.mask_channel_selection,
255
+ self.mask_channel_other,
256
+ no_overlap=self.no_mask_channel_overlap,
257
+ min_space=self.mask_channel_min_space,
258
+ )
259
+ mask_channel_indices = (
260
+ mask_channel_indices.to(x.device).unsqueeze(1).expand(-1, T, -1)
261
+ )
262
+ x[mask_channel_indices] = 0
263
+
264
+ return x, mask_indices
265
+
266
+
267
+ def get_hubert_model(
268
+ model_path="assets/hubert/hubert_base.pt", device=torch.device("cpu")
269
+ ):
270
+ models, _, _ = load_model_ensemble_and_task(
271
+ [model_path],
272
+ suffix="",
273
+ )
274
+ hubert_model = models[0]
275
+ hubert_model = hubert_model.to(device)
276
+
277
+ def _apply_mask(x, padding_mask, target_list):
278
+ return apply_mask(hubert_model, x, padding_mask, target_list)
279
+
280
+ hubert_model.apply_mask = _apply_mask
281
+
282
+ def _extract_features(
283
+ x,
284
+ padding_mask=None,
285
+ tgt_layer=None,
286
+ min_layer=0,
287
+ ):
288
+ return extract_features(
289
+ hubert_model.encoder,
290
+ x,
291
+ padding_mask=padding_mask,
292
+ tgt_layer=tgt_layer,
293
+ min_layer=min_layer,
294
+ )
295
+
296
+ hubert_model.encoder.extract_features = _extract_features
297
+
298
+ hubert_model._forward = hubert_model.forward
299
+
300
+ def hubert_extract_features(
301
+ self,
302
+ source: torch.Tensor,
303
+ padding_mask: Optional[torch.Tensor] = None,
304
+ mask: bool = False,
305
+ ret_conv: bool = False,
306
+ output_layer: Optional[int] = None,
307
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
308
+ res = self._forward(
309
+ source,
310
+ padding_mask=padding_mask,
311
+ mask=mask,
312
+ features_only=True,
313
+ output_layer=output_layer,
314
+ )
315
+ feature = res["features"] if ret_conv else res["x"]
316
+ return feature, res["padding_mask"]
317
+
318
+ def _hubert_extract_features(
319
+ source: torch.Tensor,
320
+ padding_mask: Optional[torch.Tensor] = None,
321
+ mask: bool = False,
322
+ ret_conv: bool = False,
323
+ output_layer: Optional[int] = None,
324
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
325
+ return hubert_extract_features(
326
+ hubert_model, source, padding_mask, mask, ret_conv, output_layer
327
+ )
328
+
329
+ hubert_model.extract_features = _hubert_extract_features
330
+
331
+ def infer(source, padding_mask, output_layer: torch.Tensor):
332
+ output_layer = output_layer.item()
333
+ logits = hubert_model.extract_features(
334
+ source=source, padding_mask=padding_mask, output_layer=output_layer
335
+ )
336
+ feats = hubert_model.final_proj(logits[0]) if output_layer == 9 else logits[0]
337
+ return feats
338
+
339
+ hubert_model.infer = infer
340
+ # hubert_model.forward=infer
341
+ # hubert_model.forward
342
+
343
+ return hubert_model
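For context, a minimal usage sketch (not part of the commit) of the patched model returned by get_hubert_model. It assumes this file is importable as rvc.lib.jit.get_hubert, that fairseq and the default hubert_base.pt checkpoint are available, and that the input is 16 kHz mono audio; output_layer 9 (with final_proj) is the convention RVC uses for v1 models, while v2 models typically read layer 12.

import torch
from rvc.lib.jit.get_hubert import get_hubert_model  # assumed module path for the file above

device = torch.device("cpu")
hubert = get_hubert_model(device=device).eval()       # uses the default checkpoint path

wav16k = torch.randn(1, 16000)                        # placeholder: 1 s of 16 kHz mono audio
padding_mask = torch.zeros_like(wav16k).bool()        # no padded samples
with torch.no_grad():
    feats = hubert.infer(wav16k, padding_mask, torch.tensor(9))  # layer 9 + final_proj (v1)
print(feats.shape)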
rvc/lib/jit/get_rmvpe.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+
+
+ def get_rmvpe(model_path="assets/rmvpe/rmvpe.pt", device=torch.device("cpu")):
+     from infer.lib.rmvpe import E2E
+
+     model = E2E(4, 1, (2, 2))
+     ckpt = torch.load(model_path, map_location=device)
+     model.load_state_dict(ckpt)
+     model.eval()
+     model = model.to(device)
+     return model
rvc/lib/jit/get_synthesizer.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+
+
+ def get_synthesizer(pth_path, device=torch.device("cpu")):
+     from infer.lib.infer_pack.models import (
+         SynthesizerTrnMs256NSFsid,
+         SynthesizerTrnMs256NSFsid_nono,
+         SynthesizerTrnMs768NSFsid,
+         SynthesizerTrnMs768NSFsid_nono,
+     )
+
+     cpt = torch.load(pth_path, map_location=torch.device("cpu"))
+     # tgt_sr = cpt["config"][-1]
+     cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0]
+     if_f0 = cpt.get("f0", 1)
+     version = cpt.get("version", "v1")
+     if version == "v1":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=False)
+         else:
+             net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"])
+     elif version == "v2":
+         if if_f0 == 1:
+             net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
+         else:
+             net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"])
+     del net_g.enc_q
+     # net_g.forward = net_g.infer
+     # ckpt = {}
+     # ckpt["config"] = cpt["config"]
+     # ckpt["f0"] = if_f0
+     # ckpt["version"] = version
+     # ckpt["info"] = cpt.get("info", "0epoch")
+     net_g.load_state_dict(cpt["weight"], strict=False)
+     net_g = net_g.float()
+     net_g.eval().to(device)
+     return net_g, cpt
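A hedged usage sketch (illustrative, not part of the commit), assuming the file is importable as rvc.lib.jit.get_synthesizer and a trained RVC .pth checkpoint is at hand. Note that the function imports from infer.lib.infer_pack.models, so either that upstream package layout is present or the import would need to point at rvc.lib.infer_pack.models shipped in this repo.

import torch
from rvc.lib.jit.get_synthesizer import get_synthesizer  # assumed module path for the file above

net_g, cpt = get_synthesizer("my_voice.pth", torch.device("cpu"))  # placeholder checkpoint
tgt_sr = cpt["config"][-1]   # target sampling rate stored in the checkpoint config
n_spk = cpt["config"][-3]    # speaker count, patched above from emb_g.weight
print(type(net_g).__name__, tgt_sr, n_spk)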
rvc/lib/rmvpe.py ADDED
@@ -0,0 +1,665 @@
1
+ import os
2
+ from io import BytesIO
3
+ from typing import List, Optional, Tuple
4
+
5
+ import numpy as np
6
+ import torch
7
+
8
+ from rvc.lib import jit
9
+
10
+ try:
11
+ # Fix "Torch not compiled with CUDA enabled"
12
+ import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
13
+
14
+ if torch.xpu.is_available():
15
+ from rvc.lib.ipex import ipex_init
16
+
17
+ ipex_init()
18
+ except Exception: # pylint: disable=broad-exception-caught
19
+ pass
20
+ import logging
21
+ from time import time as ttime
22
+
23
+ import torch.nn as nn
24
+ import torch.nn.functional as F
25
+ from librosa.filters import mel
26
+ from librosa.util import normalize, pad_center, tiny
27
+ from scipy.signal import get_window
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ class STFT(torch.nn.Module):
33
+ def __init__(
34
+ self, filter_length=1024, hop_length=512, win_length=None, window="hann"
35
+ ):
36
+ """
37
+ This module implements an STFT using 1D convolution and 1D transpose convolutions.
38
+ This is a bit tricky so there are some cases that probably won't work as working
39
+ out the same sizes before and after in all overlap add setups is tough. Right now,
40
+ this code should work with hop lengths that are half the filter length (50% overlap
41
+ between frames).
42
+
43
+ Keyword Arguments:
44
+ filter_length {int} -- Length of filters used (default: {1024})
45
+ hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512})
46
+ win_length {[type]} -- Length of the window function applied to each frame (if not specified, it
47
+ equals the filter length). (default: {None})
48
+ window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris)
49
+ (default: {'hann'})
50
+ """
51
+ super(STFT, self).__init__()
52
+ self.filter_length = filter_length
53
+ self.hop_length = hop_length
54
+ self.win_length = win_length if win_length else filter_length
55
+ self.window = window
56
+ self.forward_transform = None
57
+ self.pad_amount = int(self.filter_length / 2)
58
+ fourier_basis = np.fft.fft(np.eye(self.filter_length))
59
+
60
+ cutoff = int((self.filter_length / 2 + 1))
61
+ fourier_basis = np.vstack(
62
+ [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])]
63
+ )
64
+ forward_basis = torch.FloatTensor(fourier_basis)
65
+ inverse_basis = torch.FloatTensor(np.linalg.pinv(fourier_basis))
66
+
67
+ assert filter_length >= self.win_length
68
+ # get window and zero center pad it to filter_length
69
+ fft_window = get_window(window, self.win_length, fftbins=True)
70
+ fft_window = pad_center(fft_window, size=filter_length)
71
+ fft_window = torch.from_numpy(fft_window).float()
72
+
73
+ # window the bases
74
+ forward_basis *= fft_window
75
+ inverse_basis = (inverse_basis.T * fft_window).T
76
+
77
+ self.register_buffer("forward_basis", forward_basis.float())
78
+ self.register_buffer("inverse_basis", inverse_basis.float())
79
+ self.register_buffer("fft_window", fft_window.float())
80
+
81
+ def transform(self, input_data, return_phase=False):
82
+ """Take input data (audio) to STFT domain.
83
+
84
+ Arguments:
85
+ input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
86
+
87
+ Returns:
88
+ magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
89
+ num_frequencies, num_frames)
90
+ phase {tensor} -- Phase of STFT with shape (num_batch,
91
+ num_frequencies, num_frames)
92
+ """
93
+ input_data = F.pad(
94
+ input_data,
95
+ (self.pad_amount, self.pad_amount),
96
+ mode="reflect",
97
+ )
98
+ forward_transform = input_data.unfold(
99
+ 1, self.filter_length, self.hop_length
100
+ ).permute(0, 2, 1)
101
+ forward_transform = torch.matmul(self.forward_basis, forward_transform)
102
+ cutoff = int((self.filter_length / 2) + 1)
103
+ real_part = forward_transform[:, :cutoff, :]
104
+ imag_part = forward_transform[:, cutoff:, :]
105
+ magnitude = torch.sqrt(real_part**2 + imag_part**2)
106
+ if return_phase:
107
+ phase = torch.atan2(imag_part.data, real_part.data)
108
+ return magnitude, phase
109
+ else:
110
+ return magnitude
111
+
112
+ def inverse(self, magnitude, phase):
113
+ """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced
114
+ by the ```transform``` function.
115
+
116
+ Arguments:
117
+ magnitude {tensor} -- Magnitude of STFT with shape (num_batch,
118
+ num_frequencies, num_frames)
119
+ phase {tensor} -- Phase of STFT with shape (num_batch,
120
+ num_frequencies, num_frames)
121
+
122
+ Returns:
123
+ inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. Of
124
+ shape (num_batch, num_samples)
125
+ """
126
+ cat = torch.cat(
127
+ [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1
128
+ )
129
+ fold = torch.nn.Fold(
130
+ output_size=(1, (cat.size(-1) - 1) * self.hop_length + self.filter_length),
131
+ kernel_size=(1, self.filter_length),
132
+ stride=(1, self.hop_length),
133
+ )
134
+ inverse_transform = torch.matmul(self.inverse_basis, cat)
135
+ inverse_transform = fold(inverse_transform)[
136
+ :, 0, 0, self.pad_amount : -self.pad_amount
137
+ ]
138
+ window_square_sum = (
139
+ self.fft_window.pow(2).repeat(cat.size(-1), 1).T.unsqueeze(0)
140
+ )
141
+ window_square_sum = fold(window_square_sum)[
142
+ :, 0, 0, self.pad_amount : -self.pad_amount
143
+ ]
144
+ inverse_transform /= window_square_sum
145
+ return inverse_transform
146
+
147
+ def forward(self, input_data):
148
+ """Take input data (audio) to STFT domain and then back to audio.
149
+
150
+ Arguments:
151
+ input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples)
152
+
153
+ Returns:
154
+ reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of
155
+ shape (num_batch, num_samples)
156
+ """
157
+ self.magnitude, self.phase = self.transform(input_data, return_phase=True)
158
+ reconstruction = self.inverse(self.magnitude, self.phase)
159
+ return reconstruction
160
+
161
+
162
+ class BiGRU(nn.Module):
163
+ def __init__(self, input_features, hidden_features, num_layers):
164
+ super(BiGRU, self).__init__()
165
+ self.gru = nn.GRU(
166
+ input_features,
167
+ hidden_features,
168
+ num_layers=num_layers,
169
+ batch_first=True,
170
+ bidirectional=True,
171
+ )
172
+
173
+ def forward(self, x):
174
+ return self.gru(x)[0]
175
+
176
+
177
+ class ConvBlockRes(nn.Module):
178
+ def __init__(self, in_channels, out_channels, momentum=0.01):
179
+ super(ConvBlockRes, self).__init__()
180
+ self.conv = nn.Sequential(
181
+ nn.Conv2d(
182
+ in_channels=in_channels,
183
+ out_channels=out_channels,
184
+ kernel_size=(3, 3),
185
+ stride=(1, 1),
186
+ padding=(1, 1),
187
+ bias=False,
188
+ ),
189
+ nn.BatchNorm2d(out_channels, momentum=momentum),
190
+ nn.ReLU(),
191
+ nn.Conv2d(
192
+ in_channels=out_channels,
193
+ out_channels=out_channels,
194
+ kernel_size=(3, 3),
195
+ stride=(1, 1),
196
+ padding=(1, 1),
197
+ bias=False,
198
+ ),
199
+ nn.BatchNorm2d(out_channels, momentum=momentum),
200
+ nn.ReLU(),
201
+ )
202
+ # self.shortcut:Optional[nn.Module] = None
203
+ if in_channels != out_channels:
204
+ self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
205
+
206
+ def forward(self, x: torch.Tensor):
207
+ if not hasattr(self, "shortcut"):
208
+ return self.conv(x) + x
209
+ else:
210
+ return self.conv(x) + self.shortcut(x)
211
+
212
+
213
+ class Encoder(nn.Module):
214
+ def __init__(
215
+ self,
216
+ in_channels,
217
+ in_size,
218
+ n_encoders,
219
+ kernel_size,
220
+ n_blocks,
221
+ out_channels=16,
222
+ momentum=0.01,
223
+ ):
224
+ super(Encoder, self).__init__()
225
+ self.n_encoders = n_encoders
226
+ self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
227
+ self.layers = nn.ModuleList()
228
+ self.latent_channels = []
229
+ for i in range(self.n_encoders):
230
+ self.layers.append(
231
+ ResEncoderBlock(
232
+ in_channels, out_channels, kernel_size, n_blocks, momentum=momentum
233
+ )
234
+ )
235
+ self.latent_channels.append([out_channels, in_size])
236
+ in_channels = out_channels
237
+ out_channels *= 2
238
+ in_size //= 2
239
+ self.out_size = in_size
240
+ self.out_channel = out_channels
241
+
242
+ def forward(self, x: torch.Tensor):
243
+ concat_tensors: List[torch.Tensor] = []
244
+ x = self.bn(x)
245
+ for i, layer in enumerate(self.layers):
246
+ t, x = layer(x)
247
+ concat_tensors.append(t)
248
+ return x, concat_tensors
249
+
250
+
251
+ class ResEncoderBlock(nn.Module):
252
+ def __init__(
253
+ self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01
254
+ ):
255
+ super(ResEncoderBlock, self).__init__()
256
+ self.n_blocks = n_blocks
257
+ self.conv = nn.ModuleList()
258
+ self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
259
+ for i in range(n_blocks - 1):
260
+ self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
261
+ self.kernel_size = kernel_size
262
+ if self.kernel_size is not None:
263
+ self.pool = nn.AvgPool2d(kernel_size=kernel_size)
264
+
265
+ def forward(self, x):
266
+ for i, conv in enumerate(self.conv):
267
+ x = conv(x)
268
+ if self.kernel_size is not None:
269
+ return x, self.pool(x)
270
+ else:
271
+ return x
272
+
273
+
274
+ class Intermediate(nn.Module): #
275
+ def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
276
+ super(Intermediate, self).__init__()
277
+ self.n_inters = n_inters
278
+ self.layers = nn.ModuleList()
279
+ self.layers.append(
280
+ ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum)
281
+ )
282
+ for i in range(self.n_inters - 1):
283
+ self.layers.append(
284
+ ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum)
285
+ )
286
+
287
+ def forward(self, x):
288
+ for i, layer in enumerate(self.layers):
289
+ x = layer(x)
290
+ return x
291
+
292
+
293
+ class ResDecoderBlock(nn.Module):
294
+ def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
295
+ super(ResDecoderBlock, self).__init__()
296
+ out_padding = (0, 1) if stride == (1, 2) else (1, 1)
297
+ self.n_blocks = n_blocks
298
+ self.conv1 = nn.Sequential(
299
+ nn.ConvTranspose2d(
300
+ in_channels=in_channels,
301
+ out_channels=out_channels,
302
+ kernel_size=(3, 3),
303
+ stride=stride,
304
+ padding=(1, 1),
305
+ output_padding=out_padding,
306
+ bias=False,
307
+ ),
308
+ nn.BatchNorm2d(out_channels, momentum=momentum),
309
+ nn.ReLU(),
310
+ )
311
+ self.conv2 = nn.ModuleList()
312
+ self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
313
+ for i in range(n_blocks - 1):
314
+ self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))
315
+
316
+ def forward(self, x, concat_tensor):
317
+ x = self.conv1(x)
318
+ x = torch.cat((x, concat_tensor), dim=1)
319
+ for i, conv2 in enumerate(self.conv2):
320
+ x = conv2(x)
321
+ return x
322
+
323
+
324
+ class Decoder(nn.Module):
325
+ def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
326
+ super(Decoder, self).__init__()
327
+ self.layers = nn.ModuleList()
328
+ self.n_decoders = n_decoders
329
+ for i in range(self.n_decoders):
330
+ out_channels = in_channels // 2
331
+ self.layers.append(
332
+ ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum)
333
+ )
334
+ in_channels = out_channels
335
+
336
+ def forward(self, x: torch.Tensor, concat_tensors: List[torch.Tensor]):
337
+ for i, layer in enumerate(self.layers):
338
+ x = layer(x, concat_tensors[-1 - i])
339
+ return x
340
+
341
+
342
+ class DeepUnet(nn.Module):
343
+ def __init__(
344
+ self,
345
+ kernel_size,
346
+ n_blocks,
347
+ en_de_layers=5,
348
+ inter_layers=4,
349
+ in_channels=1,
350
+ en_out_channels=16,
351
+ ):
352
+ super(DeepUnet, self).__init__()
353
+ self.encoder = Encoder(
354
+ in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels
355
+ )
356
+ self.intermediate = Intermediate(
357
+ self.encoder.out_channel // 2,
358
+ self.encoder.out_channel,
359
+ inter_layers,
360
+ n_blocks,
361
+ )
362
+ self.decoder = Decoder(
363
+ self.encoder.out_channel, en_de_layers, kernel_size, n_blocks
364
+ )
365
+
366
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
367
+ x, concat_tensors = self.encoder(x)
368
+ x = self.intermediate(x)
369
+ x = self.decoder(x, concat_tensors)
370
+ return x
371
+
372
+
373
+ class E2E(nn.Module):
374
+ def __init__(
375
+ self,
376
+ n_blocks,
377
+ n_gru,
378
+ kernel_size,
379
+ en_de_layers=5,
380
+ inter_layers=4,
381
+ in_channels=1,
382
+ en_out_channels=16,
383
+ ):
384
+ super(E2E, self).__init__()
385
+ self.unet = DeepUnet(
386
+ kernel_size,
387
+ n_blocks,
388
+ en_de_layers,
389
+ inter_layers,
390
+ in_channels,
391
+ en_out_channels,
392
+ )
393
+ self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1))
394
+ if n_gru:
395
+ self.fc = nn.Sequential(
396
+ BiGRU(3 * 128, 256, n_gru),
397
+ nn.Linear(512, 360),
398
+ nn.Dropout(0.25),
399
+ nn.Sigmoid(),
400
+ )
401
+ else:
402
+ self.fc = nn.Sequential(
403
+ nn.Linear(3 * 128, 360), nn.Dropout(0.25), nn.Sigmoid()  # torch.nn has no N_MELS/N_CLASS; 128 mel bins and 360 pitch classes, matching the n_gru branch
404
+ )
405
+
406
+ def forward(self, mel):
407
+ # print(mel.shape)
408
+ mel = mel.transpose(-1, -2).unsqueeze(1)
409
+ x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2)
410
+ x = self.fc(x)
411
+ # print(x.shape)
412
+ return x
413
+
414
+
415
+ class MelSpectrogram(torch.nn.Module):
416
+ def __init__(
417
+ self,
418
+ is_half,
419
+ n_mel_channels,
420
+ sampling_rate,
421
+ win_length,
422
+ hop_length,
423
+ n_fft=None,
424
+ mel_fmin=0,
425
+ mel_fmax=None,
426
+ clamp=1e-5,
427
+ ):
428
+ super().__init__()
429
+ n_fft = win_length if n_fft is None else n_fft
430
+ self.hann_window = {}
431
+ mel_basis = mel(
432
+ sr=sampling_rate,
433
+ n_fft=n_fft,
434
+ n_mels=n_mel_channels,
435
+ fmin=mel_fmin,
436
+ fmax=mel_fmax,
437
+ htk=True,
438
+ )
439
+ mel_basis = torch.from_numpy(mel_basis).float()
440
+ self.register_buffer("mel_basis", mel_basis)
441
+ self.n_fft = win_length if n_fft is None else n_fft
442
+ self.hop_length = hop_length
443
+ self.win_length = win_length
444
+ self.sampling_rate = sampling_rate
445
+ self.n_mel_channels = n_mel_channels
446
+ self.clamp = clamp
447
+ self.is_half = is_half
448
+
449
+ def forward(self, audio, keyshift=0, speed=1, center=True):
450
+ factor = 2 ** (keyshift / 12)
451
+ n_fft_new = int(np.round(self.n_fft * factor))
452
+ win_length_new = int(np.round(self.win_length * factor))
453
+ hop_length_new = int(np.round(self.hop_length * speed))
454
+ keyshift_key = str(keyshift) + "_" + str(audio.device)
455
+ if keyshift_key not in self.hann_window:
456
+ self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to(
457
+ audio.device
458
+ )
459
+ if "privateuseone" in str(audio.device):
460
+ if not hasattr(self, "stft"):
461
+ self.stft = STFT(
462
+ filter_length=n_fft_new,
463
+ hop_length=hop_length_new,
464
+ win_length=win_length_new,
465
+ window="hann",
466
+ ).to(audio.device)
467
+ magnitude = self.stft.transform(audio)
468
+ else:
469
+ fft = torch.stft(
470
+ audio,
471
+ n_fft=n_fft_new,
472
+ hop_length=hop_length_new,
473
+ win_length=win_length_new,
474
+ window=self.hann_window[keyshift_key],
475
+ center=center,
476
+ return_complex=True,
477
+ )
478
+ magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2))
479
+ if keyshift != 0:
480
+ size = self.n_fft // 2 + 1
481
+ resize = magnitude.size(1)
482
+ if resize < size:
483
+ magnitude = F.pad(magnitude, (0, 0, 0, size - resize))
484
+ magnitude = magnitude[:, :size, :] * self.win_length / win_length_new
485
+ mel_output = torch.matmul(self.mel_basis, magnitude)
486
+ if self.is_half == True:
487
+ mel_output = mel_output.half()
488
+ log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp))
489
+ return log_mel_spec
490
+
491
+
492
+ class RMVPE:
493
+ def __init__(self, model_path: str, is_half, device=None, use_jit=False):
494
+ self.resample_kernel = {}
495
+ self.resample_kernel = {}
496
+ self.is_half = is_half
497
+ if device is None:
498
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
499
+ self.device = device
500
+ self.mel_extractor = MelSpectrogram(
501
+ is_half, 128, 16000, 1024, 160, None, 30, 8000
502
+ ).to(device)
503
+ if "privateuseone" in str(device):
504
+ import onnxruntime as ort
505
+
506
+ ort_session = ort.InferenceSession(
507
+ "%s/rmvpe.onnx" % os.environ["rmvpe_root"],
508
+ providers=["DmlExecutionProvider"],
509
+ )
510
+ self.model = ort_session
511
+ else:
512
+ if str(self.device) == "cuda":
513
+ self.device = torch.device("cuda:0")
514
+
515
+ def get_jit_model():
516
+ jit_model_path = model_path[: -len(".pth")] if model_path.endswith(".pth") else model_path  # rstrip(".pth") strips trailing characters, not the suffix
517
+ jit_model_path += ".half.jit" if is_half else ".jit"
518
+ reload = False
519
+ if os.path.exists(jit_model_path):
520
+ ckpt = jit.load(jit_model_path)
521
+ model_device = ckpt["device"]
522
+ if model_device != str(self.device):
523
+ reload = True
524
+ else:
525
+ reload = True
526
+
527
+ if reload:
528
+ ckpt = jit.rmvpe_jit_export(
529
+ model_path=model_path,
530
+ mode="script",
531
+ inputs_path=None,
532
+ save_path=jit_model_path,
533
+ device=device,
534
+ is_half=is_half,
535
+ )
536
+ model = torch.jit.load(BytesIO(ckpt["model"]), map_location=device)
537
+ return model
538
+
539
+ def get_default_model():
540
+ model = E2E(4, 1, (2, 2))
541
+ ckpt = torch.load(model_path, map_location="cpu")
542
+ model.load_state_dict(ckpt)
543
+ model.eval()
544
+ if is_half:
545
+ model = model.half()
546
+ else:
547
+ model = model.float()
548
+ return model
549
+
550
+ if use_jit:
551
+ if is_half and "cpu" in str(self.device):
552
+ logger.warning(
553
+ "Use default rmvpe model. \
554
+ Jit is not supported on the CPU for half floating point"
555
+ )
556
+ self.model = get_default_model()
557
+ else:
558
+ self.model = get_jit_model()
559
+ else:
560
+ self.model = get_default_model()
561
+
562
+ self.model = self.model.to(device)
563
+ cents_mapping = 20 * np.arange(360) + 1997.3794084376191
564
+ self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368
565
+
566
+ def mel2hidden(self, mel):
567
+ with torch.no_grad():
568
+ n_frames = mel.shape[-1]
569
+ n_pad = 32 * ((n_frames - 1) // 32 + 1) - n_frames
570
+ if n_pad > 0:
571
+ mel = F.pad(mel, (0, n_pad), mode="constant")
572
+ if "privateuseone" in str(self.device):
573
+ onnx_input_name = self.model.get_inputs()[0].name
574
+ onnx_outputs_names = self.model.get_outputs()[0].name
575
+ hidden = self.model.run(
576
+ [onnx_outputs_names],
577
+ input_feed={onnx_input_name: mel.cpu().numpy()},
578
+ )[0]
579
+ else:
580
+ mel = mel.half() if self.is_half else mel.float()
581
+ hidden = self.model(mel)
582
+ return hidden[:, :n_frames]
583
+
584
+ def decode(self, hidden, thred=0.03):
585
+ cents_pred = self.to_local_average_cents(hidden, thred=thred)
586
+ f0 = 10 * (2 ** (cents_pred / 1200))
587
+ f0[f0 == 10] = 0
588
+ # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred])
589
+ return f0
590
+
591
+ def infer_from_audio(self, audio, thred=0.03):
592
+ # torch.cuda.synchronize()
593
+ t0 = ttime()
594
+ mel = self.mel_extractor(
595
+ torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True
596
+ )
597
+ # print(123123123,mel.device.type)
598
+ # torch.cuda.synchronize()
599
+ t1 = ttime()
600
+ hidden = self.mel2hidden(mel)
601
+ # torch.cuda.synchronize()
602
+ t2 = ttime()
603
+ # print(234234,hidden.device.type)
604
+ if "privateuseone" not in str(self.device):
605
+ hidden = hidden.squeeze(0).cpu().numpy()
606
+ else:
607
+ hidden = hidden[0]
608
+ if self.is_half == True:
609
+ hidden = hidden.astype("float32")
610
+
611
+ f0 = self.decode(hidden, thred=thred)
612
+ # torch.cuda.synchronize()
613
+ t3 = ttime()
614
+ # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0))
615
+ return f0
616
+
617
+ def to_local_average_cents(self, salience, thred=0.05):
618
+ # t0 = ttime()
619
+ center = np.argmax(salience, axis=1) # (n_frames,) index of the peak bin
620
+ salience = np.pad(salience, ((0, 0), (4, 4))) # (n_frames, 368)
621
+ # t1 = ttime()
622
+ center += 4
623
+ todo_salience = []
624
+ todo_cents_mapping = []
625
+ starts = center - 4
626
+ ends = center + 5
627
+ for idx in range(salience.shape[0]):
628
+ todo_salience.append(salience[:, starts[idx] : ends[idx]][idx])
629
+ todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]])
630
+ # t2 = ttime()
631
+ todo_salience = np.array(todo_salience) # (n_frames, 9)
632
+ todo_cents_mapping = np.array(todo_cents_mapping) # (n_frames, 9)
633
+ product_sum = np.sum(todo_salience * todo_cents_mapping, 1)
634
+ weight_sum = np.sum(todo_salience, 1) # (n_frames,)
635
+ devided = product_sum / weight_sum # (n_frames,) weighted average in cents
636
+ # t3 = ttime()
637
+ maxx = np.max(salience, axis=1) # (n_frames,)
638
+ devided[maxx <= thred] = 0
639
+ # t4 = ttime()
640
+ # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3))
641
+ return devided
642
+
643
+
644
+ if __name__ == "__main__":
645
+ import librosa
646
+ import soundfile as sf
647
+
648
+ audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav")
649
+ if len(audio.shape) > 1:
650
+ audio = librosa.to_mono(audio.transpose(1, 0))
651
+ audio_bak = audio.copy()
652
+ if sampling_rate != 16000:
653
+ audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
654
+ model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt"
655
+ thred = 0.03 # 0.01
656
+ device = "cuda" if torch.cuda.is_available() else "cpu"
657
+ rmvpe = RMVPE(model_path, is_half=False, device=device)
658
+ t0 = ttime()
659
+ f0 = rmvpe.infer_from_audio(audio, thred=thred)
660
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
661
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
662
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
663
+ # f0 = rmvpe.infer_from_audio(audio, thred=thred)
664
+ t1 = ttime()
665
+ logger.info("%s %.2f", f0.shape, t1 - t0)
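A minimal sketch of running the RMVPE pitch extractor defined above on a file (illustrative only; it assumes librosa is installed, the checkpoint sits at assets/rmvpe/rmvpe.pt, and input.wav is a placeholder path). Audio is loaded at 16 kHz because the MelSpectrogram above is configured for that rate; with hop 160 the result is one F0 value per 10 ms frame, with 0 for unvoiced frames.

import librosa
import torch
from rvc.lib.rmvpe import RMVPE  # the module defined above

audio, _ = librosa.load("input.wav", sr=16000, mono=True)    # placeholder input file
device = "cuda" if torch.cuda.is_available() else "cpu"
rmvpe = RMVPE("assets/rmvpe/rmvpe.pt", is_half=False, device=device)
f0 = rmvpe.infer_from_audio(audio, thred=0.03)               # ndarray, one value per 10 ms frame
print(f0.shape, float(f0[f0 > 0].mean()) if (f0 > 0).any() else 0.0)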
rvc/lib/slicer2.py ADDED
@@ -0,0 +1,260 @@
1
+ import numpy as np
2
+
3
+
4
+ # This function is obtained from librosa.
5
+ def get_rms(
6
+ y,
7
+ frame_length=2048,
8
+ hop_length=512,
9
+ pad_mode="constant",
10
+ ):
11
+ padding = (int(frame_length // 2), int(frame_length // 2))
12
+ y = np.pad(y, padding, mode=pad_mode)
13
+
14
+ axis = -1
15
+ # put our new within-frame axis at the end for now
16
+ out_strides = y.strides + tuple([y.strides[axis]])
17
+ # Reduce the shape on the framing axis
18
+ x_shape_trimmed = list(y.shape)
19
+ x_shape_trimmed[axis] -= frame_length - 1
20
+ out_shape = tuple(x_shape_trimmed) + tuple([frame_length])
21
+ xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides)
22
+ if axis < 0:
23
+ target_axis = axis - 1
24
+ else:
25
+ target_axis = axis + 1
26
+ xw = np.moveaxis(xw, -1, target_axis)
27
+ # Downsample along the target axis
28
+ slices = [slice(None)] * xw.ndim
29
+ slices[axis] = slice(0, None, hop_length)
30
+ x = xw[tuple(slices)]
31
+
32
+ # Calculate power
33
+ power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True)
34
+
35
+ return np.sqrt(power)
36
+
37
+
38
+ class Slicer:
39
+ def __init__(
40
+ self,
41
+ sr: int,
42
+ threshold: float = -40.0,
43
+ min_length: int = 5000,
44
+ min_interval: int = 300,
45
+ hop_size: int = 20,
46
+ max_sil_kept: int = 5000,
47
+ ):
48
+ if not min_length >= min_interval >= hop_size:
49
+ raise ValueError(
50
+ "The following condition must be satisfied: min_length >= min_interval >= hop_size"
51
+ )
52
+ if not max_sil_kept >= hop_size:
53
+ raise ValueError(
54
+ "The following condition must be satisfied: max_sil_kept >= hop_size"
55
+ )
56
+ min_interval = sr * min_interval / 1000
57
+ self.threshold = 10 ** (threshold / 20.0)
58
+ self.hop_size = round(sr * hop_size / 1000)
59
+ self.win_size = min(round(min_interval), 4 * self.hop_size)
60
+ self.min_length = round(sr * min_length / 1000 / self.hop_size)
61
+ self.min_interval = round(min_interval / self.hop_size)
62
+ self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size)
63
+
64
+ def _apply_slice(self, waveform, begin, end):
65
+ if len(waveform.shape) > 1:
66
+ return waveform[
67
+ :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size)
68
+ ]
69
+ else:
70
+ return waveform[
71
+ begin * self.hop_size : min(waveform.shape[0], end * self.hop_size)
72
+ ]
73
+
74
+ # @timeit
75
+ def slice(self, waveform):
76
+ if len(waveform.shape) > 1:
77
+ samples = waveform.mean(axis=0)
78
+ else:
79
+ samples = waveform
80
+ if samples.shape[0] <= self.min_length:
81
+ return [waveform]
82
+ rms_list = get_rms(
83
+ y=samples, frame_length=self.win_size, hop_length=self.hop_size
84
+ ).squeeze(0)
85
+ sil_tags = []
86
+ silence_start = None
87
+ clip_start = 0
88
+ for i, rms in enumerate(rms_list):
89
+ # Keep looping while frame is silent.
90
+ if rms < self.threshold:
91
+ # Record start of silent frames.
92
+ if silence_start is None:
93
+ silence_start = i
94
+ continue
95
+ # Keep looping while frame is not silent and silence start has not been recorded.
96
+ if silence_start is None:
97
+ continue
98
+ # Clear recorded silence start if interval is not enough or clip is too short
99
+ is_leading_silence = silence_start == 0 and i > self.max_sil_kept
100
+ need_slice_middle = (
101
+ i - silence_start >= self.min_interval
102
+ and i - clip_start >= self.min_length
103
+ )
104
+ if not is_leading_silence and not need_slice_middle:
105
+ silence_start = None
106
+ continue
107
+ # Need slicing. Record the range of silent frames to be removed.
108
+ if i - silence_start <= self.max_sil_kept:
109
+ pos = rms_list[silence_start : i + 1].argmin() + silence_start
110
+ if silence_start == 0:
111
+ sil_tags.append((0, pos))
112
+ else:
113
+ sil_tags.append((pos, pos))
114
+ clip_start = pos
115
+ elif i - silence_start <= self.max_sil_kept * 2:
116
+ pos = rms_list[
117
+ i - self.max_sil_kept : silence_start + self.max_sil_kept + 1
118
+ ].argmin()
119
+ pos += i - self.max_sil_kept
120
+ pos_l = (
121
+ rms_list[
122
+ silence_start : silence_start + self.max_sil_kept + 1
123
+ ].argmin()
124
+ + silence_start
125
+ )
126
+ pos_r = (
127
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
128
+ + i
129
+ - self.max_sil_kept
130
+ )
131
+ if silence_start == 0:
132
+ sil_tags.append((0, pos_r))
133
+ clip_start = pos_r
134
+ else:
135
+ sil_tags.append((min(pos_l, pos), max(pos_r, pos)))
136
+ clip_start = max(pos_r, pos)
137
+ else:
138
+ pos_l = (
139
+ rms_list[
140
+ silence_start : silence_start + self.max_sil_kept + 1
141
+ ].argmin()
142
+ + silence_start
143
+ )
144
+ pos_r = (
145
+ rms_list[i - self.max_sil_kept : i + 1].argmin()
146
+ + i
147
+ - self.max_sil_kept
148
+ )
149
+ if silence_start == 0:
150
+ sil_tags.append((0, pos_r))
151
+ else:
152
+ sil_tags.append((pos_l, pos_r))
153
+ clip_start = pos_r
154
+ silence_start = None
155
+ # Deal with trailing silence.
156
+ total_frames = rms_list.shape[0]
157
+ if (
158
+ silence_start is not None
159
+ and total_frames - silence_start >= self.min_interval
160
+ ):
161
+ silence_end = min(total_frames, silence_start + self.max_sil_kept)
162
+ pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start
163
+ sil_tags.append((pos, total_frames + 1))
164
+ # Apply and return slices.
165
+ if len(sil_tags) == 0:
166
+ return [waveform]
167
+ else:
168
+ chunks = []
169
+ if sil_tags[0][0] > 0:
170
+ chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0]))
171
+ for i in range(len(sil_tags) - 1):
172
+ chunks.append(
173
+ self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0])
174
+ )
175
+ if sil_tags[-1][1] < total_frames:
176
+ chunks.append(
177
+ self._apply_slice(waveform, sil_tags[-1][1], total_frames)
178
+ )
179
+ return chunks
180
+
181
+
182
+ def main():
183
+ import os.path
184
+ from argparse import ArgumentParser
185
+
186
+ import librosa
187
+ import soundfile
188
+
189
+ parser = ArgumentParser()
190
+ parser.add_argument("audio", type=str, help="The audio to be sliced")
191
+ parser.add_argument(
192
+ "--out", type=str, help="Output directory of the sliced audio clips"
193
+ )
194
+ parser.add_argument(
195
+ "--db_thresh",
196
+ type=float,
197
+ required=False,
198
+ default=-40,
199
+ help="The dB threshold for silence detection",
200
+ )
201
+ parser.add_argument(
202
+ "--min_length",
203
+ type=int,
204
+ required=False,
205
+ default=5000,
206
+ help="The minimum milliseconds required for each sliced audio clip",
207
+ )
208
+ parser.add_argument(
209
+ "--min_interval",
210
+ type=int,
211
+ required=False,
212
+ default=300,
213
+ help="The minimum milliseconds for a silence part to be sliced",
214
+ )
215
+ parser.add_argument(
216
+ "--hop_size",
217
+ type=int,
218
+ required=False,
219
+ default=10,
220
+ help="Frame length in milliseconds",
221
+ )
222
+ parser.add_argument(
223
+ "--max_sil_kept",
224
+ type=int,
225
+ required=False,
226
+ default=500,
227
+ help="The maximum silence length kept around the sliced clip, presented in milliseconds",
228
+ )
229
+ args = parser.parse_args()
230
+ out = args.out
231
+ if out is None:
232
+ out = os.path.dirname(os.path.abspath(args.audio))
233
+ audio, sr = librosa.load(args.audio, sr=None, mono=False)
234
+ slicer = Slicer(
235
+ sr=sr,
236
+ threshold=args.db_thresh,
237
+ min_length=args.min_length,
238
+ min_interval=args.min_interval,
239
+ hop_size=args.hop_size,
240
+ max_sil_kept=args.max_sil_kept,
241
+ )
242
+ chunks = slicer.slice(audio)
243
+ if not os.path.exists(out):
244
+ os.makedirs(out)
245
+ for i, chunk in enumerate(chunks):
246
+ if len(chunk.shape) > 1:
247
+ chunk = chunk.T
248
+ soundfile.write(
249
+ os.path.join(
250
+ out,
251
+ "%s_%d.wav"
252
+ % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i),
253
+ ),
254
+ chunk,
255
+ sr,
256
+ )
257
+
258
+
259
+ if __name__ == "__main__":
260
+ main()
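Besides the CLI above, a short programmatic sketch of using the Slicer class (illustrative only; vocals.wav is a placeholder and the parameter values simply mirror the CLI defaults).

import librosa
import soundfile
from rvc.lib.slicer2 import Slicer  # the module defined above

audio, sr = librosa.load("vocals.wav", sr=None, mono=False)  # placeholder input
slicer = Slicer(sr=sr, threshold=-40.0, min_length=5000, min_interval=300, hop_size=10, max_sil_kept=500)
for i, chunk in enumerate(slicer.slice(audio)):
    if chunk.ndim > 1:
        chunk = chunk.T                                      # soundfile expects (frames, channels)
    soundfile.write(f"vocals_{i}.wav", chunk, sr)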
rvc/lib/train/architecture/v1.yml ADDED
@@ -0,0 +1,57 @@
1
+ 32k:
2
+ filter_length: 513,
3
+ a?: 32,
4
+ inter_channels: 192,
5
+ hidden_channels: 192,
6
+ filter_channels: 768,
7
+ n_heads: 2,
8
+ kernen_layersl_size: 6,
9
+ kernel_size: 3,
10
+ p_dropout: 0,
11
+ resblock: "1",
12
+ resblock_kernel_sizes: [3, 7, 11],
13
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
14
+ upsample_rates: [10, 4, 2, 2, 2],
15
+ upsample_initial_channel: 512,
16
+ upsample_kernel_sizes: [16, 16, 4, 4, 4],
17
+ spk_embed_dim: 109,
18
+ gin_channels: 256,
19
+ sampling_rate: 32000,
20
+ 40k:
21
+ filter_length: 1025,
22
+ a?: 32, # What?
23
+ inter_channels: 192,
24
+ hidden_channels: 192,
25
+ filter_channels: 768,
26
+ n_heads: 2,
27
+ kernen_layersl_size: 6,
28
+ kernel_size: 3,
29
+ p_dropout: 0,
30
+ resblock: "1",
31
+ resblock_kernel_sizes: [3, 7, 11],
32
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
33
+ upsample_rates: [10, 10, 2, 2],
34
+ upsample_initial_channel: 512,
35
+ upsample_kernel_sizes: [16, 16, 4, 4],
36
+ spk_embed_dim: 109,
37
+ gin_channels: 256,
38
+ sampling_rate: 40000,
39
+ 48k:
40
+ filter_length: 1025,
41
+ a?: 32,
42
+ inter_channels: 192,
43
+ hidden_channels: 192,
44
+ filter_channels: 768,
45
+ n_heads: 2,
46
+ kernen_layersl_size: 6,
47
+ kernel_size: 3,
48
+ p_dropout: 0,
49
+ resblock: "1",
50
+ resblock_kernel_sizes: [3, 7, 11],
51
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
52
+ upsample_rates: [10, 6, 2, 2, 2],
53
+ upsample_initial_channel: 512,
54
+ upsample_kernel_sizes: [16, 16, 4, 4, 4],
55
+ spk_embed_dim: 109,
56
+ gin_channels: 256,
57
+ sampling_rate: 48000,
rvc/lib/train/architecture/v2.yml ADDED
@@ -0,0 +1,38 @@
1
+ 32k:
2
+ filter_length: 513,
3
+ a?: 32,
4
+ inter_channels: 192,
5
+ hidden_channels: 192,
6
+ filter_channels: 768,
7
+ n_heads: 2,
8
+ kernen_layersl_size: 6,
9
+ kernel_size: 3,
10
+ p_dropout: 0,
11
+ resblock: "1",
12
+ resblock_kernel_sizes: [3, 7, 11],
13
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
14
+ upsample_rates: [10, 4, 2, 2, 2],
15
+ upsample_initial_channel: 512,
16
+ upsample_kernel_sizes: [16, 16, 4, 4, 4],
17
+ spk_embed_dim: 109,
18
+ gin_channels: 256,
19
+ sampling_rate: 32000,
20
+ 48k:
21
+ filter_length: 1025,
22
+ a?: 32,
23
+ inter_channels: 192,
24
+ hidden_channels: 192,
25
+ filter_channels: 768,
26
+ n_heads: 2,
27
+ kernen_layersl_size: 6,
28
+ kernel_size: 3,
29
+ p_dropout: 0,
30
+ resblock: "1",
31
+ resblock_kernel_sizes: [3, 7, 11],
32
+ resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
33
+ upsample_rates: [12, 10, 2, 2],
34
+ upsample_initial_channel: 512,
35
+ upsample_kernel_sizes: [24, 20, 4, 4],
36
+ spk_embed_dim: 109,
37
+ gin_channels: 256,
38
+ sampling_rate: 48000,
rvc/lib/train/data_utils.py ADDED
@@ -0,0 +1,517 @@
1
+ import logging
2
+ import os
3
+ import traceback
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.utils.data
10
+
11
+ from rvc.lib.train.mel_processing import spectrogram_torch
12
+ from rvc.lib.train.utils import load_filepaths_and_text, load_wav_to_torch
13
+
14
+
15
+ class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset):
16
+ """
17
+ 1) loads audio, text pairs
18
+ 2) normalizes text and converts them to sequences of integers
19
+ 3) computes spectrograms from audio files.
20
+ """
21
+
22
+ def __init__(self, audiopaths_and_text, hparams):
23
+ self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
24
+ self.max_wav_value = hparams.max_wav_value
25
+ self.sampling_rate = hparams.sampling_rate
26
+ self.filter_length = hparams.filter_length
27
+ self.hop_length = hparams.hop_length
28
+ self.win_length = hparams.win_length
29
+ self.sampling_rate = hparams.sampling_rate
30
+ self.min_text_len = getattr(hparams, "min_text_len", 1)
31
+ self.max_text_len = getattr(hparams, "max_text_len", 5000)
32
+ self._filter()
33
+
34
+ def _filter(self):
35
+ """
36
+ Filter text & store spec lengths
37
+ """
38
+ # Store spectrogram lengths for Bucketing
39
+ # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
40
+ # spec_length = wav_length // hop_length
41
+ audiopaths_and_text_new = []
42
+ lengths = []
43
+ for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text:
44
+ if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
45
+ audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv])
46
+ lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
47
+ self.audiopaths_and_text = audiopaths_and_text_new
48
+ self.lengths = lengths
49
+
50
+ def get_sid(self, sid):
51
+ sid = torch.LongTensor([int(sid)])
52
+ return sid
53
+
54
+ def get_audio_text_pair(self, audiopath_and_text):
55
+ # separate filename and text
56
+ file = audiopath_and_text[0]
57
+ phone = audiopath_and_text[1]
58
+ pitch = audiopath_and_text[2]
59
+ pitchf = audiopath_and_text[3]
60
+ dv = audiopath_and_text[4]
61
+
62
+ phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf)
63
+ spec, wav = self.get_audio(file)
64
+ dv = self.get_sid(dv)
65
+
66
+ len_phone = phone.size()[0]
67
+ len_spec = spec.size()[-1]
68
+ # print(123,phone.shape,pitch.shape,spec.shape)
69
+ if len_phone != len_spec:
70
+ len_min = min(len_phone, len_spec)
71
+ # amor
72
+ len_wav = len_min * self.hop_length
73
+
74
+ spec = spec[:, :len_min]
75
+ wav = wav[:, :len_wav]
76
+
77
+ phone = phone[:len_min, :]
78
+ pitch = pitch[:len_min]
79
+ pitchf = pitchf[:len_min]
80
+
81
+ return (spec, wav, phone, pitch, pitchf, dv)
82
+
83
+ def get_labels(self, phone, pitch, pitchf):
84
+ phone = np.load(phone)
85
+ phone = np.repeat(phone, 2, axis=0)
86
+ pitch = np.load(pitch)
87
+ pitchf = np.load(pitchf)
88
+ n_num = min(phone.shape[0], 900) # DistributedBucketSampler
89
+ # print(234,phone.shape,pitch.shape)
90
+ phone = phone[:n_num, :]
91
+ pitch = pitch[:n_num]
92
+ pitchf = pitchf[:n_num]
93
+ phone = torch.FloatTensor(phone)
94
+ pitch = torch.LongTensor(pitch)
95
+ pitchf = torch.FloatTensor(pitchf)
96
+ return phone, pitch, pitchf
97
+
98
+ def get_audio(self, filename):
99
+ audio, sampling_rate = load_wav_to_torch(filename)
100
+ if sampling_rate != self.sampling_rate:
101
+ raise ValueError(
102
+ "{} SR doesn't match target {} SR".format(
103
+ sampling_rate, self.sampling_rate
104
+ )
105
+ )
106
+ audio_norm = audio
107
+ # audio_norm = audio / self.max_wav_value
108
+ # audio_norm = audio / np.abs(audio).max()
109
+
110
+ audio_norm = audio_norm.unsqueeze(0)
111
+ spec_filename = filename.replace(".wav", ".spec.pt")
112
+ if os.path.exists(spec_filename):
113
+ try:
114
+ spec = torch.load(spec_filename)
115
+ except:
116
+ logger.warning("%s %s", spec_filename, traceback.format_exc())
117
+ spec = spectrogram_torch(
118
+ audio_norm,
119
+ self.filter_length,
120
+ self.sampling_rate,
121
+ self.hop_length,
122
+ self.win_length,
123
+ center=False,
124
+ )
125
+ spec = torch.squeeze(spec, 0)
126
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
127
+ else:
128
+ spec = spectrogram_torch(
129
+ audio_norm,
130
+ self.filter_length,
131
+ self.sampling_rate,
132
+ self.hop_length,
133
+ self.win_length,
134
+ center=False,
135
+ )
136
+ spec = torch.squeeze(spec, 0)
137
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
138
+ return spec, audio_norm
139
+
140
+ def __getitem__(self, index):
141
+ return self.get_audio_text_pair(self.audiopaths_and_text[index])
142
+
143
+ def __len__(self):
144
+ return len(self.audiopaths_and_text)
145
+
146
+
147
+ class TextAudioCollateMultiNSFsid:
148
+ """Zero-pads model inputs and targets"""
149
+
150
+ def __init__(self, return_ids=False):
151
+ self.return_ids = return_ids
152
+
153
+ def __call__(self, batch):
154
+ """Collates a training batch from normalized text and audio
155
+ PARAMS
156
+ ------
157
+ batch: [text_normalized, spec_normalized, wav_normalized]
158
+ """
159
+ # Right zero-pad all one-hot text sequences to max input length
160
+ _, ids_sorted_decreasing = torch.sort(
161
+ torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
162
+ )
163
+
164
+ max_spec_len = max([x[0].size(1) for x in batch])
165
+ max_wave_len = max([x[1].size(1) for x in batch])
166
+ spec_lengths = torch.LongTensor(len(batch))
167
+ wave_lengths = torch.LongTensor(len(batch))
168
+ spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
169
+ wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
170
+ spec_padded.zero_()
171
+ wave_padded.zero_()
172
+
173
+ max_phone_len = max([x[2].size(0) for x in batch])
174
+ phone_lengths = torch.LongTensor(len(batch))
175
+ phone_padded = torch.FloatTensor(
176
+ len(batch), max_phone_len, batch[0][2].shape[1]
177
+ ) # (spec, wav, phone, pitch)
178
+ pitch_padded = torch.LongTensor(len(batch), max_phone_len)
179
+ pitchf_padded = torch.FloatTensor(len(batch), max_phone_len)
180
+ phone_padded.zero_()
181
+ pitch_padded.zero_()
182
+ pitchf_padded.zero_()
183
+ # dv = torch.FloatTensor(len(batch), 256)#gin=256
184
+ sid = torch.LongTensor(len(batch))
185
+
186
+ for i in range(len(ids_sorted_decreasing)):
187
+ row = batch[ids_sorted_decreasing[i]]
188
+
189
+ spec = row[0]
190
+ spec_padded[i, :, : spec.size(1)] = spec
191
+ spec_lengths[i] = spec.size(1)
192
+
193
+ wave = row[1]
194
+ wave_padded[i, :, : wave.size(1)] = wave
195
+ wave_lengths[i] = wave.size(1)
196
+
197
+ phone = row[2]
198
+ phone_padded[i, : phone.size(0), :] = phone
199
+ phone_lengths[i] = phone.size(0)
200
+
201
+ pitch = row[3]
202
+ pitch_padded[i, : pitch.size(0)] = pitch
203
+ pitchf = row[4]
204
+ pitchf_padded[i, : pitchf.size(0)] = pitchf
205
+
206
+ # dv[i] = row[5]
207
+ sid[i] = row[5]
208
+
209
+ return (
210
+ phone_padded,
211
+ phone_lengths,
212
+ pitch_padded,
213
+ pitchf_padded,
214
+ spec_padded,
215
+ spec_lengths,
216
+ wave_padded,
217
+ wave_lengths,
218
+ # dv
219
+ sid,
220
+ )
221
+
222
+
223
+ class TextAudioLoader(torch.utils.data.Dataset):
224
+ """
225
+ 1) loads audio, text pairs
226
+ 2) normalizes text and converts them to sequences of integers
227
+ 3) computes spectrograms from audio files.
228
+ """
229
+
230
+ def __init__(self, audiopaths_and_text, hparams):
231
+ self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text)
232
+ self.max_wav_value = hparams.max_wav_value
233
+ self.sampling_rate = hparams.sampling_rate
234
+ self.filter_length = hparams.filter_length
235
+ self.hop_length = hparams.hop_length
236
+ self.win_length = hparams.win_length
237
+ self.sampling_rate = hparams.sampling_rate
238
+ self.min_text_len = getattr(hparams, "min_text_len", 1)
239
+ self.max_text_len = getattr(hparams, "max_text_len", 5000)
240
+ self._filter()
241
+
242
+ def _filter(self):
243
+ """
244
+ Filter text & store spec lengths
245
+ """
246
+ # Store spectrogram lengths for Bucketing
247
+ # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2)
248
+ # spec_length = wav_length // hop_length
249
+ audiopaths_and_text_new = []
250
+ lengths = []
251
+ for audiopath, text, dv in self.audiopaths_and_text:
252
+ if self.min_text_len <= len(text) and len(text) <= self.max_text_len:
253
+ audiopaths_and_text_new.append([audiopath, text, dv])
254
+ lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length))
255
+ self.audiopaths_and_text = audiopaths_and_text_new
256
+ self.lengths = lengths
257
+
258
+ def get_sid(self, sid):
259
+ sid = torch.LongTensor([int(sid)])
260
+ return sid
261
+
262
+ def get_audio_text_pair(self, audiopath_and_text):
263
+ # separate filename and text
264
+ file = audiopath_and_text[0]
265
+ phone = audiopath_and_text[1]
266
+ dv = audiopath_and_text[2]
267
+
268
+ phone = self.get_labels(phone)
269
+ spec, wav = self.get_audio(file)
270
+ dv = self.get_sid(dv)
271
+
272
+ len_phone = phone.size()[0]
273
+ len_spec = spec.size()[-1]
274
+ if len_phone != len_spec:
275
+ len_min = min(len_phone, len_spec)
276
+ len_wav = len_min * self.hop_length
277
+ spec = spec[:, :len_min]
278
+ wav = wav[:, :len_wav]
279
+ phone = phone[:len_min, :]
280
+ return (spec, wav, phone, dv)
281
+
282
+ def get_labels(self, phone):
283
+ phone = np.load(phone)
284
+ phone = np.repeat(phone, 2, axis=0)
285
+ n_num = min(phone.shape[0], 900) # DistributedBucketSampler
286
+ phone = phone[:n_num, :]
287
+ phone = torch.FloatTensor(phone)
288
+ return phone
289
+
290
+ def get_audio(self, filename):
291
+ audio, sampling_rate = load_wav_to_torch(filename)
292
+ if sampling_rate != self.sampling_rate:
293
+ raise ValueError(
294
+ "{} SR doesn't match target {} SR".format(
295
+ sampling_rate, self.sampling_rate
296
+ )
297
+ )
298
+ audio_norm = audio
299
+ # audio_norm = audio / self.max_wav_value
300
+ # audio_norm = audio / np.abs(audio).max()
301
+
302
+ audio_norm = audio_norm.unsqueeze(0)
303
+ spec_filename = filename.replace(".wav", ".spec.pt")
304
+ if os.path.exists(spec_filename):
305
+ try:
306
+ spec = torch.load(spec_filename)
307
+ except:
308
+ logger.warning("%s %s", spec_filename, traceback.format_exc())
309
+ spec = spectrogram_torch(
310
+ audio_norm,
311
+ self.filter_length,
312
+ self.sampling_rate,
313
+ self.hop_length,
314
+ self.win_length,
315
+ center=False,
316
+ )
317
+ spec = torch.squeeze(spec, 0)
318
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
319
+ else:
320
+ spec = spectrogram_torch(
321
+ audio_norm,
322
+ self.filter_length,
323
+ self.sampling_rate,
324
+ self.hop_length,
325
+ self.win_length,
326
+ center=False,
327
+ )
328
+ spec = torch.squeeze(spec, 0)
329
+ torch.save(spec, spec_filename, _use_new_zipfile_serialization=False)
330
+ return spec, audio_norm
331
+
332
+ def __getitem__(self, index):
333
+ return self.get_audio_text_pair(self.audiopaths_and_text[index])
334
+
335
+ def __len__(self):
336
+ return len(self.audiopaths_and_text)
337
+
338
+
339
+ class TextAudioCollate:
340
+ """Zero-pads model inputs and targets"""
341
+
342
+ def __init__(self, return_ids=False):
343
+ self.return_ids = return_ids
344
+
345
+ def __call__(self, batch):
346
+ """Collates a training batch from normalized text and audio
347
+ PARAMS
348
+ ------
349
+ batch: [text_normalized, spec_normalized, wav_normalized]
350
+ """
351
+ # Right zero-pad all one-hot text sequences to max input length
352
+ _, ids_sorted_decreasing = torch.sort(
353
+ torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True
354
+ )
355
+
356
+ max_spec_len = max([x[0].size(1) for x in batch])
357
+ max_wave_len = max([x[1].size(1) for x in batch])
358
+ spec_lengths = torch.LongTensor(len(batch))
359
+ wave_lengths = torch.LongTensor(len(batch))
360
+ spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len)
361
+ wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len)
362
+ spec_padded.zero_()
363
+ wave_padded.zero_()
364
+
365
+ max_phone_len = max([x[2].size(0) for x in batch])
366
+ phone_lengths = torch.LongTensor(len(batch))
367
+ phone_padded = torch.FloatTensor(
368
+ len(batch), max_phone_len, batch[0][2].shape[1]
369
+ )
370
+ phone_padded.zero_()
371
+ sid = torch.LongTensor(len(batch))
372
+
373
+ for i in range(len(ids_sorted_decreasing)):
374
+ row = batch[ids_sorted_decreasing[i]]
375
+
376
+ spec = row[0]
377
+ spec_padded[i, :, : spec.size(1)] = spec
378
+ spec_lengths[i] = spec.size(1)
379
+
380
+ wave = row[1]
381
+ wave_padded[i, :, : wave.size(1)] = wave
382
+ wave_lengths[i] = wave.size(1)
383
+
384
+ phone = row[2]
385
+ phone_padded[i, : phone.size(0), :] = phone
386
+ phone_lengths[i] = phone.size(0)
387
+
388
+ sid[i] = row[3]
389
+
390
+ return (
391
+ phone_padded,
392
+ phone_lengths,
393
+ spec_padded,
394
+ spec_lengths,
395
+ wave_padded,
396
+ wave_lengths,
397
+ sid,
398
+ )
399
+
400
+
401
+ class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler):
402
+ """
403
+ Maintain similar input lengths in a batch.
404
+ Length groups are specified by boundaries.
405
+ Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}.
406
+
407
+ It removes samples which are not included in the boundaries.
408
+ Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded.
409
+ """
410
+
411
+ def __init__(
412
+ self,
413
+ dataset,
414
+ batch_size,
415
+ boundaries,
416
+ num_replicas=None,
417
+ rank=None,
418
+ shuffle=True,
419
+ ):
420
+ super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle)
421
+ self.lengths = dataset.lengths
422
+ self.batch_size = batch_size
423
+ self.boundaries = boundaries
424
+
425
+ self.buckets, self.num_samples_per_bucket = self._create_buckets()
426
+ self.total_size = sum(self.num_samples_per_bucket)
427
+ self.num_samples = self.total_size // self.num_replicas
428
+
429
+ def _create_buckets(self):
430
+ buckets = [[] for _ in range(len(self.boundaries) - 1)]
431
+ for i in range(len(self.lengths)):
432
+ length = self.lengths[i]
433
+ idx_bucket = self._bisect(length)
434
+ if idx_bucket != -1:
435
+ buckets[idx_bucket].append(i)
436
+
437
+ for i in range(len(buckets) - 1, -1, -1): #
438
+ if len(buckets[i]) == 0:
439
+ buckets.pop(i)
440
+ self.boundaries.pop(i + 1)
441
+
442
+ num_samples_per_bucket = []
443
+ for i in range(len(buckets)):
444
+ len_bucket = len(buckets[i])
445
+ total_batch_size = self.num_replicas * self.batch_size
446
+ rem = (
447
+ total_batch_size - (len_bucket % total_batch_size)
448
+ ) % total_batch_size
449
+ num_samples_per_bucket.append(len_bucket + rem)
450
+ return buckets, num_samples_per_bucket
451
+
452
+ def __iter__(self):
453
+ # deterministically shuffle based on epoch
454
+ g = torch.Generator()
455
+ g.manual_seed(self.epoch)
456
+
457
+ indices = []
458
+ if self.shuffle:
459
+ for bucket in self.buckets:
460
+ indices.append(torch.randperm(len(bucket), generator=g).tolist())
461
+ else:
462
+ for bucket in self.buckets:
463
+ indices.append(list(range(len(bucket))))
464
+
465
+ batches = []
466
+ for i in range(len(self.buckets)):
467
+ bucket = self.buckets[i]
468
+ len_bucket = len(bucket)
469
+ ids_bucket = indices[i]
470
+ num_samples_bucket = self.num_samples_per_bucket[i]
471
+
472
+ # add extra samples to make it evenly divisible
473
+ rem = num_samples_bucket - len_bucket
474
+ ids_bucket = (
475
+ ids_bucket
476
+ + ids_bucket * (rem // len_bucket)
477
+ + ids_bucket[: (rem % len_bucket)]
478
+ )
479
+
480
+ # subsample
481
+ ids_bucket = ids_bucket[self.rank :: self.num_replicas]
482
+
483
+ # batching
484
+ for j in range(len(ids_bucket) // self.batch_size):
485
+ batch = [
486
+ bucket[idx]
487
+ for idx in ids_bucket[
488
+ j * self.batch_size : (j + 1) * self.batch_size
489
+ ]
490
+ ]
491
+ batches.append(batch)
492
+
493
+ if self.shuffle:
494
+ batch_ids = torch.randperm(len(batches), generator=g).tolist()
495
+ batches = [batches[i] for i in batch_ids]
496
+ self.batches = batches
497
+
498
+ assert len(self.batches) * self.batch_size == self.num_samples
499
+ return iter(self.batches)
500
+
501
+ def _bisect(self, x, lo=0, hi=None):
502
+ if hi is None:
503
+ hi = len(self.boundaries) - 1
504
+
505
+ if hi > lo:
506
+ mid = (hi + lo) // 2
507
+ if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]:
508
+ return mid
509
+ elif x <= self.boundaries[mid]:
510
+ return self._bisect(x, lo, mid)
511
+ else:
512
+ return self._bisect(x, mid + 1, hi)
513
+ else:
514
+ return -1
515
+
516
+ def __len__(self):
517
+ return self.num_samples // self.batch_size
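A hedged sketch of wiring the dataset, bucket sampler, and collate function above into a DataLoader (illustrative only). The hparams fields and the 40k values shown are assumptions based on the configs shipped under rvc/configs, filelist.txt is a placeholder whose lines are expected as wav|phone.npy|pitch.npy|pitchf.npy|speaker_id, and single-process values are passed for num_replicas/rank so no distributed setup is needed.

import torch
from rvc.lib.train.data_utils import (
    DistributedBucketSampler,
    TextAudioCollateMultiNSFsid,
    TextAudioLoaderMultiNSFsid,
)

class HParams:  # hypothetical stand-in for the training hparams object
    max_wav_value = 32768.0
    sampling_rate = 40000
    filter_length = 2048
    hop_length = 400
    win_length = 2048

dataset = TextAudioLoaderMultiNSFsid("filelist.txt", HParams())  # placeholder filelist
sampler = DistributedBucketSampler(
    dataset,
    batch_size=4,
    boundaries=[100, 200, 300, 400, 500, 600, 700, 800, 900],
    num_replicas=1,
    rank=0,
    shuffle=True,
)
loader = torch.utils.data.DataLoader(
    dataset, num_workers=0, collate_fn=TextAudioCollateMultiNSFsid(), batch_sampler=sampler
)
phone, phone_len, pitch, pitchf, spec, spec_len, wave, wave_len, sid = next(iter(loader))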
rvc/lib/train/losses.py ADDED
@@ -0,0 +1,58 @@
1
+ import torch
2
+
3
+
4
+ def feature_loss(fmap_r, fmap_g):
5
+ loss = 0
6
+ for dr, dg in zip(fmap_r, fmap_g):
7
+ for rl, gl in zip(dr, dg):
8
+ rl = rl.float().detach()
9
+ gl = gl.float()
10
+ loss += torch.mean(torch.abs(rl - gl))
11
+
12
+ return loss * 2
13
+
14
+
15
+ def discriminator_loss(disc_real_outputs, disc_generated_outputs):
16
+ loss = 0
17
+ r_losses = []
18
+ g_losses = []
19
+ for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
20
+ dr = dr.float()
21
+ dg = dg.float()
22
+ r_loss = torch.mean((1 - dr) ** 2)
23
+ g_loss = torch.mean(dg**2)
24
+ loss += r_loss + g_loss
25
+ r_losses.append(r_loss.item())
26
+ g_losses.append(g_loss.item())
27
+
28
+ return loss, r_losses, g_losses
29
+
30
+
31
+ def generator_loss(disc_outputs):
32
+ loss = 0
33
+ gen_losses = []
34
+ for dg in disc_outputs:
35
+ dg = dg.float()
36
+ l = torch.mean((1 - dg) ** 2)
37
+ gen_losses.append(l)
38
+ loss += l
39
+
40
+ return loss, gen_losses
41
+
42
+
43
+ def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
44
+ """
45
+ z_p, logs_q: [b, h, t_t]
46
+ m_p, logs_p: [b, h, t_t]
47
+ """
48
+ z_p = z_p.float()
49
+ logs_q = logs_q.float()
50
+ m_p = m_p.float()
51
+ logs_p = logs_p.float()
52
+ z_mask = z_mask.float()
53
+
54
+ kl = logs_p - logs_q - 0.5
55
+ kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p)
56
+ kl = torch.sum(kl * z_mask)
57
+ l = kl / torch.sum(z_mask)
58
+ return l
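A tiny self-contained example of the GAN losses above on dummy discriminator outputs (illustrative; shapes and values are arbitrary). discriminator_loss pushes real outputs toward 1 and generated outputs toward 0, generator_loss pushes generated outputs toward 1, and feature_loss is an L1 match on intermediate feature maps.

import torch
from rvc.lib.train.losses import discriminator_loss, feature_loss, generator_loss

# Dummy outputs from two sub-discriminators (arbitrary shapes for illustration).
d_real = [torch.full((2, 10), 0.9), torch.full((2, 7), 0.8)]
d_fake = [torch.full((2, 10), 0.1), torch.full((2, 7), 0.2)]
fmap_r = [[torch.randn(2, 4, 10)], [torch.randn(2, 4, 7)]]
fmap_g = [[torch.randn(2, 4, 10)], [torch.randn(2, 4, 7)]]

loss_d, r_losses, g_losses = discriminator_loss(d_real, d_fake)
loss_g, gen_losses = generator_loss(d_fake)
loss_fm = feature_loss(fmap_r, fmap_g)
print(float(loss_d), float(loss_g), float(loss_fm))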
rvc/lib/train/mel_processing.py ADDED
@@ -0,0 +1,133 @@
1
+ import logging
2
+
3
+ import torch
4
+ import torch.utils.data
5
+ from librosa.filters import mel as librosa_mel_fn
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ MAX_WAV_VALUE = 32768.0
10
+
11
+
12
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
13
+ """
14
+ PARAMS
15
+ ------
16
+ C: compression factor
17
+ """
18
+ return torch.log(torch.clamp(x, min=clip_val) * C)
19
+
20
+
21
+ def dynamic_range_decompression_torch(x, C=1):
22
+ """
23
+ PARAMS
24
+ ------
25
+ C: compression factor used to compress
26
+ """
27
+ return torch.exp(x) / C
28
+
29
+
30
+ def spectral_normalize_torch(magnitudes):
31
+ return dynamic_range_compression_torch(magnitudes)
32
+
33
+
34
+ def spectral_de_normalize_torch(magnitudes):
35
+ return dynamic_range_decompression_torch(magnitudes)
36
+
37
+
38
+ # Reusable banks
39
+ mel_basis = {}
40
+ hann_window = {}
41
+
42
+
43
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
44
+ """Convert waveform into Linear-frequency Linear-amplitude spectrogram.
45
+
46
+ Args:
47
+ y :: (B, T) - Audio waveforms
48
+ n_fft
49
+ sampling_rate
50
+ hop_size
51
+ win_size
52
+ center
53
+ Returns:
54
+ :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram
55
+ """
56
+ # Validation
57
+ if torch.min(y) < -1.07:
58
+ logger.debug("min value is %s", str(torch.min(y)))
59
+ if torch.max(y) > 1.07:
60
+ logger.debug("max value is %s", str(torch.max(y)))
61
+
62
+ # Window - Cache if needed
63
+ global hann_window
64
+ dtype_device = str(y.dtype) + "_" + str(y.device)
65
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
66
+ if wnsize_dtype_device not in hann_window:
67
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
68
+ dtype=y.dtype, device=y.device
69
+ )
70
+
71
+ # Padding
72
+ y = torch.nn.functional.pad(
73
+ y.unsqueeze(1),
74
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
75
+ mode="reflect",
76
+ )
77
+ y = y.squeeze(1)
78
+
79
+ # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2)
80
+ spec = torch.stft(
81
+ y,
82
+ n_fft,
83
+ hop_length=hop_size,
84
+ win_length=win_size,
85
+ window=hann_window[wnsize_dtype_device],
86
+ center=center,
87
+ pad_mode="reflect",
88
+ normalized=False,
89
+ onesided=True,
90
+ return_complex=False,
91
+ )
92
+
93
+ # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame)
94
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
95
+ return spec
96
+
97
+
98
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
99
+ # MelBasis - Cache if needed
100
+ global mel_basis
101
+ dtype_device = str(spec.dtype) + "_" + str(spec.device)
102
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
103
+ if fmax_dtype_device not in mel_basis:
104
+ mel = librosa_mel_fn(
105
+ sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax
106
+ )
107
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
108
+ dtype=spec.dtype, device=spec.device
109
+ )
110
+
111
+ # Mel-frequency Log-amplitude spectrogram :: (B, Freq=num_mels, Frame)
112
+ melspec = torch.matmul(mel_basis[fmax_dtype_device], spec)
113
+ melspec = spectral_normalize_torch(melspec)
114
+ return melspec
115
+
116
+
117
+ def mel_spectrogram_torch(
118
+ y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
119
+ ):
120
+ """Convert waveform into Mel-frequency Log-amplitude spectrogram.
121
+
122
+ Args:
123
+ y :: (B, T) - Waveforms
124
+ Returns:
125
+ melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram
126
+ """
127
+ # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame)
128
+ spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center)
129
+
130
+ # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame)
131
+ melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax)
132
+
133
+ return melspec
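A minimal usage sketch of the helpers above (editor's note; the parameter values are illustrative, not the repo's training configs):

import torch

wav = torch.randn(1, 16000).clamp(-1.0, 1.0)   # (B, T) waveform roughly in [-1, 1]
mel = mel_spectrogram_torch(
    wav,
    n_fft=1024,
    num_mels=80,
    sampling_rate=16000,
    hop_size=256,
    win_size=1024,
    fmin=0.0,
    fmax=8000.0,
)
print(mel.shape)   # (1, 80, n_frames) log-amplitude mel spectrogram

The Hann window and the librosa mel filter bank are cached per (dtype, device) in hann_window / mel_basis, so repeated calls with the same settings reuse them.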
rvc/lib/train/process_ckpt.py ADDED
@@ -0,0 +1,260 @@
1
+ import os
2
+ import sys
3
+ import traceback
4
+ from collections import OrderedDict
5
+
6
+ import torch
7
+ from i18n.i18n import I18nAuto
8
+
9
+ i18n = I18nAuto()
10
+
11
+
12
+ def savee(ckpt, sr, if_f0, name, epoch, version, hps):
13
+ try:
14
+ opt = OrderedDict()
15
+ opt["weight"] = {}
16
+ for key in ckpt.keys():
17
+ if "enc_q" in key:
18
+ continue
19
+ opt["weight"][key] = ckpt[key].half()
20
+ opt["config"] = [
21
+ hps.data.filter_length // 2 + 1,
22
+ 32,
23
+ hps.model.inter_channels,
24
+ hps.model.hidden_channels,
25
+ hps.model.filter_channels,
26
+ hps.model.n_heads,
27
+ hps.model.n_layers,
28
+ hps.model.kernel_size,
29
+ hps.model.p_dropout,
30
+ hps.model.resblock,
31
+ hps.model.resblock_kernel_sizes,
32
+ hps.model.resblock_dilation_sizes,
33
+ hps.model.upsample_rates,
34
+ hps.model.upsample_initial_channel,
35
+ hps.model.upsample_kernel_sizes,
36
+ hps.model.spk_embed_dim,
37
+ hps.model.gin_channels,
38
+ hps.data.sampling_rate,
39
+ ]
40
+ opt["info"] = "%sepoch" % epoch
41
+ opt["sr"] = sr
42
+ opt["f0"] = if_f0
43
+ opt["version"] = version
44
+ torch.save(opt, "assets/weights/%s.pth" % name)
45
+ return "Success."
46
+ except:
47
+ return traceback.format_exc()
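For orientation (editor's note): savee writes a small inference-only checkpoint with fp16 weights and the enc_q.* tensors stripped. A sketch of inspecting one; the file name is hypothetical:

import torch

ckpt = torch.load("assets/weights/my_voice.pth", map_location="cpu")
print(sorted(ckpt.keys()))               # ['config', 'f0', 'info', 'sr', 'version', 'weight']
print(ckpt["sr"], ckpt["f0"], ckpt["version"])
print(len(ckpt["weight"]))               # number of fp16 weight tensors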
48
+
49
+
50
+ def show_info(path):
51
+ try:
52
+ a = torch.load(path, map_location="cpu")
53
+ return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % (
54
+ a.get("info", "None"),
55
+ a.get("sr", "None"),
56
+ a.get("f0", "None"),
57
+ a.get("version", "None"),
58
+ )
59
+ except:
60
+ return traceback.format_exc()
61
+
62
+
63
+ def extract_small_model(path, name, sr, if_f0, info, version):
64
+ try:
65
+ ckpt = torch.load(path, map_location="cpu")
66
+ if "model" in ckpt:
67
+ ckpt = ckpt["model"]
68
+ opt = OrderedDict()
69
+ opt["weight"] = {}
70
+ for key in ckpt.keys():
71
+ if "enc_q" in key:
72
+ continue
73
+ opt["weight"][key] = ckpt[key].half()
74
+ if sr == "40k":
75
+ opt["config"] = [
76
+ 1025,
77
+ 32,
78
+ 192,
79
+ 192,
80
+ 768,
81
+ 2,
82
+ 6,
83
+ 3,
84
+ 0,
85
+ "1",
86
+ [3, 7, 11],
87
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
88
+ [10, 10, 2, 2],
89
+ 512,
90
+ [16, 16, 4, 4],
91
+ 109,
92
+ 256,
93
+ 40000,
94
+ ]
95
+ elif sr == "48k":
96
+ if version == "v1":
97
+ opt["config"] = [
98
+ 1025,
99
+ 32,
100
+ 192,
101
+ 192,
102
+ 768,
103
+ 2,
104
+ 6,
105
+ 3,
106
+ 0,
107
+ "1",
108
+ [3, 7, 11],
109
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
110
+ [10, 6, 2, 2, 2],
111
+ 512,
112
+ [16, 16, 4, 4, 4],
113
+ 109,
114
+ 256,
115
+ 48000,
116
+ ]
117
+ else:
118
+ opt["config"] = [
119
+ 1025,
120
+ 32,
121
+ 192,
122
+ 192,
123
+ 768,
124
+ 2,
125
+ 6,
126
+ 3,
127
+ 0,
128
+ "1",
129
+ [3, 7, 11],
130
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
131
+ [12, 10, 2, 2],
132
+ 512,
133
+ [24, 20, 4, 4],
134
+ 109,
135
+ 256,
136
+ 48000,
137
+ ]
138
+ elif sr == "32k":
139
+ if version == "v1":
140
+ opt["config"] = [
141
+ 513,
142
+ 32,
143
+ 192,
144
+ 192,
145
+ 768,
146
+ 2,
147
+ 6,
148
+ 3,
149
+ 0,
150
+ "1",
151
+ [3, 7, 11],
152
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
153
+ [10, 4, 2, 2, 2],
154
+ 512,
155
+ [16, 16, 4, 4, 4],
156
+ 109,
157
+ 256,
158
+ 32000,
159
+ ]
160
+ else:
161
+ opt["config"] = [
162
+ 513,
163
+ 32,
164
+ 192,
165
+ 192,
166
+ 768,
167
+ 2,
168
+ 6,
169
+ 3,
170
+ 0,
171
+ "1",
172
+ [3, 7, 11],
173
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
174
+ [10, 8, 2, 2],
175
+ 512,
176
+ [20, 16, 4, 4],
177
+ 109,
178
+ 256,
179
+ 32000,
180
+ ]
181
+ if info == "":
182
+ info = "Extracted model."
183
+ opt["info"] = info
184
+ opt["version"] = version
185
+ opt["sr"] = sr
186
+ opt["f0"] = int(if_f0)
187
+ torch.save(opt, "assets/weights/%s.pth" % name)
188
+ return "Success."
189
+ except:
190
+ return traceback.format_exc()
191
+
192
+
193
+ def change_info(path, info, name):
194
+ try:
195
+ ckpt = torch.load(path, map_location="cpu")
196
+ ckpt["info"] = info
197
+ if name == "":
198
+ name = os.path.basename(path)
199
+ torch.save(ckpt, "assets/weights/%s" % name)
200
+ return "Success."
201
+ except:
202
+ return traceback.format_exc()
203
+
204
+
205
+ def merge(path1, path2, alpha1, sr, f0, info, name, version):
206
+ try:
207
+
208
+ def extract(ckpt):
209
+ a = ckpt["model"]
210
+ opt = OrderedDict()
211
+ opt["weight"] = {}
212
+ for key in a.keys():
213
+ if "enc_q" in key:
214
+ continue
215
+ opt["weight"][key] = a[key]
216
+ return opt
217
+
218
+ ckpt1 = torch.load(path1, map_location="cpu")
219
+ ckpt2 = torch.load(path2, map_location="cpu")
220
+ cfg = ckpt1["config"]
221
+ if "model" in ckpt1:
222
+ ckpt1 = extract(ckpt1)
223
+ else:
224
+ ckpt1 = ckpt1["weight"]
225
+ if "model" in ckpt2:
226
+ ckpt2 = extract(ckpt2)
227
+ else:
228
+ ckpt2 = ckpt2["weight"]
229
+ if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())):
230
+ return "Fail to merge the models. The model architectures are not the same."
231
+ opt = OrderedDict()
232
+ opt["weight"] = {}
233
+ for key in ckpt1.keys():
234
+ # try:
235
+ if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape:
236
+ min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0])
237
+ opt["weight"][key] = (
238
+ alpha1 * (ckpt1[key][:min_shape0].float())
239
+ + (1 - alpha1) * (ckpt2[key][:min_shape0].float())
240
+ ).half()
241
+ else:
242
+ opt["weight"][key] = (
243
+ alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float())
244
+ ).half()
245
+ # except:
246
+ # pdb.set_trace()
247
+ opt["config"] = cfg
248
+ """
249
+ if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000]
250
+ elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000]
251
+ elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000]
252
+ """
253
+ opt["sr"] = sr
254
+ opt["f0"] = 1 if f0 == i18n("是") else 0
255
+ opt["version"] = version
256
+ opt["info"] = info
257
+ torch.save(opt, "assets/weights/%s.pth" % name)
258
+ return "Success."
259
+ except:
260
+ return traceback.format_exc()
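The core of merge above is a plain linear interpolation of the two weight sets, stored in half precision (emb_g.weight is truncated along its first dimension to the smaller of the two sizes when the shapes differ):

w_merged = \alpha \cdot w_1 + (1 - \alpha) \cdot w_2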
rvc/lib/train/utils.py ADDED
@@ -0,0 +1,478 @@
1
+ import argparse
2
+ import glob
3
+ import json
4
+ import logging
5
+ import os
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+
10
+ import numpy as np
11
+ import torch
12
+ from scipy.io.wavfile import read
13
+
14
+ MATPLOTLIB_FLAG = False
15
+
16
+ logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
17
+ logger = logging
18
+
19
+
20
+ def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1):
21
+ assert os.path.isfile(checkpoint_path)
22
+ checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
23
+
24
+ ##################
25
+ def go(model, bkey):
26
+ saved_state_dict = checkpoint_dict[bkey]
27
+ if hasattr(model, "module"):
28
+ state_dict = model.module.state_dict()
29
+ else:
30
+ state_dict = model.state_dict()
31
+ new_state_dict = {}
32
+ for k, v in state_dict.items(): # iterate over the shapes the model expects
33
+ try:
34
+ new_state_dict[k] = saved_state_dict[k]
35
+ if saved_state_dict[k].shape != state_dict[k].shape:
36
+ logger.warning(
37
+ "shape-%s-mismatch. need: %s, get: %s",
38
+ k,
39
+ state_dict[k].shape,
40
+ saved_state_dict[k].shape,
41
+ ) #
42
+ raise KeyError
43
+ except:
44
+ # logger.info(traceback.format_exc())
45
+ logger.info("%s is not in the checkpoint", k) # pretrain缺失的
46
+ new_state_dict[k] = v # keep the model's own randomly initialized value
47
+ if hasattr(model, "module"):
48
+ model.module.load_state_dict(new_state_dict, strict=False)
49
+ else:
50
+ model.load_state_dict(new_state_dict, strict=False)
51
+ return model
52
+
53
+ go(combd, "combd")
54
+ model = go(sbd, "sbd")
55
+ #############
56
+ logger.info("Loaded model weights")
57
+
58
+ iteration = checkpoint_dict["iteration"]
59
+ learning_rate = checkpoint_dict["learning_rate"]
60
+ if (
61
+ optimizer is not None and load_opt == 1
62
+ ): ### if the optimizer state cannot be loaded (e.g. it is empty), it is re-initialized; this can also affect the LR schedule update, so it is caught at the outermost level of the train script
63
+ # try:
64
+ optimizer.load_state_dict(checkpoint_dict["optimizer"])
65
+ # except:
66
+ # traceback.print_exc()
67
+ logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
68
+ return model, optimizer, learning_rate, iteration
69
+
70
+
71
+ # def load_checkpoint(checkpoint_path, model, optimizer=None):
72
+ # assert os.path.isfile(checkpoint_path)
73
+ # checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
74
+ # iteration = checkpoint_dict['iteration']
75
+ # learning_rate = checkpoint_dict['learning_rate']
76
+ # if optimizer is not None:
77
+ # optimizer.load_state_dict(checkpoint_dict['optimizer'])
78
+ # # print(1111)
79
+ # saved_state_dict = checkpoint_dict['model']
80
+ # # print(1111)
81
+ #
82
+ # if hasattr(model, 'module'):
83
+ # state_dict = model.module.state_dict()
84
+ # else:
85
+ # state_dict = model.state_dict()
86
+ # new_state_dict= {}
87
+ # for k, v in state_dict.items():
88
+ # try:
89
+ # new_state_dict[k] = saved_state_dict[k]
90
+ # except:
91
+ # logger.info("%s is not in the checkpoint" % k)
92
+ # new_state_dict[k] = v
93
+ # if hasattr(model, 'module'):
94
+ # model.module.load_state_dict(new_state_dict)
95
+ # else:
96
+ # model.load_state_dict(new_state_dict)
97
+ # logger.info("Loaded checkpoint '{}' (epoch {})" .format(
98
+ # checkpoint_path, iteration))
99
+ # return model, optimizer, learning_rate, iteration
100
+ def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1):
101
+ assert os.path.isfile(checkpoint_path)
102
+ checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
103
+
104
+ saved_state_dict = checkpoint_dict["model"]
105
+ if hasattr(model, "module"):
106
+ state_dict = model.module.state_dict()
107
+ else:
108
+ state_dict = model.state_dict()
109
+ new_state_dict = {}
110
+ for k, v in state_dict.items(): # iterate over the shapes the model expects
111
+ try:
112
+ new_state_dict[k] = saved_state_dict[k]
113
+ if saved_state_dict[k].shape != state_dict[k].shape:
114
+ logger.warning(
115
+ "shape-%s-mismatch|need-%s|get-%s",
116
+ k,
117
+ state_dict[k].shape,
118
+ saved_state_dict[k].shape,
119
+ ) #
120
+ raise KeyError
121
+ except:
122
+ # logger.info(traceback.format_exc())
123
+ logger.info("%s is not in the checkpoint", k) # pretrain缺失的
124
+ new_state_dict[k] = v # keep the model's own randomly initialized value
125
+ if hasattr(model, "module"):
126
+ model.module.load_state_dict(new_state_dict, strict=False)
127
+ else:
128
+ model.load_state_dict(new_state_dict, strict=False)
129
+ logger.info("Loaded model weights")
130
+
131
+ iteration = checkpoint_dict["iteration"]
132
+ learning_rate = checkpoint_dict["learning_rate"]
133
+ if (
134
+ optimizer is not None and load_opt == 1
135
+ ): ### if the optimizer state cannot be loaded (e.g. it is empty), it is re-initialized; this can also affect the LR schedule update, so it is caught at the outermost level of the train script
136
+ # try:
137
+ optimizer.load_state_dict(checkpoint_dict["optimizer"])
138
+ # except:
139
+ # traceback.print_exc()
140
+ logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration))
141
+ return model, optimizer, learning_rate, iteration
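A toy sketch (editor's note, not repo code) of the non-strict loading pattern that load_checkpoint / load_checkpoint_d implement: matching tensors are copied from the checkpoint, anything missing or shape-mismatched keeps the model's own initialization:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
saved = {"weight": torch.zeros(2, 4)}          # "bias" is missing, as with a partial pretrain

new_state = {}
for k, v in model.state_dict().items():
    if k in saved and saved[k].shape == v.shape:
        new_state[k] = saved[k]                # take the pretrained tensor
    else:
        new_state[k] = v                       # keep the randomly initialized value
model.load_state_dict(new_state, strict=False)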
142
+
143
+
144
+ def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path):
145
+ logger.info(
146
+ "Saving model and optimizer state at epoch {} to {}".format(
147
+ iteration, checkpoint_path
148
+ )
149
+ )
150
+ if hasattr(model, "module"):
151
+ state_dict = model.module.state_dict()
152
+ else:
153
+ state_dict = model.state_dict()
154
+ torch.save(
155
+ {
156
+ "model": state_dict,
157
+ "iteration": iteration,
158
+ "optimizer": optimizer.state_dict(),
159
+ "learning_rate": learning_rate,
160
+ },
161
+ checkpoint_path,
162
+ )
163
+
164
+
165
+ def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path):
166
+ logger.info(
167
+ "Saving model and optimizer state at epoch {} to {}".format(
168
+ iteration, checkpoint_path
169
+ )
170
+ )
171
+ if hasattr(combd, "module"):
172
+ state_dict_combd = combd.module.state_dict()
173
+ else:
174
+ state_dict_combd = combd.state_dict()
175
+ if hasattr(sbd, "module"):
176
+ state_dict_sbd = sbd.module.state_dict()
177
+ else:
178
+ state_dict_sbd = sbd.state_dict()
179
+ torch.save(
180
+ {
181
+ "combd": state_dict_combd,
182
+ "sbd": state_dict_sbd,
183
+ "iteration": iteration,
184
+ "optimizer": optimizer.state_dict(),
185
+ "learning_rate": learning_rate,
186
+ },
187
+ checkpoint_path,
188
+ )
189
+
190
+
191
+ def summarize(
192
+ writer,
193
+ global_step,
194
+ scalars={},
195
+ histograms={},
196
+ images={},
197
+ audios={},
198
+ audio_sampling_rate=22050,
199
+ ):
200
+ for k, v in scalars.items():
201
+ writer.add_scalar(k, v, global_step)
202
+ for k, v in histograms.items():
203
+ writer.add_histogram(k, v, global_step)
204
+ for k, v in images.items():
205
+ writer.add_image(k, v, global_step, dataformats="HWC")
206
+ for k, v in audios.items():
207
+ writer.add_audio(k, v, global_step, audio_sampling_rate)
208
+
209
+
210
+ def latest_checkpoint_path(dir_path, regex="G_*.pth"):
211
+ f_list = glob.glob(os.path.join(dir_path, regex))
212
+ f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
213
+ x = f_list[-1]
214
+ logger.debug(x)
215
+ return x
216
+
217
+
218
+ def plot_spectrogram_to_numpy(spectrogram):
219
+ global MATPLOTLIB_FLAG
220
+ if not MATPLOTLIB_FLAG:
221
+ import matplotlib
222
+
223
+ matplotlib.use("Agg")
224
+ MATPLOTLIB_FLAG = True
225
+ mpl_logger = logging.getLogger("matplotlib")
226
+ mpl_logger.setLevel(logging.WARNING)
227
+ import matplotlib.pylab as plt
228
+ import numpy as np
229
+
230
+ fig, ax = plt.subplots(figsize=(10, 2))
231
+ im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none")
232
+ plt.colorbar(im, ax=ax)
233
+ plt.xlabel("Frames")
234
+ plt.ylabel("Channels")
235
+ plt.tight_layout()
236
+
237
+ fig.canvas.draw()
238
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
239
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
240
+ plt.close()
241
+ return data
242
+
243
+
244
+ def plot_alignment_to_numpy(alignment, info=None):
245
+ global MATPLOTLIB_FLAG
246
+ if not MATPLOTLIB_FLAG:
247
+ import matplotlib
248
+
249
+ matplotlib.use("Agg")
250
+ MATPLOTLIB_FLAG = True
251
+ mpl_logger = logging.getLogger("matplotlib")
252
+ mpl_logger.setLevel(logging.WARNING)
253
+ import matplotlib.pylab as plt
254
+ import numpy as np
255
+
256
+ fig, ax = plt.subplots(figsize=(6, 4))
257
+ im = ax.imshow(
258
+ alignment.transpose(), aspect="auto", origin="lower", interpolation="none"
259
+ )
260
+ fig.colorbar(im, ax=ax)
261
+ xlabel = "Decoder timestep"
262
+ if info is not None:
263
+ xlabel += "\n\n" + info
264
+ plt.xlabel(xlabel)
265
+ plt.ylabel("Encoder timestep")
266
+ plt.tight_layout()
267
+
268
+ fig.canvas.draw()
269
+ data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
270
+ data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
271
+ plt.close()
272
+ return data
273
+
274
+
275
+ def load_wav_to_torch(full_path):
276
+ sampling_rate, data = read(full_path)
277
+ return torch.FloatTensor(data.astype(np.float32)), sampling_rate
278
+
279
+
280
+ def load_filepaths_and_text(filename, split="|"):
281
+ with open(filename, encoding="utf-8") as f:
282
+ filepaths_and_text = [line.strip().split(split) for line in f]
283
+ return filepaths_and_text
284
+
285
+
286
+ def get_hparams(init=True):
287
+ """
288
+ todo:
289
+ The seven options handled at the end:
290
+ save frequency, total epoch done
291
+ bs done
292
+ pretrainG, pretrainD done
293
+ GPU ids: os.en["CUDA_VISIBLE_DEVICES"] done
294
+ if_latest done
295
+ model: if_f0 done
296
+ sample rate: the matching config is chosen automatically done
297
+ whether to cache the dataset in GPU memory: if_cache_data_in_gpu done
298
+
299
+ -m:
300
+ decide the training_files path automatically, replacing hps.data.training_files in train_nsf_load_pretrain.py done
301
+ -c is no longer needed
302
+ """
303
+ parser = argparse.ArgumentParser()
304
+ parser.add_argument(
305
+ "-se",
306
+ "--save_every_epoch",
307
+ type=int,
308
+ required=True,
309
+ help="checkpoint save frequency (epoch)",
310
+ )
311
+ parser.add_argument(
312
+ "-te", "--total_epoch", type=int, required=True, help="total_epoch"
313
+ )
314
+ parser.add_argument(
315
+ "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path"
316
+ )
317
+ parser.add_argument(
318
+ "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path"
319
+ )
320
+ parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -")
321
+ parser.add_argument(
322
+ "-bs", "--batch_size", type=int, required=True, help="batch size"
323
+ )
324
+ parser.add_argument(
325
+ "-e", "--experiment_dir", type=str, required=True, help="experiment dir"
326
+ ) # -m
327
+ parser.add_argument(
328
+ "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k"
329
+ )
330
+ parser.add_argument(
331
+ "-sw",
332
+ "--save_every_weights",
333
+ type=str,
334
+ default="0",
335
+ help="save the extracted model in weights directory when saving checkpoints",
336
+ )
337
+ parser.add_argument(
338
+ "-v", "--version", type=str, required=True, help="model version"
339
+ )
340
+ parser.add_argument(
341
+ "-f0",
342
+ "--if_f0",
343
+ type=int,
344
+ required=True,
345
+ help="use f0 as one of the inputs of the model, 1 or 0",
346
+ )
347
+ parser.add_argument(
348
+ "-l",
349
+ "--if_latest",
350
+ type=int,
351
+ required=True,
352
+ help="if only save the latest G/D pth file, 1 or 0",
353
+ )
354
+ parser.add_argument(
355
+ "-c",
356
+ "--if_cache_data_in_gpu",
357
+ type=int,
358
+ required=True,
359
+ help="if caching the dataset in GPU memory, 1 or 0",
360
+ )
361
+
362
+ args = parser.parse_args()
363
+ name = args.experiment_dir
364
+ experiment_dir = os.path.join("./logs", args.experiment_dir)
365
+
366
+ config_save_path = os.path.join(experiment_dir, "config.json")
367
+ with open(config_save_path, "r") as f:
368
+ config = json.load(f)
369
+
370
+ hparams = HParams(**config)
371
+ hparams.model_dir = hparams.experiment_dir = experiment_dir
372
+ hparams.save_every_epoch = args.save_every_epoch
373
+ hparams.name = name
374
+ hparams.total_epoch = args.total_epoch
375
+ hparams.pretrainG = args.pretrainG
376
+ hparams.pretrainD = args.pretrainD
377
+ hparams.version = args.version
378
+ hparams.gpus = args.gpus
379
+ hparams.train.batch_size = args.batch_size
380
+ hparams.sample_rate = args.sample_rate
381
+ hparams.if_f0 = args.if_f0
382
+ hparams.if_latest = args.if_latest
383
+ hparams.save_every_weights = args.save_every_weights
384
+ hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu
385
+ hparams.data.training_files = "%s/filelist.txt" % experiment_dir
386
+ return hparams
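An illustrative invocation of the training entry point this parser serves (editor's note: the script name and pretrained paths are placeholders; only the flags come from the parser above, and ./logs/my_experiment/config.json must already exist because get_hparams reads it):

python <train_script>.py -e my_experiment -sr 40k -v v2 -f0 1 \
    -bs 8 -te 200 -se 5 -l 0 -c 0 -sw 1 -g 0 \
    -pg <pretrained_G>.pth -pd <pretrained_D>.pth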
387
+
388
+
389
+ def get_hparams_from_dir(model_dir):
390
+ config_save_path = os.path.join(model_dir, "config.json")
391
+ with open(config_save_path, "r") as f:
392
+ data = f.read()
393
+ config = json.loads(data)
394
+
395
+ hparams = HParams(**config)
396
+ hparams.model_dir = model_dir
397
+ return hparams
398
+
399
+
400
+ def get_hparams_from_file(config_path):
401
+ with open(config_path, "r") as f:
402
+ data = f.read()
403
+ config = json.loads(data)
404
+
405
+ hparams = HParams(**config)
406
+ return hparams
407
+
408
+
409
+ def check_git_hash(model_dir):
410
+ source_dir = os.path.dirname(os.path.realpath(__file__))
411
+ if not os.path.exists(os.path.join(source_dir, ".git")):
412
+ logger.warning(
413
+ "{} is not a git repository, therefore hash value comparison will be ignored.".format(
414
+ source_dir
415
+ )
416
+ )
417
+ return
418
+
419
+ cur_hash = subprocess.getoutput("git rev-parse HEAD")
420
+
421
+ path = os.path.join(model_dir, "githash")
422
+ if os.path.exists(path):
423
+ saved_hash = open(path).read()
424
+ if saved_hash != cur_hash:
425
+ logger.warning(
426
+ "git hash values are different. {}(saved) != {}(current)".format(
427
+ saved_hash[:8], cur_hash[:8]
428
+ )
429
+ )
430
+ else:
431
+ open(path, "w").write(cur_hash)
432
+
433
+
434
+ def get_logger(model_dir, filename="train.log"):
435
+ global logger
436
+ logger = logging.getLogger(os.path.basename(model_dir))
437
+ logger.setLevel(logging.DEBUG)
438
+
439
+ formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
440
+ if not os.path.exists(model_dir):
441
+ os.makedirs(model_dir)
442
+ h = logging.FileHandler(os.path.join(model_dir, filename))
443
+ h.setLevel(logging.DEBUG)
444
+ h.setFormatter(formatter)
445
+ logger.addHandler(h)
446
+ return logger
447
+
448
+
449
+ class HParams:
450
+ def __init__(self, **kwargs):
451
+ for k, v in kwargs.items():
452
+ if type(v) == dict:
453
+ v = HParams(**v)
454
+ self[k] = v
455
+
456
+ def keys(self):
457
+ return self.__dict__.keys()
458
+
459
+ def items(self):
460
+ return self.__dict__.items()
461
+
462
+ def values(self):
463
+ return self.__dict__.values()
464
+
465
+ def __len__(self):
466
+ return len(self.__dict__)
467
+
468
+ def __getitem__(self, key):
469
+ return getattr(self, key)
470
+
471
+ def __setitem__(self, key, value):
472
+ return setattr(self, key, value)
473
+
474
+ def __contains__(self, key):
475
+ return key in self.__dict__
476
+
477
+ def __repr__(self):
478
+ return self.__dict__.__repr__()
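HParams above is a thin attribute-style wrapper over a nested config dict; a minimal sketch with illustrative values:

hps = HParams(**{"train": {"batch_size": 4}, "data": {"sampling_rate": 40000}})
print(hps.train.batch_size)     # 4, nested dicts become nested HParams
print("data" in hps)            # True, via __contains__
hps.model_dir = "./logs/demo"   # new attributes can be attached freely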
rvc/lib/uvr5_pack/lib_v5/dataset.py ADDED
@@ -0,0 +1,183 @@
1
+ import os
2
+ import random
3
+
4
+ import numpy as np
5
+ import torch
6
+ import torch.utils.data
7
+ from tqdm import tqdm
8
+
9
+ from . import spec_utils
10
+
11
+
12
+ class VocalRemoverValidationSet(torch.utils.data.Dataset):
13
+ def __init__(self, patch_list):
14
+ self.patch_list = patch_list
15
+
16
+ def __len__(self):
17
+ return len(self.patch_list)
18
+
19
+ def __getitem__(self, idx):
20
+ path = self.patch_list[idx]
21
+ data = np.load(path)
22
+
23
+ X, y = data["X"], data["y"]
24
+
25
+ X_mag = np.abs(X)
26
+ y_mag = np.abs(y)
27
+
28
+ return X_mag, y_mag
29
+
30
+
31
+ def make_pair(mix_dir, inst_dir):
32
+ input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
33
+
34
+ X_list = sorted(
35
+ [
36
+ os.path.join(mix_dir, fname)
37
+ for fname in os.listdir(mix_dir)
38
+ if os.path.splitext(fname)[1] in input_exts
39
+ ]
40
+ )
41
+ y_list = sorted(
42
+ [
43
+ os.path.join(inst_dir, fname)
44
+ for fname in os.listdir(inst_dir)
45
+ if os.path.splitext(fname)[1] in input_exts
46
+ ]
47
+ )
48
+
49
+ filelist = list(zip(X_list, y_list))
50
+
51
+ return filelist
52
+
53
+
54
+ def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
55
+ if split_mode == "random":
56
+ filelist = make_pair(
57
+ os.path.join(dataset_dir, "mixtures"),
58
+ os.path.join(dataset_dir, "instruments"),
59
+ )
60
+
61
+ random.shuffle(filelist)
62
+
63
+ if len(val_filelist) == 0:
64
+ val_size = int(len(filelist) * val_rate)
65
+ train_filelist = filelist[:-val_size]
66
+ val_filelist = filelist[-val_size:]
67
+ else:
68
+ train_filelist = [
69
+ pair for pair in filelist if list(pair) not in val_filelist
70
+ ]
71
+ elif split_mode == "subdirs":
72
+ if len(val_filelist) != 0:
73
+ raise ValueError(
74
+ "The `val_filelist` option is not available in `subdirs` mode"
75
+ )
76
+
77
+ train_filelist = make_pair(
78
+ os.path.join(dataset_dir, "training/mixtures"),
79
+ os.path.join(dataset_dir, "training/instruments"),
80
+ )
81
+
82
+ val_filelist = make_pair(
83
+ os.path.join(dataset_dir, "validation/mixtures"),
84
+ os.path.join(dataset_dir, "validation/instruments"),
85
+ )
86
+
87
+ return train_filelist, val_filelist
88
+
89
+
90
+ def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
91
+ perm = np.random.permutation(len(X))
92
+ for i, idx in enumerate(tqdm(perm)):
93
+ if np.random.uniform() < reduction_rate:
94
+ y[idx] = spec_utils.reduce_vocal_aggressively(
95
+ X[idx], y[idx], reduction_mask
96
+ )
97
+
98
+ if np.random.uniform() < 0.5:
99
+ # swap channel
100
+ X[idx] = X[idx, ::-1]
101
+ y[idx] = y[idx, ::-1]
102
+ if np.random.uniform() < 0.02:
103
+ # mono
104
+ X[idx] = X[idx].mean(axis=0, keepdims=True)
105
+ y[idx] = y[idx].mean(axis=0, keepdims=True)
106
+ if np.random.uniform() < 0.02:
107
+ # inst
108
+ X[idx] = y[idx]
109
+
110
+ if np.random.uniform() < mixup_rate and i < len(perm) - 1:
111
+ lam = np.random.beta(mixup_alpha, mixup_alpha)
112
+ X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
113
+ y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
114
+
115
+ return X, y
116
+
117
+
118
+ def make_padding(width, cropsize, offset):
119
+ left = offset
120
+ roi_size = cropsize - left * 2
121
+ if roi_size == 0:
122
+ roi_size = cropsize
123
+ right = roi_size - (width % roi_size) + left
124
+
125
+ return left, right, roi_size
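A worked example of make_padding (editor's note, illustrative numbers): the spectrogram is padded so that crops of width cropsize, stepped by roi_size, cover it completely.

left, right, roi_size = make_padding(width=1000, cropsize=256, offset=32)
print(left, right, roi_size)    # 32 184 192
# padded width = 32 + 1000 + 184 = 1216, enough for ceil(1000 / 192) = 6 crops of 256 frames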
126
+
127
+
128
+ def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
129
+ len_dataset = patches * len(filelist)
130
+
131
+ X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
132
+ y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
133
+
134
+ for i, (X_path, y_path) in enumerate(tqdm(filelist)):
135
+ X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
136
+ coef = np.max([np.abs(X).max(), np.abs(y).max()])
137
+ X, y = X / coef, y / coef
138
+
139
+ l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
140
+ X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
141
+ y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
142
+
143
+ starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
144
+ ends = starts + cropsize
145
+ for j in range(patches):
146
+ idx = i * patches + j
147
+ X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
148
+ y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
149
+
150
+ return X_dataset, y_dataset
151
+
152
+
153
+ def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
154
+ patch_list = []
155
+ patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
156
+ cropsize, sr, hop_length, n_fft, offset
157
+ )
158
+ os.makedirs(patch_dir, exist_ok=True)
159
+
160
+ for i, (X_path, y_path) in enumerate(tqdm(filelist)):
161
+ basename = os.path.splitext(os.path.basename(X_path))[0]
162
+
163
+ X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
164
+ coef = np.max([np.abs(X).max(), np.abs(y).max()])
165
+ X, y = X / coef, y / coef
166
+
167
+ l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
168
+ X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
169
+ y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
170
+
171
+ len_dataset = int(np.ceil(X.shape[2] / roi_size))
172
+ for j in range(len_dataset):
173
+ outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
174
+ start = j * roi_size
175
+ if not os.path.exists(outpath):
176
+ np.savez(
177
+ outpath,
178
+ X=X_pad[:, :, start : start + cropsize],
179
+ y=y_pad[:, :, start : start + cropsize],
180
+ )
181
+ patch_list.append(outpath)
182
+
183
+ return VocalRemoverValidationSet(patch_list)
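A minimal sketch of building the train/validation pairs with the helpers above (editor's note; the paths are illustrative). In "subdirs" mode the dataset directory is expected to contain training/ and validation/ folders, each with mixtures/ and instruments/ subfolders:

train_filelist, val_filelist = train_val_split(
    dataset_dir="dataset",
    split_mode="subdirs",
    val_rate=0.0,        # unused in "subdirs" mode
    val_filelist=[],
)
print(len(train_filelist), len(val_filelist))   # number of (mixture, instrumental) pairs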