{
  "architectures": [
    "AudioVAE"
  ],
  "dec_kwargs": {
    "backbone": {
      "_attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "attn_implementation": null,
      "bos_token_id": 151643,
      "eos_token_id": 151645,
      "hidden_act": "silu",
      "hidden_size": 896,
      "initializer_range": 0.02,
      "intermediate_size": 4864,
      "is_causal": true,
      "max_position_embeddings": 32768,
      "max_window_layers": 0,
      "model_type": "qwen2",
      "num_attention_heads": 14,
      "num_hidden_layers": 24,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.1",
      "use_cache": false,
      "use_sliding_window": true,
      "vocab_size": 1
    },
    "latent_dim": 64,
    "output_dim": 320
  },
  "enc_kwargs": {
    "backbone": {
      "_attn_implementation": "flash_attention_2",
      "attention_dropout": 0.0,
      "attn_implementation": null,
      "bos_token_id": 151643,
      "eos_token_id": 151645,
      "hidden_act": "silu",
      "hidden_size": 896,
      "initializer_range": 0.02,
      "intermediate_size": 4864,
      "is_causal": true,
      "max_position_embeddings": 32768,
      "max_window_layers": 0,
      "model_type": "qwen2",
      "num_attention_heads": 14,
      "num_hidden_layers": 24,
      "num_key_value_heads": 2,
      "rms_norm_eps": 1e-06,
      "rope_theta": 1000000.0,
      "sliding_window": 32,
      "tie_word_embeddings": true,
      "torch_dtype": "bfloat16",
      "transformers_version": "4.43.1",
      "use_cache": false,
      "use_sliding_window": true,
      "vocab_size": 1
    },
    "hop_size": 320,
    "input_dim": 320,
    "latent_dim": 64
  },
  "hifi_gan_disc_kwargs": {
    "channel_increasing_factor": 4,
    "channels": 16,
    "max_downsample_channels": 512,
    "periods": [
      2,
      3,
      5,
      7,
      11
    ]
  },
  "init_method": "kaiming",
  "lambda_adv": 1.0,
  "lambda_disc": 1.0,
  "lambda_feat_match_loss": 1.0,
  "lambda_mel_loss": 1.0,
  "lambda_semantic": 2.0,
  "patch_size": -1,
  "semantic_module_kwargs": {
    "causal": true,
    "whisper_encoder": {
      "n_ctx": 1500,
      "n_head": 20,
      "n_layer": 32,
      "n_mels": 128,
      "n_state": 1280
    }
  },
  "spec_disc_kwargs": {
    "channels": 32,
    "downsample_scales": [
      2,
      2,
      2
    ],
    "in_channels": 1,
    "kernel_sizes": [
      5,
      3
    ],
    "max_downsample_channels": 512,
    "out_channels": 1,
    "stft_params": {
      "fft_sizes": [
        78,
        126,
        206,
        334,
        542,
        876,
        1418,
        2296
      ],
      "hop_sizes": [
        39,
        63,
        103,
        167,
        271,
        438,
        709,
        1148
      ],
      "win_lengths": [
        78,
        126,
        206,
        334,
        542,
        876,
        1418,
        2296
      ],
      "window": "hann_window"
    },
    "use_weight_norm": true
  },
  "torch_dtype": "bfloat16",
  "transformers_version": "4.52.4"
}