speechbrain
/

soundchoice-g2p

@@ -1,533 +0,0 @@
-# Generated 2022-07-09 from:
-# /notebooks/speechbrain/recipes/LibriSpeech/G2P/hparams/hparams_g2p_rnn.yaml
-# yamllint disable
-# ################################
-# Model: LSTM (encoder) + GRU (decoder) (tokenized)
-# Authors:
-# Loren Lugosch & Mirco Ravanelli 2020
-# Artem Ploujnikov 2021
-# ################################
-# Seed needs to be set at top of yaml, before objects with parameters are made
-seed: 1234
-__set_seed: !apply:torch.manual_seed [1234]
-# Tokenizers
-char_tokenize: false
-char_token_type: unigram  # ["unigram", "bpe", "char"]
-char_token_output: 512
-char_token_wordwise: true
-phn_tokenize: false
-phn_token_type: unigram  # ["unigram", "bpe", "char"]
-phn_token_output: 512  # index(blank/eos/bos/unk) = 0
-phn_token_wordwise: true
-character_coverage: 1.0
-phonemes_count: 43
-graphemes_count: 31
-phonemes_enable_space: true
-# Training Parameters
-lexicon_epochs: 50
-lexicon_ctc_epochs: 10
-lexicon_limit_to_stop: 50                    # No stopping by default, can override
-lexicon_limit_warmup: 50                    # No stopping by default, can override
-sentence_epochs: 13
-sentence_ctc_epochs: 10
-sentence_limit_to_stop: 3
-sentence_limit_warmup: 3
-homograph_epochs: 50
-homograph_ctc_epochs: 10
-homograph_limit_to_stop: 5
-homograph_limit_warmup: 10
-lexicon_batch_size: 1024
-sentence_batch_size: 32
-homograph_batch_size: 32
-ctc_weight: 0.5
-homograph_loss_weight: 2.0
-lr: 0.002
-save_for_pretrained: true
-# Model parameters
-output_neurons: &id004 !apply:speechbrain.utils.hparams.choice
-  value: false
-  choices:
-    true: 513
-    false: 43
-enc_num_embeddings: &id005 !apply:speechbrain.utils.hparams.choice
-  value: false
-  choices:
-    true: 513
-    false: 31
-enc_dropout: 0.5
-enc_neurons: 512
-enc_num_layers: 4
-dec_dropout: 0.5
-dec_neurons: 512
-dec_att_neurons: 256
-dec_num_layers: 4
-embedding_dim: 512
-# Determines whether to use BOS (beginning-of-sequence) or EOS (end-of-sequence) tokens
-# Available modes:
-# raw: no BOS/EOS tokens are added
-# bos: a beginning-of-sequence token is added
-# eos: an end-of-sequence token is added
-grapheme_sequence_mode: bos
-phoneme_sequence_mode: bos
-# Special Token information
-bos_index: 0
-eos_index: 1
-blank_index: 2
-unk_index: 2
-token_space_index: 512
-# Language Model
-lm_emb_dim: 256 # dimension of the embeddings
-lm_rnn_size: 512 # dimension of hidden layers
-lm_layers: 2 # number of hidden layers
-lm_output_neurons: 43
-# Beam Searcher
-use_language_model: false
-beam_search_min_decode_ratio: 0
-beam_search_max_decode_ratio: 1.0
-beam_search_beam_size: 16
-beam_search_beam_size_valid: 16
-beam_search_eos_threshold: 10.0
-beam_search_using_max_attn_shift: false
-beam_search_max_attn_shift: 10
-beam_search_coverage_penalty: 5.0
-beam_search_lm_weight: 0.5
-beam_search_ctc_weight_decode: 0.4
-beam_search_temperature: 1.25
-beam_search_temperature_lm: 1.0
-# Word embeddings
-use_word_emb: true
-word_emb_model: bert-base-uncased
-word_emb_dim: 768
-word_emb_enc_dim: 256
-word_emb_norm_type: batch
-graphemes: &id028
-- A
-- B
-- C
-- D
-- E
-- F
-- G
-- H
-- I
-- J
-- K
-- L
-- M
-- N
-- O
-- P
-- Q
-- R
-- S
-- T
-- U
-- V
-- W
-- X
-- Y
-- Z
-- "'"
-- ' '
-phonemes: &id001
-- AA
-- AE
-- AH
-- AO
-- AW
-- AY
-- B
-- CH
-- D
-- DH
-- EH
-- ER
-- EY
-- F
-- G
-- HH
-- IH
-- IY
-- JH
-- K
-- L
-- M
-- N
-- NG
-- OW
-- OY
-- P
-- R
-- S
-- SH
-- T
-- TH
-- UH
-- UW
-- V
-- W
-- Y
-- Z
-- ZH
-- ' '
-enc_input_dim: &id003 !apply:speechbrain.lobes.models.g2p.model.input_dim
-  use_word_emb: true
-  word_emb_enc_dim: 256
-  embedding_dim: 512
-phn_char_map: &id002 !apply:speechbrain.lobes.models.g2p.dataio.build_token_char_map
-# Models
-  tokens: *id001
-char_phn_map: &id023 !apply:speechbrain.lobes.models.g2p.dataio.flip_map
-  map_dict: *id002
-enc: &id006 !new:speechbrain.nnet.RNN.LSTM
-  input_shape: [null, null, *id003]
-  bidirectional: true
-  hidden_size: 512
-  num_layers: 4
-  dropout: 0.5
-lin: &id010 !new:speechbrain.nnet.linear.Linear
-  input_size: 512
-  n_neurons: *id004
-  bias: false
-ctc_lin: &id013 !new:speechbrain.nnet.linear.Linear
-  input_size: 1024
-  n_neurons: *id004
-encoder_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
-  num_embeddings: *id005
-  embedding_dim: 512
-emb: &id008 !new:speechbrain.nnet.embedding.Embedding
-  num_embeddings: *id004
-  embedding_dim: 512
-dec: &id009 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
-  enc_dim: 1024
-  input_size: 512
-  rnn_type: gru
-  attn_type: content
-  dropout: 0.5
-  hidden_size: 512
-  attn_dim: 256
-  num_layers: 4
-word_emb_enc: &id012 !new:speechbrain.lobes.models.g2p.model.WordEmbeddingEncoder
-  word_emb_dim: 768
-  word_emb_enc_dim: 256
-  norm_type: batch
-word_emb: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
-  init: !name:speechbrain.wordemb.transformer.TransformerWordEmbeddings
-    model: bert-base-uncased
-log_softmax: &id011 !new:speechbrain.nnet.activations.Softmax
-  apply_log: true
-modules:
-  model: &id014 !new:speechbrain.lobes.models.g2p.model.AttentionSeq2Seq
-    enc: *id006
-    encoder_emb: *id007
-    emb: *id008
-    dec: *id009
-    lin: *id010
-    out: *id011
-    use_word_emb: true
-    word_emb_enc: *id012
-  enc: *id006
-  encoder_emb: *id007
-  emb: *id008
-  dec: *id009
-  lin: *id010
-  ctc_lin: *id013
-  out: *id011
-  word_emb:
-  word_emb_enc: *id012
-model: *id014
-lm_model: &id015 !new:speechbrain.lobes.models.RNNLM.RNNLM
-  embedding_dim: 256
-  rnn_layers: 2
-  rnn_neurons: 512
-  output_neurons: 43
-  return_hidden: true
-opt_class: !name:torch.optim.Adam
-  lr: 0.002
-# Scorer
-ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
-    eos_index: 1
-    blank_index: 2
-    ctc_fc: *id013
-rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
-   language_model: !ref <lm_model>
-   temperature: 1.0
-coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
-   vocab_size: !ref <output_neurons>
-scorer_train: !new:speechbrain.decoders.scorer.ScorerBuilder
-   full_scorers: [!ref <coverage_scorer>,
-                  !ref <ctc_scorer>]
-   weights:
-      coverage: 5.0
-      ctc: 0.4
-beam_searcher: &id029 !new:speechbrain.decoders.S2SRNNBeamSearcher
-  embedding: *id008
-  decoder: *id009
-  linear: *id010
-  bos_index: 0
-  eos_index: 1
-  min_decode_ratio: 0
-  max_decode_ratio: 1.0
-  beam_size: 16
-  eos_threshold: 10.0
-  using_max_attn_shift: false
-  max_attn_shift: 10
-  scorer: !ref <scorer_train>
-scorer_valid: !new:speechbrain.decoders.scorer.ScorerBuilder
-   full_scorers: [!ref <coverage_scorer>,
-                  !ref <ctc_scorer>]
-   weights:
-      coverage: 5.0
-      ctc: 0.
-beam_searcher_valid: !new:speechbrain.decoders.S2SRNNBeamSearcher
-  embedding: *id008
-  decoder: *id009
-  linear: *id010
-  bos_index: 0
-  eos_index: 1
-  min_decode_ratio: 0
-  max_decode_ratio: 1.0
-  beam_size: 16
-  eos_threshold: 10.0
-  using_max_attn_shift: false
-  max_attn_shift: 10
-  scorer: !ref <scorer_valid>
-scorer_test: !new:speechbrain.decoders.scorer.ScorerBuilder
-   full_scorers: [!ref <coverage_scorer>,
-                  !ref <rnnlm_scorer>,
-                  !ref <ctc_scorer>]
-   weights:
-      coverage: 5.0
-      rnnlm: 0.5
-      ctc: 0.4
-beam_searcher_lm: !new:speechbrain.decoders.seq2seq.S2SRNNBeamSearcher
-  embedding: *id008
-  decoder: *id009
-  linear: *id010
-  bos_index: 0
-  eos_index: 1
-  min_decode_ratio: 0
-  max_decode_ratio: 1.0
-  beam_size: 16
-  eos_threshold: 10.0
-  using_max_attn_shift: false
-  max_attn_shift: 10
-  temperature: 1.25
-  scorer: !ref <scorer_test>
-lr_annealing: &id018 !new:speechbrain.nnet.schedulers.NewBobScheduler
-  initial_value: 0.002
-  improvement_threshold: 0.0
-  annealing_factor: 0.8
-  patient: 0
-homograph_extractor: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceExtractor
-seq_cost: &id016 !name:speechbrain.nnet.losses.nll_loss
-  label_smoothing: 0.1
-ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
-  blank_index: 2
-seq_cost_metric: &id017 !name:speechbrain.nnet.losses.nll_loss
-  label_smoothing: 0.1
-  reduction: batch
-homograph_cost: !new:speechbrain.lobes.models.g2p.homograph.SubsequenceLoss
-  seq_cost: *id016
-seq_stats: !name:speechbrain.utils.metric_stats.MetricStats
-  metric: *id017
-seq_stats_homograph: !name:speechbrain.utils.metric_stats.MetricStats
-  metric: *id017
-classification_stats_homograph: !name:speechbrain.utils.metric_stats.ClassificationStats
-per_stats: !name:speechbrain.utils.metric_stats.ErrorRateStats
-per_stats_homograph: !name:speechbrain.utils.metric_stats.ErrorRateStats
-model_output_keys:
-- p_seq
-- char_lens
-- encoder_out
-grapheme_encoder: &id027 !new:speechbrain.dataio.encoder.TextEncoder
-phoneme_encoder: &id024 !new:speechbrain.dataio.encoder.TextEncoder
-grapheme_tokenizer: !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
-  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
-    model_dir: grapheme_tokenizer
-    bos_id: 0
-    eos_id: 1
-    unk_id: 2
-    vocab_size: 512
-    annotation_train: tokenizer_annotation_train.json
-    annotation_read: char
-    model_type: unigram                    # ["unigram", "bpe", "char"]
-    character_coverage: 1.0
-    annotation_format: json
-    text_file: grapheme_annotations.txt
-phoneme_tokenizer: &id022 !apply:speechbrain.lobes.models.g2p.dataio.lazy_init
-  init: !name:speechbrain.tokenizers.SentencePiece.SentencePiece
-    model_dir: phoneme_tokenizer
-    bos_id: 0
-    eos_id: 1
-    unk_id: 2
-    vocab_size: 512
-    annotation_train: tokenizer_annotation_train.json
-    annotation_read: phn
-    model_type: unigram                   # ["unigram", "bpe", "char"]
-    character_coverage: 1.0
-    annotation_list_to_check: [tokenizer_annotation_valid.json]
-    annotation_format: json
-    text_file: phoneme_annotations.txt
-out_phoneme_decoder_tok: &id025 !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
-  tokenizer: *id022
-  char_map: *id023
-  token_space_index: 512
-  wordwise: true
-out_phoneme_decoder_raw: &id026 !name:speechbrain.lobes.models.g2p.dataio.text_decode
-  encoder: *id024
-out_phoneme_decoder: !apply:speechbrain.utils.hparams.choice
-  value: false
-  choices:
-    true: *id025
-    false: *id026
-encode_pipeline:
-  batch: false
-  use_padded_data: true
-  output_keys:
-  - grapheme_list
-  - grapheme_encoded_list
-  - grapheme_encoded
-  - word_emb
-  init:
-  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
-      encoder: *id027
-      tokens: *id028
-      bos_index: 0
-      eos_index: 1
-  - func: !name:speechbrain.lobes.models.g2p.dataio.enable_eos_bos
-      encoder: *id024
-      tokens: *id001
-      bos_index: 0
-      eos_index: 1
-  steps:
-  - func: !name:speechbrain.lobes.models.g2p.dataio.clean_pipeline
-      graphemes: *id028
-    takes: txt
-    provides: txt_cleaned
-  - func: !name:speechbrain.lobes.models.g2p.dataio.grapheme_pipeline
-      grapheme_encoder: *id027
-    takes: txt_cleaned
-    provides:
-    - grapheme_list
-    - grapheme_encoded_list
-    - grapheme_encoded_raw
-  - func: !name:speechbrain.lobes.models.g2p.dataio.add_bos_eos
-      encoder: *id027
-    takes: grapheme_encoded_list
-    provides:
-    - grapheme_encoded
-    - grapheme_len
-    - grapheme_encoded_eos
-    - grapheme_len_eos
-  - func: !name:speechbrain.lobes.models.g2p.dataio.word_emb_pipeline
-      word_emb: !ref <word_emb>
-      grapheme_encoder: !ref <grapheme_encoder>
-      use_word_emb: !ref <use_word_emb>
-    takes:
-    - txt
-    - grapheme_encoded
-    - grapheme_len
-    provides: word_emb
-decode_pipeline:
-  batch: true
-  output_keys:
-  - phonemes
-  steps:
-  - func: !name:speechbrain.lobes.models.g2p.dataio.beam_search_pipeline
-      beam_searcher: *id029
-    takes:
-    - char_lens
-    - encoder_out
-    provides:
-    - hyps
-    - scores
-  - func: !apply:speechbrain.utils.hparams.choice
-      value: false
-      choices:
-        true: !apply:speechbrain.lobes.models.g2p.dataio.char_map_detokenize
-          tokenizer: *id022
-          char_map: *id023
-          token_space_index: 512
-          wordwise: true
-        false: !name:speechbrain.lobes.models.g2p.dataio.phoneme_decoder_pipeline
-          phoneme_encoder: *id024
-    takes:
-    - hyps
-    provides:
-    - phonemes
-pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
-  loadables:
-    model: *id014
-    ctc_lin: *id013