Upload 10 files

Browse files

Files changed (6) hide show

config.json +4 -1
generation_config.json +14 -0
generation_utils.py +162 -0
modeling_telechat.py +129 -59
tokenization_telechat.py +220 -0
tokenizer_config.json +2 -2

config.json CHANGED Viewed

@@ -24,6 +24,7 @@
   "offset_alibi": 100,
   "pad_token_id": 3,
   "pretraining_tp": 2,
   "skip_bias_add": true,
   "skip_bias_add_qkv": false,
   "slow_but_exact": false,
@@ -35,6 +36,8 @@
   "flash_attn":true,
   "tie_word_embeddings":false,
   "training_seqlen":8192,
-  "base_seqlen":8192
 }

   "offset_alibi": 100,
   "pad_token_id": 3,
   "pretraining_tp": 2,
+  "seq_length": 8192,
   "skip_bias_add": true,
   "skip_bias_add_qkv": false,
   "slow_but_exact": false,
   "flash_attn":true,
   "tie_word_embeddings":false,
   "training_seqlen":8192,
+  "logn":false,
+  "semi_causal":false,
+  "embed_layernorm":false
 }

generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "max_length": 8192,
+  "do_sample": false,
+  "use_cache": true,
+  "temperature": 0.3,
+  "top_k": 5,
+  "top_p": 0.85,
+  "repetition_penalty": 1.01,
+  "pad_token_id": 3,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "user_token_id": 20,
+  "bot_token_id": 21
+}

generation_utils.py ADDED Viewed

	@@ -0,0 +1,162 @@

+from typing import Optional
+from collections import deque
+from queue import Queue
+import copy
+class History:
+    """
+    Ordered chat history backed by a deque of message dicts.
+
+    Every message is a dict with a "content" entry (chat code also stores a "role").
+    Whatever mapping ``tokenizer(content)`` returns is merged into the message; the
+    rest of this file reads "input_ids" and "attention_mask" from it.
+    """
+    def __init__(self, tokenizer, history):
+        '''
+        init from a list of dict
+        '''
+        # use deque to meet some special situation (cheap pops/appends at both ends)
+        self.input_history = deque()
+        self.tokenizer = tokenizer
+        if history:
+            self._transfer_from_list(history)
+    def _transfer_from_list(self, history):
+        """Tokenize every message of ``history`` and append it; the dicts are updated in place."""
+        for message in history:
+            content = message.get("content")
+            # the token result may not be equal to the result model gen
+            message.update(self.tokenizer(content))
+            self.input_history.append(message)
+    def _ensure_tokenized(self, message):
+        """Tokenize ``message`` in place unless it already carries both token fields."""
+        if "input_ids" not in message or "attention_mask" not in message:
+            message.update(self.tokenizer(message.get("content")))
+        return message
+    def append(self, message):
+        """Add ``message`` as the newest entry, tokenizing it first if needed."""
+        self.input_history.append(self._ensure_tokenized(message))
+    def append_left(self, message):
+        """Add ``message`` as the oldest entry, tokenizing it first if needed."""
+        self.input_history.appendleft(self._ensure_tokenized(message))
+    def pop(self):
+        """Remove and return the newest message (IndexError when empty)."""
+        return self.input_history.pop()
+    def pop_left(self):
+        """Remove and return the oldest message (IndexError when empty)."""
+        # Fixed: the previous body called self.pop_left() and recursed forever.
+        return self.input_history.popleft()
+    def update(self, message):
+        """Replace the newest message with ``message``."""
+        self.input_history.pop()
+        self.append(message)
+    def __len__(self):
+        return len(self.input_history)
+    def __str__(self):
+        return str(self.input_history)
+    def __copy__(self):
+        # Shallow copy: a new deque that shares the message dicts and the tokenizer.
+        new_instance = type(self)(self.tokenizer, [])
+        new_instance.input_history = copy.copy(self.input_history)
+        return new_instance
+    def __deepcopy__(self, memodict=None):
+        # No mutable default argument; forward the memo so shared/cyclic references
+        # are copied once. The tokenizer is deliberately shared, not copied.
+        if memodict is None:
+            memodict = {}
+        new_instance = type(self)(self.tokenizer, [])
+        memodict[id(self)] = new_instance
+        new_instance.input_history = copy.deepcopy(self.input_history, memodict)
+        return new_instance
+class TelechatIterTextStreamer:
+    """
+    With reference to the TextIterStreamers in transformers, we have rewritten this class
+
+    ``put``/``end`` (called by the generating side) push ``(text, history)`` tuples into an
+    internal queue; iterating the streamer pops them until the stop signal is seen.
+    The last message of ``history`` is kept in sync with the text decoded so far.
+    """
+    def __init__(
+            self, tokenizer, history: History = None, skip_prompt: bool = False, timeout: Optional[float] = None,
+            **decode_kwargs
+    ):
+        self.tokenizer = tokenizer
+        # Compare against None explicitly: History defines __len__, so an empty History is
+        # falsy. Previously the default history=None crashed on the append below.
+        self.history = history if history is not None else History(tokenizer, [])
+        self.skip_prompt = skip_prompt
+        self.timeout = timeout
+        self.decode_kwargs = decode_kwargs
+        self.text_queue = Queue()
+        # number of consecutive put() calls whose decoded text was held back
+        self.cache_time = 0
+        # text / tokens accumulated since the start of this generation
+        self.text_until = ""
+        self.token_until = []
+        self.stop_signal = None
+        self.next_tokens_are_prompt = True
+        # placeholder bot message that on_finalized_text() keeps replacing
+        self.history.append({"role": "bot", "content": self.text_until})
+    def put(self, value):
+        """
+        put printable text into queue
+
+        ``value`` is a tensor of token ids (batch size 1 only). The first call is dropped
+        when ``skip_prompt`` is set, and a chunk ending in EOS is dropped entirely.
+        """
+        if len(value.shape) > 1 and value.shape[0] > 1:
+            raise ValueError("TextStreamer only supports batch size 1")
+        elif len(value.shape) > 1:
+            value = value[0]
+        if self.skip_prompt and self.next_tokens_are_prompt:
+            self.next_tokens_are_prompt = False
+            return
+        if value[-1] == self.tokenizer.eos_token_id:
+            return
+        # there may be some smart way to decode.
+        self.token_until.extend(value.tolist())
+        text = self.tokenizer.decode(self.token_until, **self.decode_kwargs)
+        if not (self._is_printable(text) or self.cache_time >= 6):
+            # Probably an incomplete multi-byte character: wait for more tokens.
+            self.cache_time += 1
+            return
+        output_text = text[len(self.text_until):]
+        self.text_until = text
+        # Reset the hold counter after every flush; otherwise, once six chunks had been
+        # held in total, the printable check was bypassed for the rest of the stream.
+        self.cache_time = 0
+        self.on_finalized_text(output_text)
+    def end(self):
+        """Flushes any remaining text, queues the stop signal and resets the streamer state."""
+        # Flush the cache, if it exists
+        text = self.tokenizer.decode(self.token_until, **self.decode_kwargs)
+        output_text = text[len(self.text_until):]
+        self.text_until = text
+        self.on_finalized_text(output_text, stream_end=True)
+        self.clear_cache()
+    def clear_cache(self):
+        """Drop all per-generation state, including the reference to the history."""
+        self.cache_time = 0
+        self.token_until = []
+        self.text_until = ""
+        self.history = None
+        self.next_tokens_are_prompt = True
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the text tuple in the queue."""
+        # Replace the trailing bot message with the text/tokens decoded so far.
+        self.history.update({"role": "bot", "content": self.text_until, "input_ids": self.token_until,
+                             "attention_mask": [1] * len(self.token_until)})
+        self.text_queue.put((text, self.history), timeout=self.timeout)
+        if stream_end:
+            self.text_queue.put((self.stop_signal, self.history), timeout=self.timeout)
+    @staticmethod
+    def _is_printable(cp):
+        """Checks whether tokens can be decoded or not"""
+        # The Unicode replacement character marks bytes that could not be decoded yet.
+        return "�" not in cp
+    def __iter__(self):
+        return self
+    def __next__(self):
+        value_now, history_until = self.text_queue.get(timeout=self.timeout)
+        # The stop signal is the None sentinel, so test identity rather than equality.
+        if value_now is self.stop_signal:
+            raise StopIteration()
+        return value_now, history_until

modeling_telechat.py CHANGED Viewed

@@ -1,4 +1,3 @@
 # coding=utf-8
 # Copyright 2022 HuggingFace Inc. team and BigScience workshop.
 #
@@ -34,15 +33,15 @@
 # limitations under the License.
 """PyTorch TELECHAT model."""
 import warnings
-from typing import Optional, Tuple, Union
 import torch
 import math
 from torch import nn
 import torch.utils.checkpoint
 from torch.nn import functional as F
@@ -53,8 +52,10 @@ from transformers.modeling_outputs import (
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
 from .configuration_telechat import TelechatConfig
 logger = logging.get_logger(__name__)
@@ -78,63 +79,56 @@ except ImportError:
         flash_attn_unpadded_func = None
 class RotaryEmbedding(torch.nn.Module):
     # Extracted from: https://github.com/EleutherAI/gpt-neox
-    def __init__(self, dim ,config, base=10000, precision=torch.half):
         super().__init__()
         self.config = config
         self.dim = dim
         self.base = base
-        self.inv_freq = 1. / (base ** (torch.arange(0, dim, 2).float().half() / dim)).cuda()
         self.max_seq_len_cached = None
         self.cos_cached = None
         self.sin_cached = None
-        self.precision = precision
-    def get_mscale(self,scale=1):
         if scale <= 1:
             return 1.0
         return 0.1 * math.log(scale) + 1.0
     def get_ntk_alpha(self, true_seq_len):
-        context_value = math.log(true_seq_len / self.config.base_seqlen, 2) + 1
-        # ntk_alpha = 2 ** context_value - 1
         ntk_alpha = 2 ** math.ceil(context_value) - 1
         ntk_alpha = max(ntk_alpha, 1)
         return ntk_alpha
-    def forward(self, x, seq_dim=0, seq_len=None):
-        if seq_len is None:
-            seq_len = x.shape[seq_dim]
-        seq_len = max(seq_len, self.config.training_seqlen)
         ntk_alpha = self.get_ntk_alpha(seq_len)
-        self.mscale = float(self.get_mscale(seq_len / self.config.training_seqlen))
-        if True:
-            base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
-            self.inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=x.device).float( )/ self.dim ))
-            self.max_seq_len_cached = seq_len
-            t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
-            freqs = torch.einsum('i,j->ij', t, self.inv_freq)
-            # Different from paper, but it uses a different permutation in order to obtain the same calculation
-            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
-            if self.precision == torch.bfloat16:
-                emb = emb.float()
-            # [sx, 1 (b * np), hn]
-            self.cos_cached = self.mscale *emb.cos()[:, None, :].half()
-            self.sin_cached = self.mscale *emb.sin()[:, None, :].half()
-            if self.precision == torch.bfloat16:
-                self.cos_cached = self.cos_cached.bfloat16()
-                self.sin_cached = self.sin_cached.bfloat16()
         return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
 # rotary pos emb helpers:
 def rotate_half(x):
     x1, x2 = x[..., :x.shape[-1] // 2], x[..., x.shape[-1] // 2:]
     return torch.cat((-x2, x1), dim=x1.ndim - 1)  # dim=-1 triggers a bug in earlier torch versions
 def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0):  # jitting fails with bf16
     cos, sin = cos[offset:q.shape[0] + offset, ...], sin[offset:q.shape[0] + offset, ...]
     return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
@@ -192,7 +186,6 @@ class FlashSelfAttention(torch.nn.Module):
         q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]]
         cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
                                     device=q.device)
-        self.training = False
         if self.training:
             # during training q,k,v always have same seqlen
             assert seqlen_k == seqlen_q
@@ -218,7 +211,6 @@ class FlashSelfAttention(torch.nn.Module):
         return output
 def _make_causal_mask(
         input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
 ) -> torch.BoolTensor:
@@ -249,7 +241,6 @@ def _expand_mask(mask: torch.Tensor, tgt_length: int) -> torch.BoolTensor:
     return expanded_mask.expand(batch_size, 1, tgt_length, src_length)
 def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
     """
     Dropout add function
@@ -332,7 +323,7 @@ class TelechatGelu(nn.Module):
 class TelechatAttention(nn.Module):
-    def __init__(self, config: TelechatConfig ,layer_idx):
         super().__init__()
         self.kv_cache = None
         self.layer_idx = layer_idx
@@ -361,16 +352,13 @@ class TelechatAttention(nn.Module):
         self.key_value = nn.Linear(self.hidden_size, kv_projection_size * 2, bias=False)
         self.dense = nn.Linear(self.hidden_size, self.hidden_size)
         self.attention_dropout = nn.Dropout(config.attention_dropout)
-        self.rotary_emb = RotaryEmbedding(self.head_dim ,config=config)
         self.core_attention_flash = FlashSelfAttention(
             causal=True, attention_dropout=config.attention_dropout
         )
         self.last_key_layer = None
-        #logn_list = [math.log(i, 4096) if i > 4096 else 1 for i in range(1, 32768)]
-        #self.logn_tensor = torch.tensor(logn_list)[None, :, None, None].half().cuda()
     def repeat_kv(self, hidden_states, n_rep):
         slen, batch, num_key_value_heads_per_partition, head_dim = hidden_states.shape
@@ -440,27 +428,26 @@ class TelechatAttention(nn.Module):
         seq_len = key_layer.shape[0]
         offset = 0
-        if  use_cache and layer_past != None:
-            past_key, past_value  = layer_past
             offset = past_key.shape[0]
             seq_len += offset
-        cos, sin = self.rotary_emb(value_layer, seq_len=seq_len)
         query_layer, key_layer = apply_rotary_fn(query_layer, key_layer, cos, sin, offset=offset)
         if use_cache:
             if layer_past != None:
                 past_key, past_value = layer_past
-                key_layer = torch.cat((past_key, key_layer[-1, ...].unsqueeze(0)) ,dim=0)
-                value_layer = torch.cat((past_value ,value_layer[-1 ,...].unsqueeze(0)) ,dim = 0)
-            layer_past = key_layer ,value_layer
         s, bz, head, dim = value_layer.shape
         s_key = key_layer.shape[0]
         s_query = query_layer.shape[0]
         query_layer = query_layer.reshape((s_query, bz, head, dim))
         key_layer = key_layer.reshape((s_key, bz, head, dim))
         if self.config.flash_attn:
             q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() for x in
                        (query_layer, key_layer, value_layer)]
@@ -468,22 +455,23 @@ class TelechatAttention(nn.Module):
             context_layer = rearrange(context_layer, 'b s h d -> b s (h d)').contiguous()
         else:
             ##[sq, b, np, hn] -> [sq, b * np, hn]
-            query_layer = query_layer.reshape(s_query ,bz * self.num_heads, dim)
             # [sk, b, np, hn] -> [sk, b * np, hn]
             key_layer = key_layer.reshape(s_key, bz * self.num_heads, dim)
-            matmul_result = self.inv_norm_factor * torch.einsum('bik,bkj->bij', query_layer.transpose(0, 1), key_layer.transpose(0, 1).transpose(1, 2))
             attention_scores = matmul_result.view(bz, self.num_heads, s_query, s_key)
             input_dtype = attention_scores.dtype
-            if input_dtype == torch.float16:
                 attention_scores = attention_scores.to(torch.float)
             attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min)
             attention_probs = F.softmax(attn_weights, dim=-1).to(input_dtype)  ##dtype = torch.float32
             attention_probs = self.attention_dropout(attention_probs)
             attention_probs_reshaped = attention_probs.view(bz * self.num_heads, s_query, s_key)
-            value_layer = value_layer.reshape(s_key ,bz * self.num_heads, dim)
             context_layer = torch.bmm(attention_probs_reshaped, value_layer.transpose(0, 1))
             context_layer = self._merge_heads(context_layer)
@@ -497,6 +485,7 @@ class TelechatAttention(nn.Module):
         return output_tensor, layer_past
 class TelechatMLP(nn.Module):
     def __init__(self, config: TelechatConfig):
         super().__init__()
@@ -513,14 +502,14 @@ class TelechatMLP(nn.Module):
 class TelechatBlock(nn.Module):
-    def __init__(self, config: TelechatConfig ,layer_idx):
         super().__init__()
         hidden_size = config.hidden_size
         self.input_layernorm = MixedFusedRMSNorm(hidden_size, eps=config.layer_norm_epsilon)
         self.num_heads = config.n_head
         self.layer_idx = layer_idx
-        self.self_attention = TelechatAttention(config ,layer_idx)
         self.post_attention_layernorm = MixedFusedRMSNorm(hidden_size, eps=config.layer_norm_epsilon)
         self.mlp = TelechatMLP(config)
@@ -611,12 +600,11 @@ class TelechatModel(TelechatPreTrainedModel):
         if self.config.embed_layernorm:
             self.word_embeddings_layernorm = MixedFusedRMSNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.h = nn.ModuleList([TelechatBlock(config ,_) for _ in range(config.num_hidden_layers)])
         self.ln_f = MixedFusedRMSNorm(self.embed_dim, eps=config.layer_norm_epsilon)
         self.gradient_checkpointing = False
         self.post_init()
     def get_input_embeddings(self):
         return self.word_embeddings
@@ -661,7 +649,6 @@ class TelechatModel(TelechatPreTrainedModel):
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if input_ids is not None:
             batch_size, seq_length = input_ids.shape
         elif inputs_embeds is not None:
@@ -670,7 +657,6 @@ class TelechatModel(TelechatPreTrainedModel):
         if past_key_values is None:
             past_key_values = tuple([None] * len(self.h))
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
         hidden_states = inputs_embeds
@@ -750,7 +736,8 @@ class TelechatModel(TelechatPreTrainedModel):
 class TelechatForCausalLM(TelechatPreTrainedModel):
     # _tied_weights_keys = ["lm_head.weight"]
-    _keys_to_ignore_on_load_missing = [ r"lm_head.weight"]
     def __init__(self, config: TelechatConfig):
         super().__init__(config)
         self.transformer = TelechatModel(config)
@@ -838,3 +825,86 @@ class TelechatForCausalLM(TelechatPreTrainedModel):
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )

 # coding=utf-8
 # Copyright 2022 HuggingFace Inc. team and BigScience workshop.
 #
 # limitations under the License.
 """PyTorch TELECHAT model."""
 import warnings
+from typing import Optional, Tuple, Union, List, Dict
+from threading import Thread
 import torch
 import math
+import copy
 from torch import nn
 import torch.utils.checkpoint
 from torch.nn import functional as F
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import logging
+from transformers import GenerationConfig
 from .configuration_telechat import TelechatConfig
+from .generation_utils import History, TelechatIterTextStreamer
 logger = logging.get_logger(__name__)
         flash_attn_unpadded_func = None
 class RotaryEmbedding(torch.nn.Module):
     # Extracted from: https://github.com/EleutherAI/gpt-neox
     # Rebuilds the rotary cos/sin tables on every forward call; the frequency base is
     # rescaled by an NTK alpha factor derived from the (possibly clamped) sequence length.
+    def __init__(self, dim, config, base=10000):
         super().__init__()
         self.config = config
         self.dim = dim
         self.base = base
         # Filled in by forward(); they hold the tables of the most recent call.
         self.max_seq_len_cached = None
         self.cos_cached = None
         self.sin_cached = None
+    def get_mscale(self, scale=1):
         # Magnitude factor applied to cos/sin: 1.0 up to scale 1, then 0.1 * ln(scale) + 1.
         if scale <= 1:
             return 1.0
         return 0.1 * math.log(scale) + 1.0
     def get_ntk_alpha(self, true_seq_len):
         # alpha = 2 ** ceil(log2(true_seq_len / 4096) + 1) - 1, never below 1.
+        context_value = math.log(true_seq_len / 4096, 2) + 1
         ntk_alpha = 2 ** math.ceil(context_value) - 1
         ntk_alpha = max(ntk_alpha, 1)
         return ntk_alpha
+    def forward(self, x, dtype, seq_dim=0):
         # Returns (cos, sin), each cast to `dtype` and sliced to seq_len rows.
         # x is only used for its length along seq_dim and for its device.
+        seq_len = x.shape[seq_dim]
+        self.mscale = 1.0
         # Outside training mode the table covers at least config.training_seqlen positions,
         # and mscale grows with the ratio seq_len / training_seqlen.
+        if not self.training:
+            seq_len = max(seq_len, self.config.training_seqlen)
+            self.mscale = float(self.get_mscale(seq_len / self.config.training_seqlen))
         ntk_alpha = self.get_ntk_alpha(seq_len)
+        base = self.base * ntk_alpha ** (self.dim / (self.dim - 2))
+        self.inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, device=x.device).float() / self.dim))
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype)
+        freqs = torch.einsum('i,j->ij', t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
+        # Keep the angles in float32 before cos/sin when the target dtype is bfloat16.
+        # NOTE(review): emb already looks float32 here (inv_freq is built with .float()) - confirm.
+        emb = emb.float() if dtype == torch.bfloat16 else emb
+        # [sx, 1 (b * np), hn]
+        self.cos_cached = self.mscale * emb.cos()[:, None, :].to(dtype)
+        self.sin_cached = self.mscale * emb.sin()[:, None, :].to(dtype)
         return self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...]
 # rotary pos emb helpers:
 def rotate_half(x):
     """Split the last dimension in two halves (a, b) and return (-b, a) concatenated."""
     half = x.shape[-1] // 2
     front = x[..., :half]
     back = x[..., half:]
     # Spell out the last axis index: dim=-1 triggers a bug in earlier torch versions.
     last_axis = front.ndim - 1
     return torch.cat((-back, front), dim=last_axis)
 def apply_rotary_pos_emb_torch(q, k, cos, sin, offset: int = 0):  # jitting fails with bf16
     """Rotate q and k with the cos/sin rows [offset, offset + q.shape[0]); returns (q_rot, k_rot)."""
     stop = q.shape[0] + offset
     cos = cos[offset:stop, ...]
     sin = sin[offset:stop, ...]
     q_rotated = (q * cos) + (rotate_half(q) * sin)
     k_rotated = (k * cos) + (rotate_half(k) * sin)
     return q_rotated, k_rotated
         q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]]
         cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
                                     device=q.device)
         if self.training:
             # during training q,k,v always have same seqlen
             assert seqlen_k == seqlen_q
         return output
 def _make_causal_mask(
         input_ids_shape: torch.Size, device: torch.device, past_key_values_length: int
 ) -> torch.BoolTensor:
     return expanded_mask.expand(batch_size, 1, tgt_length, src_length)
 def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
     """
     Dropout add function
 class TelechatAttention(nn.Module):
+    def __init__(self, config: TelechatConfig, layer_idx):
         super().__init__()
         self.kv_cache = None
         self.layer_idx = layer_idx
         self.key_value = nn.Linear(self.hidden_size, kv_projection_size * 2, bias=False)
         self.dense = nn.Linear(self.hidden_size, self.hidden_size)
         self.attention_dropout = nn.Dropout(config.attention_dropout)
+        self.rotary_emb = RotaryEmbedding(self.head_dim, config=config)
         self.core_attention_flash = FlashSelfAttention(
             causal=True, attention_dropout=config.attention_dropout
         )
         self.last_key_layer = None
     def repeat_kv(self, hidden_states, n_rep):
         slen, batch, num_key_value_heads_per_partition, head_dim = hidden_states.shape
         seq_len = key_layer.shape[0]
         offset = 0
+        if use_cache and layer_past != None:
+            past_key, past_value = layer_past
             offset = past_key.shape[0]
             seq_len += offset
+        cos, sin = self.rotary_emb(value_layer, dtype=value_layer.dtype)
         query_layer, key_layer = apply_rotary_fn(query_layer, key_layer, cos, sin, offset=offset)
         if use_cache:
             if layer_past != None:
                 past_key, past_value = layer_past
+                key_layer = torch.cat((past_key, key_layer[-1, ...].unsqueeze(0)), dim=0)
+                value_layer = torch.cat((past_value, value_layer[-1, ...].unsqueeze(0)), dim=0)
+            layer_past = key_layer, value_layer
         s, bz, head, dim = value_layer.shape
         s_key = key_layer.shape[0]
         s_query = query_layer.shape[0]
         query_layer = query_layer.reshape((s_query, bz, head, dim))
         key_layer = key_layer.reshape((s_key, bz, head, dim))
         if self.config.flash_attn:
             q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous() for x in
                        (query_layer, key_layer, value_layer)]
             context_layer = rearrange(context_layer, 'b s h d -> b s (h d)').contiguous()
         else:
             ##[sq, b, np, hn] -> [sq, b * np, hn]
+            query_layer = query_layer.reshape(s_query, bz * self.num_heads, dim)
             # [sk, b, np, hn] -> [sk, b * np, hn]
             key_layer = key_layer.reshape(s_key, bz * self.num_heads, dim)
+            matmul_result = self.inv_norm_factor * torch.einsum('bik,bkj->bij', query_layer.transpose(0, 1),
+                                                                key_layer.transpose(0, 1).transpose(1, 2))
             attention_scores = matmul_result.view(bz, self.num_heads, s_query, s_key)
             input_dtype = attention_scores.dtype
+            if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
                 attention_scores = attention_scores.to(torch.float)
             attn_weights = torch.masked_fill(attention_scores, attention_mask, torch.finfo(attention_scores.dtype).min)
             attention_probs = F.softmax(attn_weights, dim=-1).to(input_dtype)  ##dtype = torch.float32
             attention_probs = self.attention_dropout(attention_probs)
             attention_probs_reshaped = attention_probs.view(bz * self.num_heads, s_query, s_key)
+            value_layer = value_layer.reshape(s_key, bz * self.num_heads, dim)
             context_layer = torch.bmm(attention_probs_reshaped, value_layer.transpose(0, 1))
             context_layer = self._merge_heads(context_layer)
         return output_tensor, layer_past
 class TelechatMLP(nn.Module):
     def __init__(self, config: TelechatConfig):
         super().__init__()
 class TelechatBlock(nn.Module):
+    def __init__(self, config: TelechatConfig, layer_idx):
         super().__init__()
         hidden_size = config.hidden_size
         self.input_layernorm = MixedFusedRMSNorm(hidden_size, eps=config.layer_norm_epsilon)
         self.num_heads = config.n_head
         self.layer_idx = layer_idx
+        self.self_attention = TelechatAttention(config, layer_idx)
         self.post_attention_layernorm = MixedFusedRMSNorm(hidden_size, eps=config.layer_norm_epsilon)
         self.mlp = TelechatMLP(config)
         if self.config.embed_layernorm:
             self.word_embeddings_layernorm = MixedFusedRMSNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+        self.h = nn.ModuleList([TelechatBlock(config, _) for _ in range(config.num_hidden_layers)])
         self.ln_f = MixedFusedRMSNorm(self.embed_dim, eps=config.layer_norm_epsilon)
         self.gradient_checkpointing = False
         self.post_init()
     def get_input_embeddings(self):
         return self.word_embeddings
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         if input_ids is not None:
             batch_size, seq_length = input_ids.shape
         elif inputs_embeds is not None:
         if past_key_values is None:
             past_key_values = tuple([None] * len(self.h))
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
         hidden_states = inputs_embeds
 class TelechatForCausalLM(TelechatPreTrainedModel):
     # _tied_weights_keys = ["lm_head.weight"]
+    _keys_to_ignore_on_load_missing = [r"lm_head.weight"]
     def __init__(self, config: TelechatConfig):
         super().__init__(config)
         self.transformer = TelechatModel(config)
             hidden_states=transformer_outputs.hidden_states,
             attentions=transformer_outputs.attentions,
         )
+    def chat(self, tokenizer, question: str = '', history: Union[List[Dict], History] = None, stream: bool = False,
+             generation_config: Optional[GenerationConfig] = None, **kwargs):
+        """
+        Run one chat turn.
+
+        Args:
+            tokenizer:  the tokenizer of  telechat
+            question: question which the model reply in this turn
+            history: history which will format the input for telechat
+            stream: if return the full text at last or yield the text in token
+            generation_config:  configuration for generation
+            **kwargs: args which will update the generation config or pass to model forward
+
+        Returns:
+            stream=True: a TelechatIterTextStreamer fed by a background ``generate`` thread;
+            iterating it yields ``(text, history)`` tuples.
+            stream=False: the tuple ``(response, history)``.
+
+        Raises:
+            ValueError: when no generation config is available or the question is empty.
+        """
+        generation_config = generation_config or self.generation_config
+        if not generation_config:
+            logger.error("generation_config is None")
+            raise ValueError("generation_config must not be None")
+        if not question:
+            logger.error("question is empty")
+            raise ValueError("question must not be empty")
+        if history is None:
+            history = []
+        # we update and check generate_config here for building inputs.
+        # Work on a copy so per-call kwargs never leak into the model's own config.
+        generation_config = copy.deepcopy(generation_config)
+        user_id = generation_config.user_token_id
+        bot_id = generation_config.bot_token_id
+        # update() applies the kwargs it recognises and returns the rest for generate().
+        model_kwargs = generation_config.update(**kwargs)
+        generation_config.validate()
+        # transfer to History
+        if not isinstance(history, History):
+            history = History(tokenizer, history)
+        # Build the prompt before the question is appended: build_inputs_for_chat adds it itself.
+        inputs = self.build_inputs_for_chat(tokenizer, question, history, generation_config, user_id, bot_id)
+        history.append({"role": "user", "content": question})
+        if stream:
+            streamer = TelechatIterTextStreamer(tokenizer, history, skip_prompt=True)
+            Thread(target=self.generate, kwargs=dict(
+                inputs=inputs.to(self.device), streamer=streamer,
+                generation_config=generation_config, **model_kwargs
+            )).start()
+            return streamer
+        outputs = self.generate(inputs.to(self.device), generation_config=generation_config, **model_kwargs)
+        new_tokens = outputs[0][len(inputs[0]):]
+        # Strip the trailing token only when it really is EOS; generation that stops on the
+        # length limit ends with a normal token, which used to be dropped unconditionally.
+        eos = generation_config.eos_token_id
+        eos_ids = eos if isinstance(eos, (list, tuple)) else [eos]
+        if len(new_tokens) > 0 and int(new_tokens[-1]) in eos_ids:
+            new_tokens = new_tokens[:-1]
+        response = tokenizer.decode(new_tokens)
+        history.append({"role": "bot", "content": response})
+        return response, history
+    def build_inputs_for_chat(self, tokenizer, question, history, generation_config, usr_id, bot_id):
+        """
+        check history and  build inputs here
+
+        Layout: [history turns, oldest first] + usr_id + question tokens + bot_id.
+        The question is truncated from the left to fit the budget; history turns are then
+        added newest-first until the next one would no longer fit.
+
+        Returns:
+            int64 tensor of shape (1, n_tokens).
+
+        Raises:
+            ValueError: when the token budget is below 3 (two role tokens plus one question token).
+        """
+        # first tokenize question
+        q_token = tokenizer(question)
+        # work on a copy: history is consumed with pop() below
+        qa_history = copy.deepcopy(history)
+        # get the max length we should build our inputs in
+        model_max_length = self.config.seq_length
+        build_max_length = max(0, model_max_length - generation_config.max_new_tokens) \
+            if generation_config.max_new_tokens else max(0, generation_config.max_length)
+        if build_max_length < 3:
+            logger.warning("the model can not meet the  requirements of input length,Please check config")
+            raise ValueError(
+                "input budget of %d tokens is too small to build a prompt; "
+                "check max_new_tokens / max_length" % build_max_length
+            )
+        # trunc left: reserve two slots for the role tokens so the question part fits the
+        # budget (the previous slice kept build_max_length - 1 tokens and overshot by one)
+        question_ids = q_token["input_ids"][-(build_max_length - 2):]
+        input_tokens = [usr_id] + question_ids + [bot_id]
+        length = len(input_tokens)
+        while len(qa_history) != 0:
+            message = qa_history.pop()
+            if message["role"] == "user":
+                tokens = [usr_id] + message["input_ids"]
+            elif message["role"] == "bot":
+                tokens = [bot_id] + message["input_ids"] + [generation_config.eos_token_id]
+            else:
+                # unknown roles contribute no tokens
+                tokens = []
+            if len(tokens) + length >= build_max_length:
+                break
+            input_tokens = tokens + input_tokens
+            # track the running total; it used to stay at the question length, so any
+            # amount of history could be prepended past the budget
+            length += len(tokens)
+        return torch.tensor([input_tokens], dtype=torch.int64)

tokenization_telechat.py ADDED Viewed

	@@ -0,0 +1,220 @@

+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+import sentencepiece as spm
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+# TODO: when we get download url from huggingface, refresh the map
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {},
+    "tokenizer_file": {},
+}
+class TelechatTokenizer(PreTrainedTokenizer):
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<_start>",
+        eos_token="<_end>",
+        pad_token="<_pad>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        return self.sp_model.encode(text, out_type=str)
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+        return (out_vocab_file,)
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = bos_token_id + token_ids_0 + eos_token_id
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+        return output
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+        return output

tokenizer_config.json CHANGED Viewed

@@ -1,9 +1,9 @@
 {
-  "name_or_path": "ChinaTelecom/telechat3-7b",
   "tokenizer_class": "TelechatTokenizer",
   "auto_map": {
     "AutoTokenizer": [
-      "tokenization_telechat3.TelechatTokenizer",
       null
     ]
   },

 {
+  "name_or_path": "ChinaTelecom/telechat-12b",
   "tokenizer_class": "TelechatTokenizer",
   "auto_map": {
     "AutoTokenizer": [
+      "tokenization_telechat.TelechatTokenizer",
       null
     ]
   },