Add GLM-4-9B-Chat #1724

Merged (19 commits) on Jun 21, 2024
1 change: 1 addition & 0 deletions README.md
@@ -115,6 +115,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
<li>ChatGLM2 (6B)</li>
<li>GLM4 (9B)</li>
<li>Falcon (7B - 180B)</li>
<li>YI (6B-34B)</li>
<li>Mistral (7B)</li>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -116,6 +116,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力，在各种规模的模型
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
<li>ChatGLM2 (6B)</li>
<li>GLM4 (9B)</li>
<li>Falcon (7B - 180B)</li>
<li>YI (6B-34B)</li>
<li>Mistral (7B)</li>
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -24,6 +24,7 @@
| InternVL-Chat | v1.1- v1.5 | Yes | Yes | Yes | Yes |
| MiniCPM | Llama3-V-2_5 | Yes | Yes | Yes | Yes |
| MiniGeminiLlama | 7B | Yes | No | No | Yes |
| GLM4 | 9B | Yes | Yes | Yes | No |

"-" means not verified yet.

1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -24,6 +24,7 @@
| InternVL-Chat | v1.1- v1.5 | Yes | Yes | Yes | Yes |
| MiniCPM | Llama3-V-2_5 | Yes | Yes | Yes | Yes |
| MiniGeminiLlama | 7B | Yes | No | No | Yes |
| GLM4 | 9B | Yes | Yes | Yes | No |

“-” 表示还没有验证。

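As background for these table entries, a minimal usage sketch of the newly supported model through LMDeploy's pipeline API (not part of this diff; the Hugging Face model id and the prompt are illustrative):

from lmdeploy import pipeline

# Load GLM-4-9B-Chat; the chat template and tokenizer handling added in this
# PR are picked up automatically from the model path.
pipe = pipeline('THUDM/glm-4-9b-chat')
responses = pipe(['Hello, please introduce yourself briefly.'])
print(responses[0].text)
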
73 changes: 72 additions & 1 deletion lmdeploy/model.py
@@ -950,7 +950,8 @@ def match(cls, model_path: str) -> Optional[str]:
Args:
model_path (str): the model path used for matching.
"""
if 'chatglm' in model_path.lower():
path = model_path.lower()
if 'chatglm' in path and 'chatglm3' not in path:
return 'chatglm'


@@ -1458,6 +1459,76 @@ def match(cls, model_path: str) -> Optional[str]:
return 'phi-3'


@MODELS.register_module(name='glm4')
@MODELS.register_module(name='chatglm3')
class Glm4Chat(BaseChatTemplate):
"""Chat template of InternLM model."""

def __init__(self,
system='<|system|>\n',
meta_instruction=None,
eosys='',
user='<|user|>\n',
eoh='',
assistant='<|assistant|>\n',
eoa='',
separator='',
stop_words=['<|user|>', '<|endoftext|>', '<|observation|>'],
**kwargs):
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
self.start = '[gMASK]<sop>'

def get_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.

Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
prompt = super(Glm4Chat, self).get_prompt(prompt, sequence_start)
if sequence_start:
prompt = self.start + prompt
return prompt

def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.

Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
return self.start + super(Glm4Chat, self).messages2prompt(
messages, sequence_start)

@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.

Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'glm-4' in path or 'chatglm3' in path:
return 'glm4'


def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.

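A small sketch of what the Glm4Chat template above produces (a rough illustration; exact whitespace may differ, and the prompt text is made up):

from lmdeploy.model import MODELS

# Instantiate the chat template registered above under the name 'glm4'.
glm4 = MODELS.get('glm4')()

# First-turn prompts are prefixed with '[gMASK]<sop>' by get_prompt.
print(glm4.get_prompt('What is the capital of France?'))
# -> roughly: [gMASK]<sop><|user|>\nWhat is the capital of France?<|assistant|>\n

# The new match() maps both glm-4 and chatglm3 checkpoints to this template.
assert glm4.match('THUDM/glm-4-9b-chat') == 'glm4'
assert glm4.match('THUDM/chatglm3-6b') == 'glm4'
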
78 changes: 63 additions & 15 deletions lmdeploy/tokenizer.py
@@ -235,7 +235,7 @@ def vocab_size_with_added(self):

@property
def bos_token_id(self):
"""begine of the sentence token id."""
"""begin of the sentence token id."""
return self.model.bos_token_id

@property
@@ -290,14 +290,12 @@ def indexes_containing_token(self, token: str):
if self.token2id == {}:
# decode is slower than convert_ids_to_tokens
if self.maybe_decode_bytes:
try:
self.token2id = {
self.model.decode(i): i
for i in range(self.vocab_size)
}
except Exception as e:
# qwen-vl
assert str(e) == 'Unclosed image token'
Review comment (Collaborator): @irexyc Does this influence VLMs?

for i in range(self.vocab_size):
try:
self.token2id[self.model.decode(i)] = i
except: # noqa: E722
# some tokens just can't be decoded by `decode`
pass
else:
self.token2id = {
self.model.convert_ids_to_tokens(i): i
@@ -324,15 +322,25 @@ def indexes_containing_token(self, token: str):
self._indexes_tokens_deque.append((token, indexes))
return indexes

def encode(self, s: str, add_bos: bool = True, **kwargs):
def encode(self,
s: str,
add_bos: bool = True,
add_special_tokens: bool = True,
**kwargs):
"""Tokenize a prompt.

Args:
s (str): a prompt
add_bos (bool): Whether to add `bos` token id when encoding
the prompt
add_special_tokens (bool): Whether or not to add special tokens
when encoding the prompt
Returns:
list[int]: token ids
"""
encoded = self.model.encode(s, **kwargs)
encoded = self.model.encode(s,
add_special_tokens=add_special_tokens,
**kwargs)
if not add_bos:
# in the middle of a session
if len(encoded) and encoded[0] == self.bos_token_id:
@@ -349,6 +357,8 @@ def decode(self,
t (List[int]): a list of token ids
offset (int): for incrementally decoding. Default to None, which
means not applied.
skip_special_tokens (bool): Whether or not to remove special
tokens in the decoding.
Returns:
str: text of decoding tokens
"""
@@ -479,6 +489,26 @@ def __call__(self, s: Union[str, Sequence[str]]):
return self.model(s, add_special_tokens=add_special_tokens)


class ChatGLM4Tokenizer(HuggingFaceTokenizer):
"""tokenizer of GLM4."""

def __init__(self, model_path):
super(ChatGLM4Tokenizer, self).__init__(model_path)

def encode(self,
s: str,
add_bos: bool = True,
add_special_tokens: bool = True,
**kwargs):
"""tokenize a prompt."""
# ChatGLM4Tokenizer hardcodes `add_special_tokens=False` when tokenizing
# a prompt. Refer to https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/tokenization_chatglm.py#L227 # noqa E501
return super(ChatGLM4Tokenizer, self).encode(s,
add_bos,
add_special_tokens=False,
**kwargs)


class Tokenizer:
"""Tokenize prompts or de-tokenize tokens into texts.

@@ -501,7 +531,15 @@ def __init__(self, model_file: str):
if not use_hf_model:
self.model = SentencePieceTokenizer(model_file)
else:
self.model = HuggingFaceTokenizer(model_folder)
from transformers.models.auto.tokenization_auto import \
get_tokenizer_config
tokenizer_config = get_tokenizer_config(model_folder,
trust_remote_code=True)
config_tokenizer_class = tokenizer_config.get('tokenizer_class')
if config_tokenizer_class == 'ChatGLM4Tokenizer':
self.model = ChatGLM4Tokenizer(model_folder)
else:
self.model = HuggingFaceTokenizer(model_folder)

@property
def vocab_size(self):
@@ -510,23 +548,31 @@ def vocab_size(self):

@property
def bos_token_id(self):
"""begine of the sentence token id."""
"""begin of the sentence token id."""
return self.model.bos_token_id

@property
def eos_token_id(self):
"""end of the sentence token id."""
return self.model.eos_token_id

def encode(self, s: str, add_bos: bool = True, **kwargs):
def encode(self,
s: str,
add_bos: bool = True,
add_special_tokens: bool = True,
**kwargs):
"""Tokenize a prompt.

Args:
s (str): a prompt
add_bos (bool): Whether to add `bos` token id when encoding
the prompt
add_special_tokens (bool): Whether or not to add special tokens
when encoding the prompt
Returns:
list[int]: token ids
"""
return self.model.encode(s, add_bos, **kwargs)
return self.model.encode(s, add_bos, add_special_tokens, **kwargs)

def decode(
self,
@@ -540,6 +586,8 @@ def decode(
t (List[int]): a list of token ids
offset (int): for incrementally decoding. Default to None, which
means not applied.
skip_special_tokens (bool): Whether or not to remove special
tokens in the decoding.
Returns:
str: text of decoding tokens
"""
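
For reference, a sketch of how the reworked Tokenizer dispatch and the new add_special_tokens / skip_special_tokens arguments are meant to be used (a sketch only; the checkpoint path below is illustrative):

from lmdeploy.tokenizer import Tokenizer

# The constructor now reads tokenizer_config.json and, when tokenizer_class is
# 'ChatGLM4Tokenizer', uses the wrapper that forces add_special_tokens=False
# to mirror the upstream ChatGLM4Tokenizer behavior.
tok = Tokenizer('/path/to/glm-4-9b-chat')

# Keyword arguments exposed by this PR.
ids = tok.encode('Hello', add_bos=False, add_special_tokens=False)
text = tok.decode(ids, skip_special_tokens=True)
print(ids, text)
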
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -3,6 +3,7 @@
from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
from .deepseek_vl import DeepSeekVLModel # noqa: F401
from .deepseek_vl_awq import DeepSeekVLAwqModel # noqa: F401
from .glm4 import Glm4Model # noqa: F401
from .internlm2 import InternLM2AwqModel, InternLM2Model # noqa: F401
from .internvl import InternVLModel # noqa: F401
from .llama import LlamaModel # noqa: F401