Add GLM-4-9B-Chat #1724

Merged (19 commits) on Jun 21, 2024
1 change: 1 addition & 0 deletions README.md
@@ -115,6 +115,7 @@ For detailed inference benchmarks in more devices and more settings, please refe
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
<li>ChatGLM2 (6B)</li>
<li>GLM4 (9B)</li>
<li>Falcon (7B - 180B)</li>
<li>YI (6B-34B)</li>
<li>Mistral (7B)</li>
1 change: 1 addition & 0 deletions README_zh-CN.md
@@ -116,6 +116,7 @@ LMDeploy TurboMind 引擎拥有卓越的推理能力，在各种规模的模型
<li>Baichuan2 (7B-13B)</li>
<li>Code Llama (7B - 34B)</li>
<li>ChatGLM2 (6B)</li>
<li>GLM4 (9B)</li>
<li>Falcon (7B - 180B)</li>
<li>YI (6B-34B)</li>
<li>Mistral (7B)</li>
1 change: 1 addition & 0 deletions docs/en/supported_models/supported_models.md
@@ -24,6 +24,7 @@
| InternVL-Chat | v1.1- v1.5 | Yes | Yes | Yes | Yes |
| MiniCPM | Llama3-V-2_5 | Yes | Yes | Yes | Yes |
| MiniGeminiLlama | 7B | Yes | No | No | Yes |
| GLM4 | 9B | Yes | Yes | Yes | No |

"-" means not verified yet.

1 change: 1 addition & 0 deletions docs/zh_cn/supported_models/supported_models.md
@@ -24,6 +24,7 @@
| InternVL-Chat | v1.1- v1.5 | Yes | Yes | Yes | Yes |
| MiniCPM | Llama3-V-2_5 | Yes | Yes | Yes | Yes |
| MiniGeminiLlama | 7B | Yes | No | No | Yes |
| GLM4 | 9B | Yes | Yes | Yes | No |

“-” 表示还没有验证。

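As background for these table entries, a minimal usage sketch of the newly supported model through LMDeploy's pipeline API (not part of this diff; the Hugging Face model id and the prompt are illustrative):

from lmdeploy import pipeline

# Load GLM-4-9B-Chat; the chat template and tokenizer handling added in this
# PR are picked up automatically from the model path.
pipe = pipeline('THUDM/glm-4-9b-chat')
responses = pipe(['Hello, please introduce yourself briefly.'])
print(responses[0].text)
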
73 changes: 72 additions & 1 deletion lmdeploy/model.py
@@ -950,7 +950,8 @@ def match(cls, model_path: str) -> Optional[str]:
Args:
model_path (str): the model path used for matching.
"""
if 'chatglm' in model_path.lower():
path = model_path.lower()
if 'chatglm' in path and 'chatglm3' not in path:
return 'chatglm'


@@ -1458,6 +1459,76 @@ def match(cls, model_path: str) -> Optional[str]:
return 'phi-3'


@MODELS.register_module(name='glm4')
@MODELS.register_module(name='chatglm3')
class Glm4Chat(BaseChatTemplate):
"""Chat template of InternLM model."""

def __init__(self,
system='<|system|>\n',
meta_instruction=None,
eosys='',
user='<|user|>\n',
eoh='',
assistant='<|assistant|>\n',
eoa='',
separator='',
stop_words=['<|user|>', '<|endoftext|>', '<|observation|>'],
**kwargs):
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
self.start = '[gMASK]<sop>'

def get_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.

Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
prompt = super(Glm4Chat, self).get_prompt(prompt, sequence_start)
if sequence_start:
prompt = self.start + prompt
return prompt

def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.

Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
return self.start + super(Glm4Chat, self).messages2prompt(
messages, sequence_start)

@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.

Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'glm-4' in path or 'chatglm3' in path:
return 'glm4'


def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.

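A small sketch of what the Glm4Chat template above produces (a rough illustration; exact whitespace may differ, and the prompt text is made up):

from lmdeploy.model import MODELS

# Instantiate the chat template registered above under the name 'glm4'.
glm4 = MODELS.get('glm4')()

# First-turn prompts are prefixed with '[gMASK]<sop>' by get_prompt.
print(glm4.get_prompt('What is the capital of France?'))
# -> roughly: [gMASK]<sop><|user|>\nWhat is the capital of France?<|assistant|>\n

# The new match() maps both glm-4 and chatglm3 checkpoints to this template.
assert glm4.match('THUDM/glm-4-9b-chat') == 'glm4'
assert glm4.match('THUDM/chatglm3-6b') == 'glm4'
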
78 changes: 63 additions & 15 deletions lmdeploy/tokenizer.py
@@ -235,7 +235,7 @@ def vocab_size_with_added(self):

@property
def bos_token_id(self):
"""begine of the sentence token id."""
"""begin of the sentence token id."""
return self.model.bos_token_id

@property
@@ -290,14 +290,12 @@ def indexes_containing_token(self, token: str):
if self.token2id == {}:
# decode is slower than convert_ids_to_tokens
if self.maybe_decode_bytes:
try:
self.token2id = {
self.model.decode(i): i
for i in range(self.vocab_size)
}
except Exception as e:
# qwen-vl
assert str(e) == 'Unclosed image token'
Review comment (Collaborator): @irexyc Does this influence VLMs?

for i in range(self.vocab_size):
try:
self.token2id[self.model.decode(i)] = i
except: # noqa: E722
# some tokens just can't be decoded by `decode`
pass
else:
self.token2id = {
self.model.convert_ids_to_tokens(i): i
@@ -324,15 +322,25 @@ def indexes_containing_token(self, token: str):
self._indexes_tokens_deque.append((token, indexes))
return indexes

def encode(self, s: str, add_bos: bool = True, **kwargs):
def encode(self,
s: str,
add_bos: bool = True,
add_special_tokens: bool = True,
**kwargs):
"""Tokenize a prompt.

Args:
s (str): a prompt
add_bos (bool): Whether to add `bos` token id when encoding
the prompt
add_special_tokens (bool): Whether or not to add special tokens
when encoding the prompt
Returns:
list[int]: token ids
"""
encoded = self.model.encode(s, **kwargs)
encoded = self.model.encode(s,
add_special_tokens=add_special_tokens,
**kwargs)
if not add_bos:
# in the middle of a session
if len(encoded) and encoded[0] == self.bos_token_id:
@@ -349,6 +357,8 @@ def decode(self,
t (List[int]): a list of token ids
offset (int): for incrementally decoding. Default to None, which
means not applied.
skip_special_tokens (bool): Whether or not to remove special
tokens in the decoding.
Returns:
str: text of decoding tokens
"""
@@ -479,6 +489,26 @@ def __call__(self, s: Union[str, Sequence[str]]):
return self.model(s, add_special_tokens=add_special_tokens)


class ChatGLM4Tokenizer(HuggingFaceTokenizer):
"""tokenizer of GLM4."""

def __init__(self, model_path):
super(ChatGLM4Tokenizer, self).__init__(model_path)

def encode(self,
s: str,
add_bos: bool = True,
add_special_tokens: bool = True,
**kwargs):
"""tokenize a prompt."""
# ChatGLM4Tokenizer hardcodes `add_special_tokens=False` when tokenizing
# a prompt. Refer to https://huggingface.co/THUDM/glm-4-9b-chat/blob/main/tokenization_chatglm.py#L227 # noqa E501
return super(ChatGLM4Tokenizer, self).encode(s,
add_bos,
add_special_tokens=False,
**kwargs)


class Tokenizer:
"""Tokenize prompts or de-tokenize tokens into texts.

@@ -501,7 +531,15 @@ def __init__(self, model_file: str):
if not use_hf_model:
self.model = SentencePieceTokenizer(model_file)
else:
self.model = HuggingFaceTokenizer(model_folder)
from transformers.models.auto.tokenization_auto import \
get_tokenizer_config
tokenizer_config = get_tokenizer_config(model_folder,
trust_remote_code=True)
config_tokenizer_class = tokenizer_config.get('tokenizer_class')
if config_tokenizer_class == 'ChatGLM4Tokenizer':
self.model = ChatGLM4Tokenizer(model_folder)
else:
self.model = HuggingFaceTokenizer(model_folder)

@property
def vocab_size(self):
@@ -510,23 +548,31 @@ def vocab_size(self):

@property
def bos_token_id(self):
"""begine of the sentence token id."""
"""begin of the sentence token id."""
return self.model.bos_token_id

@property
def eos_token_id(self):
"""end of the sentence token id."""
return self.model.eos_token_id

def encode(self, s: str, add_bos: bool = True, **kwargs):
def encode(self,
s: str,
add_bos: bool = True,
add_special_tokens: bool = True,
**kwargs):
"""Tokenize a prompt.

Args:
s (str): a prompt
add_bos (bool): Whether to add `bos` token id when encoding
the prompt
add_special_tokens (bool): Whether or not to add special tokens
when encoding the prompt
Returns:
list[int]: token ids
"""
return self.model.encode(s, add_bos, **kwargs)
return self.model.encode(s, add_bos, add_special_tokens, **kwargs)

def decode(
self,
@@ -540,6 +586,8 @@ def decode(
t (List[int]): a list of token ids
offset (int): for incrementally decoding. Default to None, which
means not applied.
skip_special_tokens (bool): Whether or not to remove special
tokens in the decoding.
Returns:
str: text of decoding tokens
"""
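
For reference, a sketch of how the reworked Tokenizer dispatch and the new add_special_tokens / skip_special_tokens arguments are meant to be used (a sketch only; the checkpoint path below is illustrative):

from lmdeploy.tokenizer import Tokenizer

# The constructor now reads tokenizer_config.json and, when tokenizer_class is
# 'ChatGLM4Tokenizer', uses the wrapper that forces add_special_tokens=False
# to mirror the upstream ChatGLM4Tokenizer behavior.
tok = Tokenizer('/path/to/glm-4-9b-chat')

# Keyword arguments exposed by this PR.
ids = tok.encode('Hello', add_bos=False, add_special_tokens=False)
text = tok.decode(ids, skip_special_tokens=True)
print(ids, text)
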
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/__init__.py
@@ -3,6 +3,7 @@
from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
from .deepseek_vl import DeepSeekVLModel # noqa: F401
from .deepseek_vl_awq import DeepSeekVLAwqModel # noqa: F401
from .glm4 import Glm4Model # noqa: F401
from .internlm2 import InternLM2AwqModel, InternLM2Model # noqa: F401
from .internvl import InternVLModel # noqa: F401
from .llama import LlamaModel # noqa: F401