support qwen2 1.5b #1782

Merged (9 commits) on Jun 17, 2024
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/deepseek_vl.py
@@ -16,6 +16,7 @@ class DeepSeekVLReader(LlamaReader):

def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
+ model_cfg = model_cfg['language_config']
super().__init__(new_params, unused_params, last_bin, model_cfg)

def init_layer_id(self):
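Why this one-liner: DeepSeek-VL's HuggingFace config nests the language-model settings under 'language_config' (InternVL does the same with 'llm_config' below), so the reader unwraps that sub-dict before handing it to LlamaReader. A minimal sketch of the pattern; the config values are made up for illustration:

```python
# Illustrative only: a made-up VLM config dict, not copied from a real checkpoint.
vl_model_cfg = {
    'model_type': 'multi_modality',
    'vision_config': {'image_size': 384},
    'language_config': {
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'tie_word_embeddings': False,
    },
}

# What the added line in DeepSeekVLReader.__init__ effectively does:
llm_cfg = vl_model_cfg['language_config']
print(llm_cfg['num_key_value_heads'])  # 32
```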
8 changes: 4 additions & 4 deletions lmdeploy/turbomind/deploy/source_model/internlm2.py
@@ -21,8 +21,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[
f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}']
qkv = qkv.view(kv_head_num, gs + 2, 128, -1)
@@ -100,8 +100,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str):
"""Get q, k, v, o qweight for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[
f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}']
hidden_dim = qkv.shape[0]
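Renaming the keys lets the reader take head counts straight from the HuggingFace config (num_attention_heads, num_key_value_heads) instead of the converter-specific attn_head_num/kv_head_num. As a rough sketch of what the surrounding view(kv_head_num, gs + 2, 128, -1) accomplishes, here is how an interleaved InternLM2-style wqkv tensor can be regrouped per KV head; the head counts are hypothetical and the 128 mirrors the hard-coded head dimension above:

```python
import torch

# Hypothetical head counts for illustration; 128 mirrors the hard-coded
# head dimension in the reader above.
num_attention_heads = 32
num_key_value_heads = 8
head_dim = 128
hidden_size = num_attention_heads * head_dim

gs = num_attention_heads // num_key_value_heads  # query heads per KV group

# InternLM2 stores wqkv interleaved per KV group: gs query heads,
# then one key head, then one value head.
wqkv = torch.randn(num_key_value_heads * (gs + 2) * head_dim, hidden_size)
qkv = wqkv.view(num_key_value_heads, gs + 2, head_dim, hidden_size)

q = qkv[:, :gs].reshape(-1, hidden_size)     # (num_attention_heads * 128, hidden)
k = qkv[:, gs].reshape(-1, hidden_size)      # (num_key_value_heads * 128, hidden)
v = qkv[:, gs + 1].reshape(-1, hidden_size)  # (num_key_value_heads * 128, hidden)
print(q.shape, k.shape, v.shape)
```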
10 changes: 10 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/internvl.py
@@ -27,6 +27,11 @@ class InternVL2Reader(InternLM2Reader):
norm_weight_key = 'language_model.model.norm.weight'
output_weight_key = 'language_model.output.weight'

+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+              model_cfg: dict):
+     model_cfg = model_cfg.get('llm_config')
+     super().__init__(new_params, unused_params, last_bin, model_cfg)


@INPUT_MODELS.register_module(name='internvl')
class InternVLModel(LlamaModel):
@@ -94,6 +99,11 @@ class InternVL2AwqReader(InternLM2AwqReader):
norm_weight_key = 'language_model.model.norm.weight'
output_weight_key = 'language_model.output.weight'

+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+              model_cfg: dict):
+     model_cfg = model_cfg.get('llm_config')
+     super().__init__(new_params, unused_params, last_bin, model_cfg)


@INPUT_MODELS.register_module(name='internvl-awq')
class InternVLAwqModel(InternVLModel):
8 changes: 7 additions & 1 deletion lmdeploy/turbomind/deploy/source_model/llama.py
@@ -7,6 +7,7 @@
import torch
from safetensors.torch import load_file

+ from lmdeploy.archs import get_model_arch
from lmdeploy.tokenizer import Tokenizer

from .base import INPUT_MODELS, BaseInputModel, BaseReader
@@ -28,6 +29,9 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
self.params.update(new_params)
self.last_bin = last_bin
self.model_cfg = model_cfg
+ tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False)
+ if tie_word_embeddings:
+     self.output_weight_key = self.tok_embeddings_key
self.init_layer_id()

def init_layer_id(self):
@@ -132,6 +136,8 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
ckpt_path = model_path
self.ckpt_path = ckpt_path
self.ckpt_files = self.get_ckpt()
+ _, self.model_config = get_model_arch(model_path)
+ self.model_config = self.model_config.to_dict()

def get_ckpt(self):
"""Get weight files."""
@@ -164,7 +170,7 @@ def get_mgrs(self):
else:
new_params = load_file(osp.join(self.ckpt_path, ckpt))
ret = self.Reader(new_params, unused_params,
-                   i == self.nmgrs - 1, self.model_info())
+                   i == self.nmgrs - 1, self.model_config)
Comment on lines 172 to +173 (Collaborator):

This will affect many models, like internlm2, internvl

Reply from @lvhan028 (Collaborator, Author), Jun 14, 2024:

Yes, I understand. However, I believe it is necessary to ensure that the original model configuration is accessible to all source models. Otherwise, the model_info() function should be capable of handling all edge cases.

yield ret
ret.clean_up(is_last_bin)
except GeneratorExit:
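Taken together, the llama.py changes hand every reader the raw HuggingFace config: get_model_arch loads the checkpoint's config object, to_dict() flattens it, and LlamaReader then reuses the token-embedding weight as the output head whenever tie_word_embeddings is set, which is what lets a tied model such as Qwen2-1.5B convert cleanly. A hedged sketch of that flow; the model path is a placeholder and StubReader only mirrors the tie handling, it is not the real LlamaReader:

```python
from lmdeploy.archs import get_model_arch


class StubReader:
    """Stand-in for LlamaReader; the key names here are illustrative."""

    tok_embeddings_key = 'model.embed_tokens.weight'
    output_weight_key = 'lm_head.weight'

    def __init__(self, model_cfg: dict):
        self.model_cfg = model_cfg
        if self.model_cfg.get('tie_word_embeddings', False):
            # Tied checkpoints ship no separate lm_head tensor, so point the
            # output key at the embedding weight instead.
            self.output_weight_key = self.tok_embeddings_key


# Placeholder path, not a real checkpoint. Per the diff above, get_model_arch
# returns a 2-tuple whose second element is the HF config object.
_, hf_config = get_model_arch('/path/to/Qwen2-1.5B-Instruct')
reader = StubReader(hf_config.to_dict())
print(reader.output_weight_key)  # embedding key when weights are tied
```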
4 changes: 2 additions & 2 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2.py
@@ -22,8 +22,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}']
qkv = qkv.view(kv_head_num, gs + 2, 128, -1)
hidden_dim = qkv.shape[-1]
4 changes: 2 additions & 2 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py
@@ -15,8 +15,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str):
"""Get q, k, v, o qweight for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}']
hidden_dim = qkv.shape[0]
qkv = qkv.view(hidden_dim, kv_head_num, gs + 2, -1)
6 changes: 3 additions & 3 deletions tests/test_lmdeploy/test_turbomind/test_converter.py
@@ -57,18 +57,18 @@ def test_update_from_engine_config():
config = copy.deepcopy(_config)
config.update_from_engine_config(TurbomindEngineConfig())
assert config.tensor_para_size == 1
- assert config.session_len == 32776
+ assert config.session_len == 65544
assert config.max_batch_size == 128
assert config.cache_max_entry_count == 0.8
assert config.quant_policy == 0
- assert config.max_prefill_iters == 5
+ assert config.max_prefill_iters == 9
assert config.num_tokens_per_iter == 8192

config = copy.deepcopy(_config)
config.update_from_engine_config(
TurbomindEngineConfig(max_prefill_token_num=2048,
num_tokens_per_iter=0))
- assert config.max_prefill_iters == 17
+ assert config.max_prefill_iters == 33
assert config.num_tokens_per_iter == 2048

config = copy.deepcopy(_config)
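The new expected values line up with internlm2-chat-7b's context window doubling from 32k to 64k: session_len becomes 65544, and the prefill iteration count looks like ceil(session_len / num_tokens_per_iter). A quick check of that arithmetic; the ceil relation is inferred from the numbers here, not quoted from the engine code:

```python
import math

session_len = 65544  # new expected value in the test above

# Assumed relation: max_prefill_iters = ceil(session_len / num_tokens_per_iter)
print(math.ceil(session_len / 8192))  # 9  (default num_tokens_per_iter)
print(math.ceil(session_len / 2048))  # 33 (max_prefill_token_num=2048 case)
```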
2 changes: 1 addition & 1 deletion tests/test_lmdeploy/test_utils.py
@@ -8,7 +8,7 @@ def test_get_and_verify_max_len():
# with PretrainedConfig
config = AutoConfig.from_pretrained('internlm/internlm2-chat-7b',
trust_remote_code=True)
- assert (_get_and_verify_max_len(config, None) == 32768)
+ assert (_get_and_verify_max_len(config, None) == 65536)
assert (_get_and_verify_max_len(config, 1024) == 1024)
assert (_get_and_verify_max_len(config, 102400) == 102400)
