support qwen2 1.5b #1782

Merged (9 commits) on Jun 17, 2024
1 change: 1 addition & 0 deletions lmdeploy/turbomind/deploy/source_model/deepseek_vl.py
@@ -16,6 +16,7 @@ class DeepSeekVLReader(LlamaReader):

def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
+ model_cfg = model_cfg['language_config']
super().__init__(new_params, unused_params, last_bin, model_cfg)

def init_layer_id(self):
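Why this one-liner: DeepSeek-VL's HuggingFace config nests the language-model settings under 'language_config' (InternVL does the same with 'llm_config' below), so the reader unwraps that sub-dict before handing it to LlamaReader. A minimal sketch of the pattern; the config values are made up for illustration:

```python
# Illustrative only: a made-up VLM config dict, not copied from a real checkpoint.
vl_model_cfg = {
    'model_type': 'multi_modality',
    'vision_config': {'image_size': 384},
    'language_config': {
        'num_attention_heads': 32,
        'num_key_value_heads': 32,
        'tie_word_embeddings': False,
    },
}

# What the added line in DeepSeekVLReader.__init__ effectively does:
llm_cfg = vl_model_cfg['language_config']
print(llm_cfg['num_key_value_heads'])  # 32
```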
8 changes: 4 additions & 4 deletions lmdeploy/turbomind/deploy/source_model/internlm2.py
@@ -21,8 +21,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[
f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}']
qkv = qkv.view(kv_head_num, gs + 2, 128, -1)
@@ -100,8 +100,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str):
"""Get q, k, v, o qweight for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[
f'{self.attn_layer_prefix}.{i}.attention.wqkv.{kind}']
hidden_dim = qkv.shape[0]
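Renaming the keys lets the reader take head counts straight from the HuggingFace config (num_attention_heads, num_key_value_heads) instead of the converter-specific attn_head_num/kv_head_num. As a rough sketch of what the surrounding view(kv_head_num, gs + 2, 128, -1) accomplishes, here is how an interleaved InternLM2-style wqkv tensor can be regrouped per KV head; the head counts are hypothetical and the 128 mirrors the hard-coded head dimension above:

```python
import torch

# Hypothetical head counts for illustration; 128 mirrors the hard-coded
# head dimension in the reader above.
num_attention_heads = 32
num_key_value_heads = 8
head_dim = 128
hidden_size = num_attention_heads * head_dim

gs = num_attention_heads // num_key_value_heads  # query heads per KV group

# InternLM2 stores wqkv interleaved per KV group: gs query heads,
# then one key head, then one value head.
wqkv = torch.randn(num_key_value_heads * (gs + 2) * head_dim, hidden_size)
qkv = wqkv.view(num_key_value_heads, gs + 2, head_dim, hidden_size)

q = qkv[:, :gs].reshape(-1, hidden_size)     # (num_attention_heads * 128, hidden)
k = qkv[:, gs].reshape(-1, hidden_size)      # (num_key_value_heads * 128, hidden)
v = qkv[:, gs + 1].reshape(-1, hidden_size)  # (num_key_value_heads * 128, hidden)
print(q.shape, k.shape, v.shape)
```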
10 changes: 10 additions & 0 deletions lmdeploy/turbomind/deploy/source_model/internvl.py
@@ -27,6 +27,11 @@ class InternVL2Reader(InternLM2Reader):
norm_weight_key = 'language_model.model.norm.weight'
output_weight_key = 'language_model.output.weight'

+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+              model_cfg: dict):
+     model_cfg = model_cfg.get('llm_config')
+     super().__init__(new_params, unused_params, last_bin, model_cfg)


@INPUT_MODELS.register_module(name='internvl')
class InternVLModel(LlamaModel):
@@ -94,6 +99,11 @@ class InternVL2AwqReader(InternLM2AwqReader):
norm_weight_key = 'language_model.model.norm.weight'
output_weight_key = 'language_model.output.weight'

+ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
+              model_cfg: dict):
+     model_cfg = model_cfg.get('llm_config')
+     super().__init__(new_params, unused_params, last_bin, model_cfg)


@INPUT_MODELS.register_module(name='internvl-awq')
class InternVLAwqModel(InternVLModel):
8 changes: 7 additions & 1 deletion lmdeploy/turbomind/deploy/source_model/llama.py
@@ -7,6 +7,7 @@
import torch
from safetensors.torch import load_file

+ from lmdeploy.archs import get_model_arch
from lmdeploy.tokenizer import Tokenizer

from .base import INPUT_MODELS, BaseInputModel, BaseReader
@@ -28,6 +29,9 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
self.params.update(new_params)
self.last_bin = last_bin
self.model_cfg = model_cfg
+ tie_word_embeddings = self.model_cfg.get('tie_word_embeddings', False)
+ if tie_word_embeddings:
+     self.output_weight_key = self.tok_embeddings_key
self.init_layer_id()

def init_layer_id(self):
@@ -132,6 +136,8 @@ def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
ckpt_path = model_path
self.ckpt_path = ckpt_path
self.ckpt_files = self.get_ckpt()
+ _, self.model_config = get_model_arch(model_path)
+ self.model_config = self.model_config.to_dict()

def get_ckpt(self):
"""Get weight files."""
@@ -164,7 +170,7 @@ def get_mgrs(self):
else:
new_params = load_file(osp.join(self.ckpt_path, ckpt))
ret = self.Reader(new_params, unused_params,
-                   i == self.nmgrs - 1, self.model_info())
+                   i == self.nmgrs - 1, self.model_config)
Comment on lines 172 to +173 (Collaborator):

This will affect many models, like internlm2, internvl

Reply from @lvhan028 (Collaborator, Author), Jun 14, 2024:

Yes, I understand. However, I believe it is necessary to ensure that the original model configuration is accessible to all source models. Otherwise, the model_info() function should be capable of handling all edge cases.

yield ret
ret.clean_up(is_last_bin)
except GeneratorExit:
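Taken together, the llama.py changes hand every reader the raw HuggingFace config: get_model_arch loads the checkpoint's config object, to_dict() flattens it, and LlamaReader then reuses the token-embedding weight as the output head whenever tie_word_embeddings is set, which is what lets a tied model such as Qwen2-1.5B convert cleanly. A hedged sketch of that flow; the model path is a placeholder and StubReader only mirrors the tie handling, it is not the real LlamaReader:

```python
from lmdeploy.archs import get_model_arch


class StubReader:
    """Stand-in for LlamaReader; the key names here are illustrative."""

    tok_embeddings_key = 'model.embed_tokens.weight'
    output_weight_key = 'lm_head.weight'

    def __init__(self, model_cfg: dict):
        self.model_cfg = model_cfg
        if self.model_cfg.get('tie_word_embeddings', False):
            # Tied checkpoints ship no separate lm_head tensor, so point the
            # output key at the embedding weight instead.
            self.output_weight_key = self.tok_embeddings_key


# Placeholder path, not a real checkpoint. Per the diff above, get_model_arch
# returns a 2-tuple whose second element is the HF config object.
_, hf_config = get_model_arch('/path/to/Qwen2-1.5B-Instruct')
reader = StubReader(hf_config.to_dict())
print(reader.output_weight_key)  # embedding key when weights are tied
```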
4 changes: 2 additions & 2 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2.py
@@ -22,8 +22,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}']
qkv = qkv.view(kv_head_num, gs + 2, 128, -1)
hidden_dim = qkv.shape[-1]
4 changes: 2 additions & 2 deletions lmdeploy/turbomind/deploy/source_model/xcomposer2_awq.py
@@ -15,8 +15,8 @@ def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,

def _attn(self, i: int, kind: str):
"""Get q, k, v, o qweight for layer i."""
- kv_head_num = self.model_cfg['kv_head_num']
- gs = int(self.model_cfg['attn_head_num'] / kv_head_num)
+ kv_head_num = self.model_cfg['num_key_value_heads']
+ gs = int(self.model_cfg['num_attention_heads'] / kv_head_num)
qkv = self.params[f'model.layers.{i}.attention.wqkv.{kind}']
hidden_dim = qkv.shape[0]
qkv = qkv.view(hidden_dim, kv_head_num, gs + 2, -1)
6 changes: 3 additions & 3 deletions tests/test_lmdeploy/test_turbomind/test_converter.py
@@ -57,18 +57,18 @@ def test_update_from_engine_config():
config = copy.deepcopy(_config)
config.update_from_engine_config(TurbomindEngineConfig())
assert config.tensor_para_size == 1
- assert config.session_len == 32776
+ assert config.session_len == 65544
assert config.max_batch_size == 128
assert config.cache_max_entry_count == 0.8
assert config.quant_policy == 0
- assert config.max_prefill_iters == 5
+ assert config.max_prefill_iters == 9
assert config.num_tokens_per_iter == 8192

config = copy.deepcopy(_config)
config.update_from_engine_config(
TurbomindEngineConfig(max_prefill_token_num=2048,
num_tokens_per_iter=0))
- assert config.max_prefill_iters == 17
+ assert config.max_prefill_iters == 33
assert config.num_tokens_per_iter == 2048

config = copy.deepcopy(_config)
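The new expected values line up with internlm2-chat-7b's context window doubling from 32k to 64k: session_len becomes 65544, and the prefill iteration count looks like ceil(session_len / num_tokens_per_iter). A quick check of that arithmetic; the ceil relation is inferred from the numbers here, not quoted from the engine code:

```python
import math

session_len = 65544  # new expected value in the test above

# Assumed relation: max_prefill_iters = ceil(session_len / num_tokens_per_iter)
print(math.ceil(session_len / 8192))  # 9  (default num_tokens_per_iter)
print(math.ceil(session_len / 2048))  # 33 (max_prefill_token_num=2048 case)
```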
2 changes: 1 addition & 1 deletion tests/test_lmdeploy/test_utils.py
@@ -8,7 +8,7 @@ def test_get_and_verify_max_len():
# with PretrainedConfig
config = AutoConfig.from_pretrained('internlm/internlm2-chat-7b',
trust_remote_code=True)
- assert (_get_and_verify_max_len(config, None) == 32768)
+ assert (_get_and_verify_max_len(config, None) == 65536)
assert (_get_and_verify_max_len(config, 1024) == 1024)
assert (_get_and_verify_max_len(config, 102400) == 102400)
