
remove chat template config in turbomind engine #1161

Merged
merged 16 commits on Jun 25, 2024
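After this change the chat template is resolved outside the engine: the turbomind engine is built from an engine config only, and `TurboMind.from_pretrained` no longer receives a `chat_template_config`. A minimal sketch of the resulting flow, mirroring the new code in `lmdeploy/turbomind/chat.py` below (the model path is a placeholder):

```python
# Sketch only: mirrors the flow introduced by this PR; the model path is a placeholder.
from lmdeploy import turbomind as tm
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model

model_path = '/path/to/model'   # placeholder
model_name = None               # optional; falls back to template matching below

# Resolve the chat template name at the CLI/serving layer, not inside the engine.
chat_template_name = (model_name if model_name in MODELS.module_dict else
                      best_match_model(model_path))
chat_template_config = ChatTemplateConfig(chat_template_name)

# The engine itself is now configured without any chat template information.
engine_cfg = TurbomindEngineConfig(model_name=model_name, tp=1)
tm_model = tm.TurboMind.from_pretrained(model_path, engine_config=engine_cfg)
generator = tm_model.create_instance()
```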
14 changes: 0 additions & 14 deletions lmdeploy/cli/chat.py
@@ -75,10 +75,6 @@ def add_parser_turbomind():
ArgumentHelper.cache_block_seq_len(engine_group)
ArgumentHelper.rope_scaling_factor(engine_group)
ArgumentHelper.session_len(engine_group)
# other arguments
ArgumentHelper.cap(parser)
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
# model args
ArgumentHelper.revision(engine_group)
ArgumentHelper.download_dir(engine_group)
@@ -124,16 +120,6 @@ def turbomind(args):
' future. Please use `lmdeploy chat` instead.')

kwargs = convert_args(args)
from lmdeploy.model import ChatTemplateConfig
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
kwargs.update(dict(chat_template_cfg=chat_template_config))
kwargs.pop('chat_template', None)
main(**kwargs)

@staticmethod
7 changes: 4 additions & 3 deletions lmdeploy/cli/cli.py
@@ -121,7 +121,6 @@ def add_parser_chat():
tp_act = ArgumentHelper.tp(pt_group)
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)

@@ -131,7 +130,6 @@ def add_parser_chat():
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
tb_group._group_actions.append(prefix_caching_act)
ArgumentHelper.model_format(tb_group)
@@ -268,7 +266,10 @@ def chat(args):
from lmdeploy.turbomind.chat import main as run_chat
kwargs = convert_args(args)
kwargs.pop('chat_template')
kwargs['chat_template_cfg'] = chat_template_config
kwargs.pop('meta_instruction')
kwargs.pop('trust_remote_code')
kwargs.pop('backend')
kwargs['chat_template_config'] = chat_template_config
run_chat(**kwargs)

@staticmethod
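In short, `CLI.chat()` now drops the CLI-only fields before dispatching to the turbomind chat runner and passes the chat template under the `chat_template_config` key. A rough sketch of that dispatch, assuming the `convert_args` helper is importable from `lmdeploy.cli.utils` (its import is not shown in this diff):

```python
# Rough sketch of the dispatch in CLI.chat() after this change.
# Assumption: convert_args lives in lmdeploy.cli.utils; `args` is the parsed
# argparse namespace the real method receives.
from lmdeploy.cli.utils import convert_args
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.turbomind.chat import main as run_chat


def chat(args):
    # Build the chat template config from the CLI argument, if any.
    chat_template_config = (ChatTemplateConfig.from_json(args.chat_template)
                            if args.chat_template else None)
    kwargs = convert_args(args)
    # Remove fields that the turbomind chat runner does not accept.
    for cli_only in ('chat_template', 'meta_instruction', 'trust_remote_code',
                     'backend'):
        kwargs.pop(cli_only, None)
    kwargs['chat_template_config'] = chat_template_config
    run_chat(**kwargs)
```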
7 changes: 1 addition & 6 deletions lmdeploy/serve/async_engine.py
@@ -188,7 +188,6 @@ def __init__(self,
if backend == 'turbomind':
self._build_turbomind(model_path=model_path,
backend_config=backend_config,
chat_template_config=chat_template_config,
tp=tp,
**kwargs)
elif backend == 'pytorch':
@@ -222,7 +221,6 @@ def _build_turbomind(
model_path: str,
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
tp: int = 1,
**kwargs):
"""Innter build method for turbomind backend."""
@@ -234,10 +232,7 @@
'turbomind backend'
from lmdeploy import turbomind as tm
self.engine = tm.TurboMind.from_pretrained(
model_path,
engine_config=backend_config,
chat_template_config=chat_template_config,
**kwargs)
model_path, engine_config=backend_config, **kwargs)
self.backend_config = backend_config
self.hf_tm_cfg = self.engine.config

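The chat template stays a serving-layer concern; only the forwarding into the turbomind engine is dropped. For orientation, a hedged sketch at the `pipeline` level, assuming that entry point keeps its `chat_template_config` argument (its signature is not part of this diff):

```python
# Sketch under the assumption that lmdeploy.pipeline still accepts
# backend_config and chat_template_config; neither signature is shown in this diff.
from lmdeploy import pipeline
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig

pipe = pipeline(
    '/path/to/model',                                    # placeholder path
    backend_config=TurbomindEngineConfig(tp=1),          # consumed by the engine
    chat_template_config=ChatTemplateConfig('llama2'))   # any registered template name
print(pipe(['Hello, how are you?']))
```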
170 changes: 102 additions & 68 deletions lmdeploy/turbomind/chat.py
@@ -3,10 +3,17 @@
import random

from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model
from lmdeploy.serve.async_engine import deduce_a_name
from lmdeploy.tokenizer import DetokenizeState
from lmdeploy.utils import _stop_words

os.environ['TM_LOG_LEVEL'] = 'ERROR'
log_level = 'ERROR'
if os.getenv('TM_LOG_LEVEL') is None:
os.environ['TM_LOG_LEVEL'] = log_level
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
logger.setLevel(log_level)


def input_prompt(model_name):
@@ -20,24 +27,26 @@ def input_prompt(model_name):
return '\n'.join(iter(input, sentinel))


def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret


def main(model_path: str,
model_name: str = None,
session_id: int = 1,
top_k: float = 40,
top_p: float = 0.8,
temperature: float = 0.8,
repetition_penalty: float = 1.0,
cap: str = 'chat',
tp: int = 1,
max_batch_size: int = 1,
model_format: str = None,
quant_policy: int = 0,
cache_max_entry_count: float = 0.8,
cache_block_seq_len: int = 64,
rope_scaling_factor: float = 0.0,
enable_prefix_caching: bool = False,
session_len: int = None,
stream_output: bool = True,
request_output_len: int = 1024,
chat_template_cfg: ChatTemplateConfig = None,
chat_template_config: ChatTemplateConfig = None,
**kwargs):
"""An example to perform model inference through the command line
interface.
@@ -46,100 +55,125 @@ def main(model_path: str,
model_path (str): the path of the deployed model
model_name (str): the name of deployed model
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infilling', 'chat', 'python']
top_k (int): sampling top k.
top_p (int): sampling top p.
temperature (float): sampling temperature.
repetition_penalty (float): parameter to penalize repetition
cap (str): the capability of a model. For example, codellama has the ability among ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
max_batch_size (int): max batch size
model_format (str): the layout of the deployed model. It can be one of the following values [hf, llama, awq]
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache.
cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64
rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformers LlamaAttention
enable_prefix_caching (bool): whether to enable prefix caching
session_len (int): the maximum number of input and output tokens in a session
stream_output (bool): indicator for streaming output or not
request_output_len (int): the number of output tokens
chat_template_cfg (ChatTemplateConfig): Chat template config
**kwargs (dict): other arguments for initializing model's chat template
"""
from lmdeploy import turbomind as tm
if chat_template_cfg is None:
chat_template_cfg = ChatTemplateConfig(model_name=model_name,
capability=cap)
new_kwargs = {}
for k, v in kwargs.items():
if hasattr(chat_template_cfg, k):
setattr(chat_template_cfg, k, v)
else:
new_kwargs[k] = v
kwargs = new_kwargs
chat_template_config (ChatTemplateConfig): chat template config
kwargs (dict): unused args
""" # noqa: E 501

# chat template
model_name = deduce_a_name(model_path, model_name, None,
chat_template_config)
if model_name in MODELS.module_dict.keys():
chat_template_name = model_name
else:
chat_template_name = best_match_model(model_path)
if chat_template_config is None:
chat_template_config = ChatTemplateConfig(chat_template_name)
elif chat_template_config.model_name is None:
chat_template_config.model_name = chat_template_name
if chat_template_config.capability is None:
chat_template_config.capability = cap
print('chat_template_config:\n', chat_template_config, sep='', flush=True)
model = chat_template_config.chat_template

# engine
if session_len is None:
session_len = model.session_len

engine_cfg = TurbomindEngineConfig(
max_batch_size=max_batch_size,
model_name=model_name,
model_format=model_format,
session_len=session_len,
cache_max_entry_count=cache_max_entry_count,
cache_block_seq_len=cache_block_seq_len,
enable_prefix_caching=enable_prefix_caching,
quant_policy=quant_policy,
rope_scaling_factor=rope_scaling_factor,
tp=tp)
print('engine_cfg:\n', engine_cfg, sep='', flush=True)

engine_cfg = TurbomindEngineConfig(model_name=model_name, tp=tp)
for k, v in kwargs.items():
if hasattr(engine_cfg, k):
setattr(engine_cfg, k, v)
from lmdeploy import turbomind as tm
tm_model = tm.TurboMind.from_pretrained(model_path,
engine_config=engine_cfg)
generator = tm_model.create_instance()

tm_model = tm.TurboMind.from_pretrained(
model_path,
model_name=model_name,
engine_config=engine_cfg,
capability=cap,
chat_template_config=chat_template_cfg,
**kwargs)
# generation config
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()
gen_config = EngineGenerationConfig(top_k=40)
stop_words = _stop_words(model.stop_words, tokenizer)
if stop_words is not None:
stop_words = stop_words[0][0].tolist()

gen_config = EngineGenerationConfig(max_new_tokens=request_output_len,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
stop_words=stop_words)

nth_round = 1
step = 0
seed = random.getrandbits(64)
model_name = tm_model.model_name
model = tm_model.model

print(f'session {session_id}')
while True:
prompt = input_prompt(model_name)
if prompt == 'exit':
exit(0)
elif prompt == 'end':
prompt = model.get_prompt('', nth_round == 1)
input_ids = tokenizer.encode(prompt)
for outputs in generator.stream_infer(
session_id=session_id,
input_ids=[input_ids],
request_output_len=request_output_len,
sequence_start=False,
sequence_end=True,
stream_output=stream_output):
pass
generator.end(session_id)
nth_round = 1
step = 0
seed = random.getrandbits(64)
else:
prompt = model.get_prompt(prompt, nth_round == 1)
input_ids = tokenizer.encode(prompt, nth_round == 1)
gen_config.random_seed = seed

if model.capability == 'chat':
sequence_start = (nth_round == 1)
sequence_end = False
step = step
else:
sequence_start = True
sequence_end = True
step = 0

if step + len(
input_ids) + request_output_len >= tm_model.session_len:
print('WARNING: exceed session max length.'
' Please end the session.')
continue

sequence_start = (nth_round == 1)
sequence_end = False
if cap != 'chat': # not interactive for other capability
sequence_start, sequence_end = True, True
step = 0

print(f'{prompt}', end='', flush=True)
state = DetokenizeState(len(input_ids))
for outputs in generator.stream_infer(
session_id=session_id,
input_ids=[input_ids],
gen_config=gen_config,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stream_output=stream_output,
gen_config=gen_config,
ignore_eos=False,
random_seed=seed if nth_round == 1 else None):
stream_output=stream_output):

res, tokens = input_ids + outputs.token_ids, outputs.num_token
# decode res
response, state = tokenizer.detokenize_incrementally(
res, state=state)
response = valid_str(response)
print(f'{response}', end='', flush=True)
print(response, end='', flush=True)

# update step
step += len(input_ids) + tokens
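Putting the new pieces of `lmdeploy/turbomind/chat.py` together, a condensed single-turn sketch of the loop above (placeholder model path; stop words and multi-round session bookkeeping are omitted):

```python
# Condensed single-turn sketch of the chat loop above; not a drop-in replacement.
from lmdeploy import turbomind as tm
from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig, best_match_model
from lmdeploy.tokenizer import DetokenizeState

model_path = '/path/to/model'   # placeholder
chat_template = ChatTemplateConfig(best_match_model(model_path)).chat_template

tm_model = tm.TurboMind.from_pretrained(
    model_path, engine_config=TurbomindEngineConfig(tp=1))
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

# Apply the chat template, then tokenize.
prompt = chat_template.get_prompt('Hello!', True)
input_ids = tokenizer.encode(prompt, True)
state = DetokenizeState(len(input_ids))
gen_config = EngineGenerationConfig(max_new_tokens=128,
                                    top_k=40,
                                    top_p=0.8,
                                    temperature=0.8)

# Single round: start and end the sequence in one call, as the loop above
# does for non-chat capabilities.
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      gen_config=gen_config,
                                      sequence_start=True,
                                      sequence_end=True,
                                      step=0,
                                      stream_output=True):
    res = input_ids + outputs.token_ids
    response, state = tokenizer.detokenize_incrementally(res, state=state)
    print(response, end='', flush=True)
```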