
remove chat template config in turbomind engine #1161

Merged
merged 16 commits on Jun 25, 2024
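After this change the chat template is resolved outside the engine: the turbomind engine is built from an engine config only, and `TurboMind.from_pretrained` no longer receives a `chat_template_config`. A minimal sketch of the resulting flow, mirroring the new code in `lmdeploy/turbomind/chat.py` below (the model path is a placeholder):

```python
# Sketch only: mirrors the flow introduced by this PR; the model path is a placeholder.
from lmdeploy import turbomind as tm
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model

model_path = '/path/to/model'   # placeholder
model_name = None               # optional; falls back to template matching below

# Resolve the chat template name at the CLI/serving layer, not inside the engine.
chat_template_name = (model_name if model_name in MODELS.module_dict else
                      best_match_model(model_path))
chat_template_config = ChatTemplateConfig(chat_template_name)

# The engine itself is now configured without any chat template information.
engine_cfg = TurbomindEngineConfig(model_name=model_name, tp=1)
tm_model = tm.TurboMind.from_pretrained(model_path, engine_config=engine_cfg)
generator = tm_model.create_instance()
```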
14 changes: 0 additions & 14 deletions lmdeploy/cli/chat.py
@@ -75,10 +75,6 @@ def add_parser_turbomind():
ArgumentHelper.cache_block_seq_len(engine_group)
ArgumentHelper.rope_scaling_factor(engine_group)
ArgumentHelper.session_len(engine_group)
# other arguments
ArgumentHelper.cap(parser)
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
# model args
ArgumentHelper.revision(engine_group)
ArgumentHelper.download_dir(engine_group)
@@ -124,16 +120,6 @@ def turbomind(args):
' future. Please use `lmdeploy chat` instead.')

kwargs = convert_args(args)
from lmdeploy.model import ChatTemplateConfig
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
kwargs.update(dict(chat_template_cfg=chat_template_config))
kwargs.pop('chat_template', None)
main(**kwargs)

@staticmethod
7 changes: 4 additions & 3 deletions lmdeploy/cli/cli.py
@@ -121,7 +121,6 @@ def add_parser_chat():
tp_act = ArgumentHelper.tp(pt_group)
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
prefix_caching_act = ArgumentHelper.enable_prefix_caching(pt_group)

@@ -131,7 +130,6 @@ def add_parser_chat():
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
tb_group._group_actions.append(prefix_caching_act)
ArgumentHelper.model_format(tb_group)
@@ -268,7 +266,10 @@ def chat(args):
from lmdeploy.turbomind.chat import main as run_chat
kwargs = convert_args(args)
kwargs.pop('chat_template')
kwargs['chat_template_cfg'] = chat_template_config
kwargs.pop('meta_instruction')
kwargs.pop('trust_remote_code')
kwargs.pop('backend')
kwargs['chat_template_config'] = chat_template_config
run_chat(**kwargs)

@staticmethod
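In short, `CLI.chat()` now drops the CLI-only fields before dispatching to the turbomind chat runner and passes the chat template under the `chat_template_config` key. A rough sketch of that dispatch, assuming the `convert_args` helper is importable from `lmdeploy.cli.utils` (its import is not shown in this diff):

```python
# Rough sketch of the dispatch in CLI.chat() after this change.
# Assumption: convert_args lives in lmdeploy.cli.utils; `args` is the parsed
# argparse namespace the real method receives.
from lmdeploy.cli.utils import convert_args
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.turbomind.chat import main as run_chat


def chat(args):
    # Build the chat template config from the CLI argument, if any.
    chat_template_config = (ChatTemplateConfig.from_json(args.chat_template)
                            if args.chat_template else None)
    kwargs = convert_args(args)
    # Remove fields that the turbomind chat runner does not accept.
    for cli_only in ('chat_template', 'meta_instruction', 'trust_remote_code',
                     'backend'):
        kwargs.pop(cli_only, None)
    kwargs['chat_template_config'] = chat_template_config
    run_chat(**kwargs)
```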
7 changes: 1 addition & 6 deletions lmdeploy/serve/async_engine.py
@@ -188,7 +188,6 @@ def __init__(self,
if backend == 'turbomind':
self._build_turbomind(model_path=model_path,
backend_config=backend_config,
chat_template_config=chat_template_config,
tp=tp,
**kwargs)
elif backend == 'pytorch':
@@ -222,7 +221,6 @@ def _build_turbomind(
model_path: str,
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
tp: int = 1,
**kwargs):
"""Innter build method for turbomind backend."""
@@ -234,10 +232,7 @@
'turbomind backend'
from lmdeploy import turbomind as tm
self.engine = tm.TurboMind.from_pretrained(
model_path,
engine_config=backend_config,
chat_template_config=chat_template_config,
**kwargs)
model_path, engine_config=backend_config, **kwargs)
self.backend_config = backend_config
self.hf_tm_cfg = self.engine.config

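The chat template stays a serving-layer concern; only the forwarding into the turbomind engine is dropped. For orientation, a hedged sketch at the `pipeline` level, assuming that entry point keeps its `chat_template_config` argument (its signature is not part of this diff):

```python
# Sketch under the assumption that lmdeploy.pipeline still accepts
# backend_config and chat_template_config; neither signature is shown in this diff.
from lmdeploy import pipeline
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig

pipe = pipeline(
    '/path/to/model',                                    # placeholder path
    backend_config=TurbomindEngineConfig(tp=1),          # consumed by the engine
    chat_template_config=ChatTemplateConfig('llama2'))   # any registered template name
print(pipe(['Hello, how are you?']))
```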
170 changes: 102 additions & 68 deletions lmdeploy/turbomind/chat.py
@@ -3,10 +3,17 @@
import random

from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.model import MODELS, ChatTemplateConfig, best_match_model
from lmdeploy.serve.async_engine import deduce_a_name
from lmdeploy.tokenizer import DetokenizeState
from lmdeploy.utils import _stop_words

os.environ['TM_LOG_LEVEL'] = 'ERROR'
log_level = 'ERROR'
if os.getenv('TM_LOG_LEVEL') is None:
os.environ['TM_LOG_LEVEL'] = log_level
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
logger.setLevel(log_level)


def input_prompt(model_name):
@@ -20,24 +27,26 @@ def input_prompt(model_name):
return '\n'.join(iter(input, sentinel))


def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret


def main(model_path: str,
model_name: str = None,
session_id: int = 1,
top_k: float = 40,
top_p: float = 0.8,
temperature: float = 0.8,
repetition_penalty: float = 1.0,
cap: str = 'chat',
tp: int = 1,
max_batch_size: int = 1,
model_format: str = None,
quant_policy: int = 0,
cache_max_entry_count: float = 0.8,
cache_block_seq_len: int = 64,
rope_scaling_factor: float = 0.0,
enable_prefix_caching: bool = False,
session_len: int = None,
stream_output: bool = True,
request_output_len: int = 1024,
chat_template_cfg: ChatTemplateConfig = None,
chat_template_config: ChatTemplateConfig = None,
**kwargs):
"""An example to perform model inference through the command line
interface.
@@ -46,100 +55,125 @@ def main(model_path: str,
model_path (str): the path of the deployed model
model_name (str): the name of deployed model
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infilling', 'chat', 'python']
top_k (int): sampling top k.
top_p (int): sampling top p.
temperature (float): sampling temperature.
repetition_penalty (float): parameter to penalize repetition
cap (str): the capability of a model. For example, codellama has the ability among ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
max_batch_size (int): max batch size
model_format (str): the layout of the deployed model. It can be one of the following values [hf, llama, awq]
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache.
cache_block_seq_len (int): the length of the token sequence in a k/v block, default to 64
rope_scaling_factor (float): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformers LlamaAttention
enable_prefix_caching (bool): whether to enable prefix caching
session_len (int): the maximum number of input and output tokens in a session
stream_output (bool): indicator for streaming output or not
request_output_len (int): the number of output tokens
chat_template_cfg (ChatTemplateConfig): Chat template config
**kwargs (dict): other arguments for initializing model's chat template
"""
from lmdeploy import turbomind as tm
if chat_template_cfg is None:
chat_template_cfg = ChatTemplateConfig(model_name=model_name,
capability=cap)
new_kwargs = {}
for k, v in kwargs.items():
if hasattr(chat_template_cfg, k):
setattr(chat_template_cfg, k, v)
else:
new_kwargs[k] = v
kwargs = new_kwargs
chat_template_config (ChatTemplateConfig): chat template config
kwargs (dict): unused args
""" # noqa: E 501

# chat template
model_name = deduce_a_name(model_path, model_name, None,
chat_template_config)
if model_name in MODELS.module_dict.keys():
chat_template_name = model_name
else:
chat_template_name = best_match_model(model_path)
if chat_template_config is None:
chat_template_config = ChatTemplateConfig(chat_template_name)
elif chat_template_config.model_name is None:
chat_template_config.model_name = chat_template_name
if chat_template_config.capability is None:
chat_template_config.capability = cap
print('chat_template_config:\n', chat_template_config, sep='', flush=True)
model = chat_template_config.chat_template

# engine
if session_len is None:
session_len = model.session_len

engine_cfg = TurbomindEngineConfig(
max_batch_size=max_batch_size,
model_name=model_name,
model_format=model_format,
session_len=session_len,
cache_max_entry_count=cache_max_entry_count,
cache_block_seq_len=cache_block_seq_len,
enable_prefix_caching=enable_prefix_caching,
quant_policy=quant_policy,
rope_scaling_factor=rope_scaling_factor,
tp=tp)
print('engine_cfg:\n', engine_cfg, sep='', flush=True)

engine_cfg = TurbomindEngineConfig(model_name=model_name, tp=tp)
for k, v in kwargs.items():
if hasattr(engine_cfg, k):
setattr(engine_cfg, k, v)
from lmdeploy import turbomind as tm
tm_model = tm.TurboMind.from_pretrained(model_path,
engine_config=engine_cfg)
generator = tm_model.create_instance()

tm_model = tm.TurboMind.from_pretrained(
model_path,
model_name=model_name,
engine_config=engine_cfg,
capability=cap,
chat_template_config=chat_template_cfg,
**kwargs)
# generation config
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()
gen_config = EngineGenerationConfig(top_k=40)
stop_words = _stop_words(model.stop_words, tokenizer)
if stop_words is not None:
stop_words = stop_words[0][0].tolist()

gen_config = EngineGenerationConfig(max_new_tokens=request_output_len,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
stop_words=stop_words)

nth_round = 1
step = 0
seed = random.getrandbits(64)
model_name = tm_model.model_name
model = tm_model.model

print(f'session {session_id}')
while True:
prompt = input_prompt(model_name)
if prompt == 'exit':
exit(0)
elif prompt == 'end':
prompt = model.get_prompt('', nth_round == 1)
input_ids = tokenizer.encode(prompt)
for outputs in generator.stream_infer(
session_id=session_id,
input_ids=[input_ids],
request_output_len=request_output_len,
sequence_start=False,
sequence_end=True,
stream_output=stream_output):
pass
generator.end(session_id)
nth_round = 1
step = 0
seed = random.getrandbits(64)
else:
prompt = model.get_prompt(prompt, nth_round == 1)
input_ids = tokenizer.encode(prompt, nth_round == 1)
gen_config.random_seed = seed

if model.capability == 'chat':
sequence_start = (nth_round == 1)
sequence_end = False
step = step
else:
sequence_start = True
sequence_end = True
step = 0

if step + len(
input_ids) + request_output_len >= tm_model.session_len:
print('WARNING: exceed session max length.'
' Please end the session.')
continue

sequence_start = (nth_round == 1)
sequence_end = False
if cap != 'chat': # not interactive for other capability
sequence_start, sequence_end = True, True
step = 0

print(f'{prompt}', end='', flush=True)
state = DetokenizeState(len(input_ids))
for outputs in generator.stream_infer(
session_id=session_id,
input_ids=[input_ids],
gen_config=gen_config,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stream_output=stream_output,
gen_config=gen_config,
ignore_eos=False,
random_seed=seed if nth_round == 1 else None):
stream_output=stream_output):

res, tokens = input_ids + outputs.token_ids, outputs.num_token
# decode res
response, state = tokenizer.detokenize_incrementally(
res, state=state)
response = valid_str(response)
print(f'{response}', end='', flush=True)
print(response, end='', flush=True)

# update step
step += len(input_ids) + tokens
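Putting the new pieces of `lmdeploy/turbomind/chat.py` together, a condensed single-turn sketch of the loop above (placeholder model path; stop words and multi-round session bookkeeping are omitted):

```python
# Condensed single-turn sketch of the chat loop above; not a drop-in replacement.
from lmdeploy import turbomind as tm
from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig
from lmdeploy.model import ChatTemplateConfig, best_match_model
from lmdeploy.tokenizer import DetokenizeState

model_path = '/path/to/model'   # placeholder
chat_template = ChatTemplateConfig(best_match_model(model_path)).chat_template

tm_model = tm.TurboMind.from_pretrained(
    model_path, engine_config=TurbomindEngineConfig(tp=1))
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()

# Apply the chat template, then tokenize.
prompt = chat_template.get_prompt('Hello!', True)
input_ids = tokenizer.encode(prompt, True)
state = DetokenizeState(len(input_ids))
gen_config = EngineGenerationConfig(max_new_tokens=128,
                                    top_k=40,
                                    top_p=0.8,
                                    temperature=0.8)

# Single round: start and end the sequence in one call, as the loop above
# does for non-chat capabilities.
for outputs in generator.stream_infer(session_id=1,
                                      input_ids=[input_ids],
                                      gen_config=gen_config,
                                      sequence_start=True,
                                      sequence_end=True,
                                      step=0,
                                      stream_output=True):
    res = input_ids + outputs.token_ids
    response, state = tokenizer.detokenize_incrementally(res, state=state)
    print(response, end='', flush=True)
```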