# Patch summary (descriptive text ahead of the `diff --git` header; not part of any hunk).
#
# Target: config.pbtxt, a Triton model configuration in protobuf text format.
#
# What the patch changes:
#   * Header comment goes from "Triton Backend for vLLM." to "Triton Backend for TransformerLLM.".
#   * `backend: "vllm"` becomes `backend: "python"`, and the `platform: "vllm"` line is removed.
#   * Explanatory comments are added above the `input [` and `output [` lists.
#   * Blank lines are added around the existing input/output entries.
#   * A `parameters: [ ... ]` list is added with four string-valued keys:
#     base_model_path, is_adapter_model ("false"), adapter_model_path (""), quantization ("none").
#   * A trailing blank line is added after the `instance_group` list.
#
# What stays as unchanged context:
#   * `max_batch_size: 0`.
#   * Nine inputs, each with `dims: [ 1 ]`: text_input (TYPE_STRING), max_length (TYPE_INT32),
#     max_new_tokens (TYPE_INT32), do_sample (TYPE_BOOL), top_k (TYPE_INT32), top_p (TYPE_FP32),
#     temperature (TYPE_FP32), repetition_penalty (TYPE_FP32), stream (TYPE_BOOL).
#   * One output: text_output (TYPE_STRING, `dims: [ 1 ]`).
#   * `instance_group` with `count: 1`.
#
# NOTE(review): the added comment says Triton expects "a single string input ... named 'text_input'",
#   but the input list declares nine inputs. The comment looks stale; confirm and reword.
# NOTE(review): the whole patch sits on one physical line, so line breaks and indentation of the
#   individual patch lines are lost. The first hunk header (-1,66 +1,110) accounts for 4 more
#   context lines than are visible here, presumably whitespace-only lines. Regenerate the patch
#   from the repository rather than re-wrapping this text by hand.
# NOTE(review): the hunks cover old lines 1-66 and 68-70, so old line 67 is not shown.
#   Its content cannot be determined from this patch.
diff --git a/config.pbtxt b/config.pbtxt index 130a973..e3fefb8 100644 --- a/config.pbtxt +++ b/config.pbtxt @@ -1,66 +1,110 @@ -# Triton Backend for vLLM. -backend: "vllm" -platform: "vllm" +# Triton Backend for TransformerLLM. +backend: "python" max_batch_size: 0 +# Triton should expect as input a single string +# input of variable length named 'text_input' input [ + { name: "text_input" data_type: TYPE_STRING dims: [ 1 ] + + }, { name: "max_length" data_type: TYPE_INT32 dims: [ 1 ] + + }, { name: "max_new_tokens" data_type: TYPE_INT32 dims: [ 1 ] + + }, { name: "do_sample" data_type: TYPE_BOOL dims: [ 1 ] + + }, { name: "top_k" data_type: TYPE_INT32 dims: [ 1 ] + + }, { name: "top_p" data_type: TYPE_FP32 dims: [ 1 ] + + }, { name: "temperature" data_type: TYPE_FP32 dims: [ 1 ] + + }, { name: "repetition_penalty" data_type: TYPE_FP32 dims: [ 1 ] + + }, { name: "stream" data_type: TYPE_BOOL dims: [ 1 ] + + } + ] + +# Triton should expect to respond with a single string +# output of variable length named 'text_output' output [ + { name: "text_output" data_type: TYPE_STRING dims: [ 1 ] + } + ] - - +parameters: [ + { + key: "base_model_path", + value: {string_value: "/cheetah/input/model/groupuser/TinyLlama-1.1B-Chat-v1.0"} + }, + { + key: "is_adapter_model", + value: {string_value: "false"} + }, + { + key: "adapter_model_path", + value: {string_value: ""} + }, + + { + key: "quantization", + value: {string_value: "none"} + } +] instance_group [ { @@ -68,3 +112,4 @@ instance_group [ count: 1 } ] +