name: "base-gemma-3-1b-it"
# Triton backend to use
backend: "python"
max_batch_size: 15

# The primary input is a single variable-length string named 'text_input';
# the remaining inputs are optional scalar generation parameters.
input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "max_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "max_new_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "do_sample"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]

# Triton should respond with a single variable-length string
# output named 'text_output'
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]

parameters: [
  {
    key: "base_model_path",
    value: { string_value: "/cheetah/input/model/groupuser/base-gemma-3-1b-it" }
  },
  {
    key: "is_adapter_model",
    value: { string_value: "false" }
  },
  {
    key: "adapter_model_path",
    value: { string_value: "" }
  },
  {
    key: "quantization",
    value: { string_value: "int8" }
  }
]

instance_group [
  {
    kind: KIND_AUTO
    count: 1
  }
]

# Commented-out reference configuration for an alternative
# TensorRT-LLM deployment of Llama-3.2-1B-Instruct:
# "model": {
#   "name": "Llama-3.2-1B-Instruct",
#   "backend": "TransformerLLM",
#   "tensorrtllm": {
#     "workers": 1,
#     "maxSeqLen": 1,
#     "kvCacheType": "paged",
#     "maxInputLen": 1024,
#     "maxNumTokens": 0
#   },
#   "maxBatchSize": 4,
#   "quantization": "int4",
#   "modelInstanceGroupKind": "KIND_GPU",
#   "modelInstanceGroupCount": 1
# }
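For context, below is a minimal sketch of the `model.py` that a Python-backend model with this config would sit next to in the model repository. This is an assumption, not the project's actual serving code: the default values for the optional parameters and the use of Hugging Face `transformers` for loading and generation are illustrative. The sketch wires only the inputs declared above (the `stream` flag is ignored, since true streaming would require Triton's decoupled transaction policy), handles the first string of each request's batch, and skips the `int8` quantization parameter (one option would be transformers' `BitsAndBytesConfig(load_in_8bit=True)`).

```python
# model.py — minimal sketch matching the config.pbtxt above (assumptions noted).
import json

import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import AutoModelForCausalLM, AutoTokenizer


def _optional_scalar(request, name, default):
    """Read an optional [1]-shaped input; fall back to an illustrative default."""
    tensor = pb_utils.get_input_tensor_by_name(request, name)
    if tensor is None:
        return default
    return tensor.as_numpy().reshape(-1)[0].item()


class TritonPythonModel:
    def initialize(self, args):
        # Triton passes the parsed config as a JSON string; the 'parameters'
        # entries from config.pbtxt appear as {"key": {"string_value": ...}}.
        config = json.loads(args["model_config"])
        model_path = config["parameters"]["base_model_path"]["string_value"]
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # The "quantization": "int8" parameter is not handled in this sketch.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map="auto"
        )

    def execute(self, requests):
        responses = []
        for request in requests:
            # String tensors arrive as numpy object arrays of bytes;
            # this sketch only serves the first string in the batch.
            text = (
                pb_utils.get_input_tensor_by_name(request, "text_input")
                .as_numpy()
                .reshape(-1)[0]
                .decode("utf-8")
            )
            inputs = self.tokenizer(text, return_tensors="pt").to(self.model.device)
            output_ids = self.model.generate(
                **inputs,
                # Defaults below are placeholders, not values from the source.
                max_new_tokens=int(_optional_scalar(request, "max_new_tokens", 256)),
                do_sample=bool(_optional_scalar(request, "do_sample", False)),
                top_k=int(_optional_scalar(request, "top_k", 50)),
                top_p=float(_optional_scalar(request, "top_p", 1.0)),
                temperature=float(_optional_scalar(request, "temperature", 1.0)),
                repetition_penalty=float(
                    _optional_scalar(request, "repetition_penalty", 1.0)
                ),
            )
            completion = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            # Output shape [1, 1]: batch dim plus the [-1] dim from the config.
            out = pb_utils.Tensor(
                "text_output",
                np.array([[completion.encode("utf-8")]], dtype=object),
            )
            responses.append(pb_utils.InferenceResponse(output_tensors=[out]))
        return responses
```

Note that `pb_utils.get_input_tensor_by_name` returns `None` when an optional input is absent from a request, which is why the fallback defaults live in the `_optional_scalar` helper rather than in the config itself.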