# Triton Inference Server model configuration (protobuf text format)
# for the Llama-3.2-1B-Instruct model, served via the Python backend
# with a maximum dynamic batch size of 4.
name: "Llama-3.2-1B-Instruct"
backend: "python"
max_batch_size: 4

# Declared inputs: two tensors, 'input0' (FP32) and 'input1' (UINT64).
# NOTE(review): an earlier comment here described a single variable-length
# string input named 'text_input', which does not match the tensors declared
# below — confirm which contract the backend script actually implements.
# NOTE(review): the reshape shapes do not appear to preserve element count
# relative to dims (input0: 1*1*1*1 = 1 vs 11*11*11 = 1331; input1:
# 2*2*2*2 = 16 vs 22*22*22*22 = 234256). Triton's model-config rules
# require dims and reshape to describe the same number of elements —
# verify these values against the deployed model before use.
input [
  {
    name: "input0"
    data_type: TYPE_FP32
    dims: [ 1, 1, 1, 1 ]
    reshape { shape: [ 11, 11, 11 ] }
  },
  {
    name: "input1"
    data_type: TYPE_UINT64
    dims: [ 2, 2, 2, 2 ]
    reshape { shape: [ 22, 22, 22, 22 ] }
  }
]

# Declared outputs: 'output0' (BOOL), 'output1' (FP64, reshaped from
# [1, 100] to [100]), and a variable-length string 'text_output'.
# NOTE(review): an earlier comment described only a single string output;
# three outputs are actually declared.
output [
  {
    name: "output0"
    data_type: TYPE_BOOL
    dims: [ 1 ]
  },
  {
    name: "output1"
    data_type: TYPE_FP64
    dims: [ 1, 100 ]
    reshape { shape: [ 100 ] }
  },
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]

# Backend-specific key/value parameters, exposed to the Python backend's
# model script at load time (paths, adapter flag, quantization mode).
parameters: [
  {
    key: "base_model_path",
    value: {string_value: "/cheetah/input/model/groupuser/Llama-3.2-1B-Instruct"}
  },
  {
    key: "is_adapter_model",
    value: {string_value: "false"}
  },
  {
    key: "adapter_model_path",
    value: {string_value: ""}
  },
  {
    key: "quantization",
    value: {string_value: "int4"}
  }
]

# Run a single model instance on GPU.
instance_group [
  {
    kind: KIND_GPU
    count: 1
  }
]