import triton_python_backend_utils as pb_utils
from transformers import AutoTokenizer, AutoModel
import torch


class TritonPythonModel:
    def initialize(self, args):
        # Load the tokenizer and encoder once, when Triton loads the model.
        self.tokenizer = AutoTokenizer.from_pretrained("../../kicon_e5large_15_v1")
        self.model = AutoModel.from_pretrained("../../kicon_e5large_15_v1")
        self.model.eval()

    def execute(self, requests):
        responses = []
        for request in requests:
            # The "text" input arrives as a BYTES tensor; take the first
            # element and decode it to a Python string.
            input_text = (
                pb_utils.get_input_tensor_by_name(request, "text")
                .as_numpy()[0]
                .decode("utf-8")
            )
            inputs = self.tokenizer(
                input_text, return_tensors="pt", truncation=True, padding=True
            )
            # Inference only: disable gradient tracking to save memory.
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Mean-pool the token embeddings. With a single sequence there is
            # no padding, so a plain mean over dim=1 matches mask-aware
            # mean pooling.
            embedding = outputs.last_hidden_state.mean(dim=1).numpy()
            out_tensor = pb_utils.Tensor("embedding", embedding)
            responses.append(pb_utils.InferenceResponse(output_tensors=[out_tensor]))
        return responses
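
For reference, a minimal client-side sketch that exercises this model over Triton's HTTP endpoint. The model name ("kicon_e5large") and server URL are assumptions; substitute the directory name this model.py lives under in your model repository. This also assumes the accompanying config.pbtxt declares backend: "python", an input "text" of TYPE_STRING, and an output "embedding" of TYPE_FP32.

import numpy as np
import tritonclient.http as httpclient

# Hypothetical server URL; adjust to your deployment.
client = httpclient.InferenceServerClient(url="localhost:8000")

# The Python backend above expects a BYTES tensor named "text".
text = np.array([b"example sentence"], dtype=np.object_)
inp = httpclient.InferInput("text", [1], "BYTES")
inp.set_data_from_numpy(text)

out = httpclient.InferRequestedOutput("embedding")
# Hypothetical model name; use the name registered in the model repository.
result = client.infer(model_name="kicon_e5large", inputs=[inp], outputs=[out])

embedding = result.as_numpy("embedding")  # shape: (1, hidden_size)
print(embedding.shape)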