Compare commits


No commits in common. "refs/deployment/triton" and "main" have entirely different histories.

7 changed files with 108 additions and 395 deletions

36
.gitattributes vendored Normal file

@@ -0,0 +1,36 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
Qwen3-4B-Q4_K_M.gguf filter=lfs diff=lfs merge=lfs -text

@@ -1,260 +0,0 @@
"""
[Transformer-LLM 백엔드 가이드]
파일은 NVIDIA Triton Server에서 Hugging Face `AutoModelForCausalLM` 기반 모델을 손쉽게 배포하기 위해 제공되는 커스텀 Python 백엔드 템플릿입니다.
1. 모델 호환성
- Hugging Face의 `AutoModelForCausalLM` 클래스와 호환되는 모든 Causal Language Model을 지원합니다.
- [확인] 배포할 모델 `config.json` `architectures` 항목이 `...ForCausalLM` 형식인지 확인.
- [확인] 모델이 Hugging Face 공식 문서의 AutoModelForCausalLM이 지원하는 모델인지 확인.
(https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoModelForCausalLM.from_pretrained)
2. 토크나이저 호환성
- `AutoTokenizer` 호환되는 토크나이저를 지원하며, 모델과 동일한 경로에서 자동으로 로드됩니다.
3. 커스터마이징 안내
- 템플릿은 범용적인 사용을 위해 작성되었습니다.
- 특정 모델의 동작 방식이나 예외 처리가 필요한 경우, 파일(`model.py`) 설정 파일(`config.pbtxt`) 직접 수정하여 사용하시기 바랍니다.
"""
import json
import torch
import numpy as np
import triton_python_backend_utils as pb_utils
import uuid
import transformers
from typing import List, Dict, Any, Union, Tuple
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
GenerationConfig,
BitsAndBytesConfig,
)
from peft import PeftModel, PeftConfig
class TritonPythonModel:
def initialize(self, args: Dict[str, str]):
"""
모델 초기화: 라이브러리 버전 확인 모델/토크나이저 로드
"""
self.logger = pb_utils.Logger
self.model_config = json.loads(args["model_config"])
self.model_name = args["model_name"]
# 1. 라이브러리 버전 로그 추가
# GGUF 로드를 위해서는 최소 4.40.0 이상을 권장합니다.
transformers_version = transformers.__version__
self.logger.log_info(f"================ {self.model_name} Setup ================")
self.logger.log_info(f"Transformers Version: {transformers_version}")
self.logger.log_info(f"Torch Version: {torch.__version__}")
# 설정 파라미터 로드
self.base_model_path = self._get_config_param("base_model_path")
self.gguf_filename = self._get_config_param("gguf_filename")
self.device = "cuda" if torch.cuda.is_available() else "cpu"
self.logger.log_info(f"Base Model Path: {self.base_model_path}")
self.logger.log_info(f"GGUF Filename: {self.gguf_filename}")
self.logger.log_info(f"Device: {self.device}")
# 2. 모델 및 토크나이저 로드 실행
self._load_model_and_tokenizer()
self.logger.log_info(f"Model initialized successfully.")
def _load_model_and_tokenizer(self):
"""
config.pbtxt의 파라미터를 사용하여 GGUF 모델을 로드합니다.
Transformers 라이브러리가 GGUF를 읽어 fp16으로 역양자화합니다.
"""
# 1. config.pbtxt에서 설정값 읽기
load_path = self.base_model_path # /cheetah/input/model/groupuser/Qwen3-4B-Instruct-2507-mahjong-alpha
gguf_file = self._get_config_param("gguf_filename") # Qwen3-4B-Instruct-2507-mahjong-alpha.gguf
self.logger.log_info(f"Loading GGUF from: {load_path}/{gguf_file}")
try:
# 2. Tokenizer 로드 (GGUF 파일 내의 토크나이저 메타데이터 참조)
self.tokenizer = AutoTokenizer.from_pretrained(
load_path,
gguf_file=gguf_file,
trust_remote_code=True
)
# 3. Model 로드 (GGUF -> PyTorch fp16 변환)
# 주의: GGUF 로드 시 bnb_config(int4/8)와 중복 사용은 불가능할 수 있습니다.
self.model = AutoModelForCausalLM.from_pretrained(
load_path,
gguf_file=gguf_file,
torch_dtype=torch.float16,
device_map="auto",
local_files_only=True,
trust_remote_code=True
)
self.model.eval()
# 패딩 토큰 설정
if self.tokenizer.pad_token is None:
self.tokenizer.pad_token = self.tokenizer.eos_token
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
self.supports_chat_template = (
hasattr(self.tokenizer, "chat_template") and
self.tokenizer.chat_template is not None
)
self.logger.log_info("GGUF Model and Tokenizer loaded successfully via Transformers.")
except Exception as e:
self.logger.log_error(f"Failed to load GGUF model: {e}")
raise e
def _get_bnb_config(self) -> Union[BitsAndBytesConfig, None]:
if self.quantization == "int4":
return BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16
)
elif self.quantization == "int8":
return BitsAndBytesConfig(
load_in_8bit=True,
llm_int8_threshold=6.0,
llm_int8_has_fp16_weight=True
)
return None
def execute(self, requests):
"""Triton Inference Request 처리 메인 루프"""
responses = []
for request in requests:
# [ID 생성 로직] - 로그 추적용으로 유지 (Response에는 포함 X)
request_id = request.request_id()
if not request_id:
request_id = str(uuid.uuid4())
try:
# 1. 입력 데이터 파싱
input_data, is_chat = self._parse_input(request)
# [LOGGING] Request ID 포함하여 로그 출력
log_input_str = json.dumps(input_data, ensure_ascii=False) if isinstance(input_data, (list, dict)) else str(input_data)
self.logger.log_info(f"\n[RID: {request_id}] >>> [{'CHAT' if is_chat else 'TEXT'}][Input]: {log_input_str}")
# 2. Generation Config 생성
gen_config = self._create_generation_config(request)
# 3. 토크나이징
inputs = self._tokenize(input_data, is_chat)
# 4. 모델 추론 (Generate)
output_text = self._generate(inputs, gen_config)
# [LOGGING] Request ID 포함하여 결과 출력
self.logger.log_info(f"\n[RID: {request_id}] <<< [Output]: {output_text}")
# 5. 응답 생성
responses.append(self._create_response(output_text, request_id))
except Exception as e:
self.logger.log_error(f"[RID: {request_id}] Error during execution: {e}")
err_tensor = pb_utils.Tensor("text_output", np.array([str(e).encode('utf-8')], dtype=np.bytes_))
responses.append(pb_utils.InferenceResponse(output_tensors=[err_tensor]))
return responses
def _parse_input(self, request) -> Tuple[Union[str, List[Dict]], bool]:
input_text = self._get_input_scalar(request, "text_input")
try:
conversation = json.loads(input_text)
if isinstance(conversation, list):
return conversation, True
except (json.JSONDecodeError, TypeError):
pass
return input_text, False
def _tokenize(self, input_data, is_chat: bool):
if self.supports_chat_template and is_chat:
return self.tokenizer.apply_chat_template(
input_data,
tokenize=True,
add_generation_prompt=True,
return_tensors="pt",
return_dict=True
).to(self.device)
else:
if is_chat:
input_data = str(input_data)
return self.tokenizer(input_data, return_tensors="pt").to(self.device)
def _generate(self, inputs, gen_config: GenerationConfig) -> str:
input_ids = inputs["input_ids"]
input_len = input_ids.shape[-1]
with torch.no_grad():
outputs = self.model.generate(
**inputs,
generation_config=gen_config,
pad_token_id=self.tokenizer.pad_token_id,
eos_token_id=self.tokenizer.eos_token_id
)
generated_tokens = outputs[0][input_len:]
decoded_output = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
return decoded_output.strip()
def _create_generation_config(self, request) -> GenerationConfig:
def get_param(name, default=None, cast_type=None):
val = self._get_input_scalar(request, name, default)
if val is not None and cast_type:
return cast_type(val)
return val
return GenerationConfig(
max_length=get_param("max_length", 1024, int),
max_new_tokens=get_param("max_new_tokens", 256, int),
temperature=get_param("temperature", 1.0, float),
do_sample=get_param("do_sample", False, bool),
top_k=get_param("top_k", 50, int),
top_p=get_param("top_p", 1.0, float),
repetition_penalty=get_param("repetition_penalty", 1.0, float),
)
def _create_response(self, output_text: str, request_id: str):
"""생성된 텍스트를 Triton Response 객체로 변환"""
output_tensor = pb_utils.Tensor(
"text_output",
np.array([output_text.encode('utf-8')], dtype=np.bytes_)
)
return pb_utils.InferenceResponse(output_tensors=[output_tensor])
def _get_config_param(self, key: str, default: str = None) -> str:
params = self.model_config.get('parameters', {})
if key in params:
return params[key].get('string_value', default)
return default
def _get_input_scalar(self, request, name: str, default=None):
tensor = pb_utils.get_input_tensor_by_name(request, name)
if tensor is None:
return default
return self._np_decoder(tensor.as_numpy()[0])
def _np_decoder(self, obj):
if isinstance(obj, bytes):
return obj.decode('utf-8')
if np.issubdtype(obj, np.integer):
return int(obj)
if np.issubdtype(obj, np.floating):
return round(float(obj), 3)
if isinstance(obj, np.bool_):
return bool(obj)
def finalize(self):
self.logger.log_info(f"Finalizing model {self.model_name}")
self.model = None
self.tokenizer = None
torch.cuda.empty_cache()
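
For reference, here is a minimal client sketch for the backend above. The server URL and the served model name (`qwen3-4b-instruct-gguf`) are assumptions, since this compare view does not show how the model repository is registered; the tensor names, types, and shapes follow the `config.pbtxt` shown further down.

```python
# Minimal client sketch (assumptions: Triton serves this backend under the
# hypothetical name "qwen3-4b-instruct-gguf" at localhost:8000, and the
# tritonclient[http] package is installed).
import json
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

# "text_input" may be plain text or a JSON-encoded chat message list;
# _parse_input() detects the latter and routes it through the chat template.
messages = [{"role": "user", "content": "Give me a one-sentence summary of GGUF."}]
text_input = httpclient.InferInput("text_input", [1], "BYTES")
text_input.set_data_from_numpy(np.array([json.dumps(messages).encode("utf-8")], dtype=object))

# Optional generation parameter declared in config.pbtxt.
max_new_tokens = httpclient.InferInput("max_new_tokens", [1], "INT32")
max_new_tokens.set_data_from_numpy(np.array([128], dtype=np.int32))

result = client.infer("qwen3-4b-instruct-gguf", inputs=[text_input, max_new_tokens])
print(result.as_numpy("text_output")[0].decode("utf-8"))
```

Because `_parse_input` only treats the payload as a chat when it decodes to a JSON list, plain strings can be sent through the same `text_input` tensor without any extra flag.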

BIN
Qwen3-4B-Q4_K_M.gguf (Stored with Git LFS) Normal file

Binary file not shown.

35
README.md Normal file

@@ -0,0 +1,35 @@
---
license: apache-2.0
inference: false
base_model: Qwen/Qwen3-4B-Instruct
base_model_relation: quantized
tags: [green, llmware-chat, p4, gguf, emerald]
---
# qwen3-4b-instruct-gguf
**qwen3-4b-instruct-gguf** is a GGUF Q4_K_M (int4) quantized version of [Qwen3-4B-Instruct](https://www.huggingface.co/Qwen/Qwen3-4B-Instruct), providing a fast inference implementation optimized for AI PCs.
It is part of the latest Qwen release series and has 'thinking' capability expressed as 'think' tokens.
This model will run on an AI PC with at least 16 GB of memory.
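
One way to run this GGUF locally is via llama-cpp-python. The sketch below assumes that package is installed and that the GGUF file from this repository has been downloaded into the working directory; the context size and token limit are illustrative, not values prescribed here.

```python
# Minimal sketch: local chat inference on the Q4_K_M GGUF via llama-cpp-python (assumed installed).
from llama_cpp import Llama

llm = Llama(model_path="Qwen3-4B-Q4_K_M.gguf", n_ctx=4096)  # context size is illustrative
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "What does Q4_K_M quantization mean?"}],
    max_tokens=256,
)
print(out["choices"][0]["message"]["content"])
```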
### Model Description
- **Developed by:** Qwen
- **Model type:** qwen3
- **Parameters:** 4 billion
- **Model Parent:** Qwen/Qwen3-4B-Instruct
- **Language(s) (NLP):** English
- **License:** Apache 2.0
- **Uses:** Chat, general-purpose LLM
- **Quantization:** int4
## Model Card Contact
[llmware on github](https://www.github.com/llmware-ai/llmware)
[llmware on hf](https://www.huggingface.co/llmware)
[llmware website](https://www.llmware.ai)

30
config.json Normal file

@@ -0,0 +1,30 @@
{
  "architectures": [
    "Qwen3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 9728,
  "max_position_embeddings": 40960,
  "max_window_layers": 36,
  "model_type": "qwen3",
  "num_attention_heads": 32,
  "num_hidden_layers": 36,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000,
  "sliding_window": null,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}
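
This `config.json` lists `Qwen3ForCausalLM`, which satisfies the `...ForCausalLM` architecture check described in the backend template above. A minimal sketch of that check is shown below; the local file path is illustrative.

```python
# Minimal sketch of the compatibility check from the backend guide:
# the "architectures" entry in config.json should end with "ForCausalLM".
import json

with open("config.json") as f:  # path is illustrative
    cfg = json.load(f)

architectures = cfg.get("architectures", [])
assert any(a.endswith("ForCausalLM") for a in architectures), (
    f"Not a causal LM architecture: {architectures}"
)
print(f"Compatible architectures: {architectures}")
```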

@@ -1,135 +0,0 @@
# Triton Backend for Transformer-LLM.
backend: "python"
max_batch_size: 0

# Triton should expect as input a single string
# input of variable length named 'text_input'
input [
  {
    name: "text_input"
    data_type: TYPE_STRING
    dims: [ 1 ]
  },
  {
    name: "max_length"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "max_new_tokens"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "do_sample"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_k"
    data_type: TYPE_INT32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    optional: true
  },
  {
    name: "stream"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    optional: true
  }
]

# Triton should expect to respond with a single string
# output of variable length named 'text_output'
output [
  {
    name: "text_output"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]

parameters: [
  {
    key: "base_model_path",
    value: {string_value: "/cheetah/input/model/groupuser/qwen3-4b-instruct-gguf"}
  },
  {
    key: "gguf_filename",
    value: {string_value: "Qwen3-4B-Q4_K_M.gguf"}
  },
  {
    key: "is_adapter_model",
    value: {string_value: "false"}
  },
  {
    key: "adapter_model_path",
    value: {string_value: ""}
  },
  {
    key: "quantization",
    value: {string_value: "none"}
  }
]

instance_group [
  {
    kind: KIND_AUTO
    count: 1
  }
]

4
hash_record_sha256.json Normal file

@@ -0,0 +1,4 @@
{
  "Qwen3-4B-Q4_K_M.gguf": "7485fe6f11af29433bc51cab58009521f205840f5b4ae3a32fa7f92e8534fdf5",
  "time_stamp": "2025-07-05_070138"
}
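
A minimal sketch for verifying the downloaded GGUF against the digest recorded above; it uses only the standard library, and the file paths assume the repository layout shown in this compare view.

```python
# Minimal sketch: verify the GGUF file against the SHA-256 digest in hash_record_sha256.json.
import hashlib
import json

with open("hash_record_sha256.json") as f:
    record = json.load(f)

expected = record["Qwen3-4B-Q4_K_M.gguf"]
h = hashlib.sha256()
with open("Qwen3-4B-Q4_K_M.gguf", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)

print("OK" if h.hexdigest() == expected else "MISMATCH")
```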