启动服务
- 拉取官方镜像
# Pull the official Triton image that bundles the TensorRT-LLM backend (release 23.10).
docker pull nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3
- 运行容器,和之前教程一样,做目录映射,为qwen部署做好准备
# Start the Triton container detached, mounting the backend repo and the
# Qwen project for the deployment steps below.
# - host networking + 2g shm + unlocked memlock/stack are the usual Triton
#   container settings for GPU inference.
# - 'sleep 864000' (~10 days) just keeps the container alive so we can exec
#   into it; the server itself is launched later.
# Quote "${PWD}" so the mounts survive a working directory containing spaces.
docker run -d \
  --name triton2 \
  --net host \
  --shm-size=2g \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  --gpus all \
  -v "${PWD}/tensorrtllm_backend:/tensorrtllm_backend" \
  -v "${PWD}/Qwen-7B-Chat-TensorRT-LLM/qwen:/root/qwen" \
  nvcr.io/nvidia/tritonserver:23.10-trtllm-python-py3 sleep 864000
- 进入容器,安装git-lfs
# Refresh package lists, then install git-lfs.
# -y answers the confirmation prompt, so the step also works non-interactively
# (e.g. via 'docker exec' in a script).
apt update
apt install -y git-lfs
- 安装TensorRT-LLM python版,方便待会编译Engine
# Install the TensorRT-LLM Python package from GitHub (needed to build the engine below).
pip install git+https://github.com/NVIDIA/TensorRT-LLM.git
- 复制lib库过去,否则无法运行
# The pip-installed tensorrt_llm package lacks the backend shared libraries
# (per the note above, it cannot run without them); copy them from the
# Triton backend directory.
# -p makes the step idempotent: it succeeds even if the directory already exists.
mkdir -p /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/
cp /opt/tritonserver/backends/tensorrtllm/* /usr/local/lib/python3.10/dist-packages/tensorrt_llm/libs/
- 重复之前的操作,安装qwen的依赖,编译Engine,推荐开启inflight-batching+smooth int8,参考命令
# Build the engine with SmoothQuant INT8 + inflight batching.
# Guard the cd: if it fails, the pip install / build would otherwise run
# from whatever directory we happened to be in.
cd /root/qwen || exit 1
pip install -r requirements.txt
# Convert the HF checkpoint, collecting SmoothQuant scales (alpha = 0.5).
python3 hf_qwen_convert.py --smoothquant=0.5
# --per_token / --per_channel: finer-grained INT8 scaling.
# Inflight batching is enabled together with paged KV cache and
# input-padding removal, per the recommendation above.
python3 build.py \
  --use_smooth_quant \
  --per_token \
  --per_channel \
  --use_inflight_batching \
  --paged_kv_cache \
  --remove_input_padding
- 复制Engine文件
# Copy the built engine into the Triton model repository (version dir "1").
cd /root/qwen/trt_engines/fp16/1-gpu/ || exit 1
# -p: don't fail if the version directory already exists (idempotent re-runs).
mkdir -p /tensorrtllm_backend/triton_model_repo/tensorrt_llm/1/
cp -r ./* /tensorrtllm_backend/triton_model_repo/tensorrt_llm/1/
- 复制tokenizer文件
# Copy the Qwen tokenizer files into the model repository.
# Guard the cd so a failure cannot make cp copy from the wrong directory.
cd /root/qwen/ || exit 1
cp -r qwen_7b_chat /tensorrtllm_backend/triton_model_repo/tensorrt_llm/
- 启动服务
# Launch the Triton server (single GPU) against the prepared model repository.
cd /tensorrtllm_backend || exit 1
python3 scripts/launch_triton_server.py --world_size=1 --model_repo=/tensorrtllm_backend/triton_model_repo
调用服务
python客户端请求
- 安装python依赖
# Client-side Python dependencies: the Triton client (HTTP via gevent,
# gRPC via grpcio) plus tokenizer libraries (transformers, tiktoken).
pip install tritonclient transformers gevent geventhttpclient tiktoken grpcio
- 运行
qwen/triton_client/inflight_batcher_llm_client.py
文件即可发起客户端请求
# Run the inflight-batching demo client.
# NOTE(review): run it from its own directory — the script may resolve
# files relative to the cwd; confirm before launching from elsewhere.
cd /root/qwen/triton_client
python3 inflight_batcher_llm_client.py
- 测试结果
====================
Human: 你好
Output: 你好!有什么我可以帮助你的吗?
Human: 你叫什么?
Output: 我是来自阿里云的大规模语言模型,我叫通义千问。
http请求
# Query the "ensemble" model through Triton's HTTP generate endpoint.
# The prompt uses Qwen's ChatML template (<|im_start|>/<|im_end|> markers).
# end_id/pad_id 151643 — presumably Qwen's end-of-text token id; verify
# against the tokenizer config before changing the model.
curl -X POST localhost:8000/v2/models/ensemble/generate \
-d '{"text_input": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n你好,请问你叫什么?<|im_end|>\n<|im_start|>assistant\n", "max_tokens": 50, "bad_words": "", "stop_words": "", "end_id": [151643], "pad_id": [151643]}'
{
"model_name": "ensemble",
"model_version": "1",
"sequence_end": false,
"sequence_id": 0,
"sequence_start": false,
"text_output": "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n你好,请问你叫什么?<|im_end|>\n<|im_start|>assistant\n你好,我是通义千问,由阿里云开发的AI助手。<|im_end|>\n\n"
}
关闭服务
# Stop Triton: send SIGTERM first so the server can shut down cleanly.
# (pkill also avoids the pgrep|xargs pipeline, which errors when no
# process matches.)
pkill tritonserver
# Only if processes survive after a grace period, force-kill them:
# pkill -9 tritonserver