Getting Started with vLLM in Kaggle Competitions (Full Code Included!)
Introduction: This article presents a baseline that uses vLLM for model inference in a Kaggle competition. If you are interested, you can join the competition at https://www.kaggle.com/competitions/lmsys-chatbot-arena and learn alongside top Kagglers.
%%time
# Install packages (offline wheels staged as Kaggle input datasets)
!pip uninstall -y torch
!pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm
!pip install -U /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -U /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl
import os
import math
import numpy as np

# Make both T4 GPUs visible to vLLM
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
import vllm
llm = vllm.LLM(
    "/kaggle/input/bagel-v3-343",
    quantization="awq",           # model weights are AWQ-quantized
    tensor_parallel_size=2,       # shard the model across both T4 GPUs
    gpu_memory_utilization=0.95,
    trust_remote_code=True,
    dtype="half",                 # T4 GPUs do not support bfloat16
    enforce_eager=True,           # skip CUDA graph capture to save memory
    max_model_len=1024,           # cap context length to bound the KV cache
    # distributed_executor_backend="ray",
)
tokenizer = llm.get_tokenizer()
import pandas as pd
VALIDATE = 128
test = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv")
# The visible test.csv has only 3 rows; in that case substitute the first
# VALIDATE rows of train.csv so we can time and sanity-check inference.
if len(test) == 3:
    test = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
    test = test.iloc[:VALIDATE]
from typing import Any, Dict, List
from transformers import LogitsProcessor
import torch
choices = ["A", "B", "tie"]
KEEP = []
for x in choices:
    # Keep only the first token id of each choice string
    c = tokenizer.encode(x, add_special_tokens=False)[0]
    KEEP.append(c)
print(f"Force predictions to be tokens {KEEP} which are {choices}.")
class DigitLogitsProcessor(LogitsProcessor):
    """Boost the logits of the allowed answer tokens so the model can only
    answer A, B, or tie."""
    def __init__(self, tokenizer):
        self.allowed_ids = KEEP
    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
        scores[self.allowed_ids] += 100
        return scores
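Why does adding 100 work? After softmax, a +100 logit advantage makes every other token's probability vanish, so the top-5 logprobs we request later are guaranteed to include our three answer tokens. A minimal sketch with a hypothetical 5-token vocabulary:

import torch

# Hypothetical 5-token vocabulary; boost ids 1 and 3 as the processor does
scores = torch.zeros(5)
scores[[1, 3]] += 100
print(torch.softmax(scores, dim=-1))
# tensor([0., 0.5, 0., 0.5, 0.]) -- only the boosted ids keep any mass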
SS = "#" * 25 + "\n"
# NOTE: sys_prompt is used below but was not defined in the original listing;
# the wording here is an example instruction -- adapt it to your own template.
sys_prompt = (
    "Read the user PROMPT and the two responses below, then answer with a "
    "single token: 'A' if RESPONSE A is better, 'B' if RESPONSE B is better, "
    "or 'tie' if they are equally good."
)
all_prompts = []
for index, row in test.iterrows():
    # Each field is a list stored as a string; eval with "null" mapped to ""
    # handles null entries inside the lists.
    a = " ".join(eval(row.prompt, {"null": ""}))
    b = " ".join(eval(row.response_a, {"null": ""}))
    c = " ".join(eval(row.response_b, {"null": ""}))
    prompt = f"{SS}PROMPT: " + a + f"\n\n{SS}RESPONSE A: " + b + f"\n\n{SS}RESPONSE B: " + c + "\n\n"
    formatted_sample = sys_prompt + "\n\n" + prompt
    all_prompts.append(formatted_sample)
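To sanity-check the template before spending GPU time, it helps to preview one formatted prompt:

# Preview the first 500 characters of the first formatted prompt
print(all_prompts[0][:500])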
Here we run test-time inference with the fast vLLM engine. We ask vLLM to return the probabilities of the top 5 candidate tokens for the first predicted token, and we cap generation at 1 token to speed up inference.
From the time needed to infer the 128 training samples, we can extrapolate how long inference on the 25,000 test samples will take.
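As a concrete back-of-the-envelope version of that extrapolation (the timing number here is hypothetical, not measured):

# Illustrative estimate -- plug in the elapsed time you actually measure
validate_minutes = 2.0                      # hypothetical: 128-sample run time
per_sample = validate_minutes / 128
estimated_full = per_sample * 25_000
print(f"Estimated full test set: {estimated_full:.0f} minutes")  # ~391 here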
from time import time
start = time()
logits_processors = [DigitLogitsProcessor(tokenizer)]
responses = llm.generate(
    all_prompts,
    vllm.SamplingParams(
        n=1,                      # number of output sequences per prompt
        top_p=0.9,                # cumulative probability of top tokens to consider
        temperature=0,            # temperature=0 means greedy (deterministic) decoding
        seed=777,                 # seed for reproducibility
        skip_special_tokens=True, # skip special tokens in the output
        max_tokens=1,             # generate only the single answer token
        logits_processors=logits_processors,
        logprobs=5,               # return logprobs of the top 5 tokens
    ),
    use_tqdm=True,
)
end = time()
elapsed = (end-start)/60. #minutes
print(f"Inference of {VALIDATE} samples took {elapsed} minutes!")
results = []
errors = 0
for i, response in enumerate(responses):
    try:
        # Logprobs of the top tokens for the first (and only) generated token
        x = response.outputs[0].logprobs[0]
        logprobs = []
        for k in KEEP:
            if k in x:
                logprobs.append(math.exp(x[k].logprob))
            else:
                logprobs.append(0)
                print(f"bad logits {i}")
        # Normalize the three probabilities so they sum to 1
        logprobs = np.array(logprobs)
        logprobs /= logprobs.sum()
        results.append(logprobs)
    except Exception:
        # print(f"error {i}")
        # Fall back to a uniform prediction if anything goes wrong
        results.append(np.array([1/3., 1/3., 1/3.]))
        errors += 1
print(f"There were {errors} inference errors out of {i+1} inferences")
results = np.vstack(results)
sub = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/sample_submission.csv")
# Only overwrite the sample submission when running on the real hidden test
# set; in validation mode `results` has VALIDATE rows and would not fit `sub`.
if len(test) != VALIDATE:
    sub[["winner_model_a", "winner_model_b", "winner_tie"]] = results
sub.to_csv("submission.csv", index=False)
sub.head()