Skip to content

Commit cb1b4a3

Browse files
authored
Add LongBench v2 support for DeepSeek (#2410)
* Add LongBench v2 support for DeepSeek - Add long-bench-eval dependency to requirements.txt - Refactor run_evaluation.sh to support both standard and LongBench v2 tasks - Add dynamic max_length configuration (40960 for longbench tasks) - Implement vLLM server-based evaluation for LongBench - Add helper functions for server lifecycle management - Support up to 40K context length evaluation with 512 threads Signed-off-by: yiliu30 <yi4.liu@intel.com> * update max len Signed-off-by: yiliu30 <yi4.liu@intel.com> --------- Signed-off-by: yiliu30 <yi4.liu@intel.com>
1 parent dd901fa commit cb1b4a3

File tree

2 files changed

+128
-18
lines changed

2 files changed

+128
-18
lines changed

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/requirements.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,6 @@ lm-eval==0.4.10
22
loguru
33
compressed-tensors==0.12.2
44
hf_transfer
5-
transformers==4.57.3
5+
transformers==4.57.3
6+
# pip install git+https://github.com/yiliu30/long-bench-eval
7+
long-bench-eval @ git+https://github.com/yiliu30/long-bench-eval

examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/deepseek/run_evaluation.sh

Lines changed: 125 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,27 @@ fi
7878
# Extract model name and set output directory.
# Quote expansions and use `--` so model paths containing spaces or a
# leading dash cannot break word-splitting or be parsed as options (SC2086).
MODEL_NAME=$(basename -- "${MODEL_PATH}")
OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"

# Create output directory
mkdir -p -- "${OUTPUT_DIR}"
83+
84+
85+
# Server and sequence-length configuration.
SERVER_PORT=8000
max_length=8192
max_gen_toks=2048

# LongBench tasks need a much larger context window than the default.
case "$TASK_NAME" in
    *longbench*)
        max_length=131072 # 128k
        max_gen_toks=2048
        ;;
esac

# Prompt budget left over after reserving room for generated tokens.
max_ctx_length=$((max_length - max_gen_toks))

printf 'max_length: %s\n' "${max_length}"
printf 'max_gen_toks: %s\n' "${max_gen_toks}"
printf 'max_ctx_length: %s\n' "${max_ctx_length}"
100+
101+
84102
#FIXME: (yiliu30) remove these envs once we have fixed the pynccl issues
85103
export NCCL_NVLS_ENABLE=0
86104
# export VLLM_DISABLE_PYNCCL=1
@@ -141,19 +159,109 @@ echo "Tensor parallelism size: ${TP_SIZE}"
141159
echo "Batch size: ${BATCH_SIZE}"
142160
echo "Output directory: ${OUTPUT_DIR}"
143161

144-
VLLM_WORKER_MULTIPROC_METHOD=spawn \
145-
VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \
146-
VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \
147-
VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \
148-
VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \
149-
VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \
150-
VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \
151-
VLLM_ENABLE_V1_MULTIPROCESSING=1 \
152-
lm_eval --model vllm \
153-
--model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
154-
--tasks $TASK_NAME \
155-
--batch_size $BATCH_SIZE \
156-
--log_samples \
157-
--seed 42 \
158-
--output_path ${OUTPUT_DIR} \
159-
--show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt
162+
163+
164+
# Export vLLM environment variables so they reach both the offline engine
# and any background server process spawned later.
export VLLM_WORKER_MULTIPROC_METHOD=spawn
# NOTE(review): V1 multiprocessing is disabled here, while the previous
# inline lm_eval invocation used 1 -- confirm this is intentional.
export VLLM_ENABLE_V1_MULTIPROCESSING=0

# Re-export the user-tunable knobs under their own names (empty if unset,
# matching the original `export VAR=$VAR` form).
for knob in \
    VLLM_ENABLE_AR_EXT \
    VLLM_AR_MXFP4_MODULAR_MOE \
    VLLM_MXFP4_PRE_UNPACK_TO_FP8 \
    VLLM_MXFP4_PRE_UNPACK_WEIGHTS \
    VLLM_ENABLE_STATIC_MOE \
    VLLM_USE_DEEP_GEMM; do
    export "${knob}=${!knob}"
done
173+
174+
175+
176+
# Function to run standard lm-eval tasks
177+
#######################################
# Run standard lm-eval tasks against the model via the vLLM offline engine.
# Globals (read): MODEL_PATH, TP_SIZE, max_length, max_gen_toks,
#                 KV_CACHE_DTYPE, TASK_NAME, BATCH_SIZE, OUTPUT_DIR
# Outputs: evaluation results under ${OUTPUT_DIR}; full log in log.txt.
#######################################
run_standard_eval() {
    # Use the shared ${max_length}/${max_gen_toks} knobs instead of the
    # previously hard-coded 8192/2048 (identical values on this path, since
    # longbench tasks never reach here) so the limits stay in one place.
    lm_eval --model vllm \
        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=${max_length},max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=${max_gen_toks},enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
        --tasks "$TASK_NAME" \
        --batch_size "$BATCH_SIZE" \
        --log_samples \
        --seed 42 \
        --output_path "${OUTPUT_DIR}" \
        --show_config 2>&1 | tee "${OUTPUT_DIR}/log.txt"
}
187+
188+
# Function to start vLLM server
189+
#######################################
# Start a vLLM OpenAI-compatible server in the background.
# Globals (read): MODEL_PATH, SERVER_PORT, TP_SIZE, max_length,
#                 KV_CACHE_DTYPE, OUTPUT_DIR
# Globals (written): VLLM_PID - PID of the background server, used by
#                    wait_for_server / cleanup_server.
# Outputs: server stdout/stderr captured in ${OUTPUT_DIR}/vllm_server.log.
#######################################
start_vllm_server() {
    echo "Starting vLLM server on port ${SERVER_PORT}..."

    # Build the argument list as an array and quote every expansion so
    # paths/values with spaces cannot word-split (SC2086).
    local serve_args=(
        --port "${SERVER_PORT}"
        --tensor-parallel-size "${TP_SIZE}"
        --max-model-len "${max_length}"
        --gpu-memory-utilization 0.8
        --dtype bfloat16
        --kv-cache-dtype "${KV_CACHE_DTYPE}"
        # NOTE(review): --disable-log-requests is deprecated in recent vLLM
        # releases -- confirm against the pinned vLLM version.
        --disable-log-requests
    )
    vllm serve "${MODEL_PATH}" "${serve_args[@]}" \
        > "${OUTPUT_DIR}/vllm_server.log" 2>&1 &

    VLLM_PID=$!
    echo "vLLM server started with PID: ${VLLM_PID}"
}
204+
205+
# Function to wait for vLLM server to be ready
206+
#######################################
# Poll the vLLM server /health endpoint until it responds or we give up.
# Globals (read): SERVER_PORT, OUTPUT_DIR
# Returns: 0 once the server is healthy; 1 after 300 polls at 5 s intervals
#          (~25 minutes) without a successful response.
#######################################
wait_for_server() {
    local -r attempts=300
    local i

    echo "Waiting for vLLM server to be ready..."
    for ((i = 1; i <= attempts; i++)); do
        if curl -s "http://localhost:${SERVER_PORT}/health" > /dev/null 2>&1; then
            echo "vLLM server is ready!"
            return 0
        fi
        echo "Waiting for server... (${i}/${attempts})"
        sleep 5
    done

    echo "Error: vLLM server failed to start within expected time"
    echo "Check ${OUTPUT_DIR}/vllm_server.log for details"
    return 1
}
225+
226+
# Function to cleanup vLLM server on exit
227+
#######################################
# Stop the background vLLM server and reap it.
# Globals (read): VLLM_PID
# Notes: failures (e.g. the server already exited) are deliberately ignored
#        so the EXIT trap can never abort the script.
#######################################
cleanup_server() {
    printf '%s\n' "Shutting down vLLM server..."
    kill "${VLLM_PID}" 2>/dev/null || true
    wait "${VLLM_PID}" 2>/dev/null || true
    printf '%s\n' "Server stopped"
}
233+
234+
235+
236+
# Function to run longbench evaluation via API
237+
#######################################
# Run the LongBench v2 evaluation through the OpenAI-compatible API of a
# locally spawned vLLM server.
# Globals (read): SERVER_PORT, MODEL_PATH, max_ctx_length, OUTPUT_DIR
# Globals (written): VLLM_PID (via start_vllm_server)
# Returns: exits 1 if the server never becomes healthy or the eval fails;
#          the EXIT trap guarantees the server is torn down on every path.
#######################################
run_longbench_eval() {
    start_vllm_server

    # Install the cleanup trap immediately after launching the server so it
    # is torn down on every exit path (previously the failed-health-check
    # path had to kill the server by hand, and later failures leaked it).
    trap cleanup_server EXIT INT TERM

    if ! wait_for_server; then
        exit 1
    fi

    # Run LongBench evaluation and propagate its exit status instead of
    # unconditionally reporting success.
    echo "Running LongBench evaluation against vLLM server..."
    if ! python -m long_bench_eval.cli \
        --api-key dummy \
        --base-url "http://localhost:${SERVER_PORT}/v1" \
        --model "${MODEL_PATH}" \
        --max-context-length "${max_ctx_length}" \
        --num-threads 512; then
        echo "Error: LongBench evaluation failed" >&2
        exit 1
    fi

    echo "Evaluation completed! Results saved to ${OUTPUT_DIR}"
}
259+
260+
# Main evaluation logic
261+
# Main evaluation logic: LongBench tasks go through the vLLM-server path,
# every other task name through the standard lm-eval harness.
case "$TASK_NAME" in
    *longbench*)
        echo "Running LongBench v2 evaluation..."
        run_longbench_eval
        ;;
    *)
        echo "Running standard lm-eval tasks..."
        run_standard_eval
        ;;
esac

0 commit comments

Comments
 (0)