|
# Extract model name and set output directory.
# Quote MODEL_PATH so paths containing spaces work; `--` protects
# against values that begin with a dash.
MODEL_NAME=$(basename -- "${MODEL_PATH}")
OUTPUT_DIR="${MODEL_NAME}-tp${TP_SIZE}-eval"

# Create output directory (holds eval results plus server/eval logs).
mkdir -p -- "${OUTPUT_DIR}"
| 83 | + |
| 84 | + |
# Serving/eval sizing defaults. SERVER_PORT can be overridden from the
# caller's environment; 8000 is the vLLM default.
SERVER_PORT=${SERVER_PORT:-8000}
max_length=8192     # total model context window (tokens)
max_gen_toks=2048   # tokens reserved for generation

# LongBench tasks need a much larger context window.
if [[ "$TASK_NAME" == *"longbench"* ]]; then
    max_length=131072 # 128k
    max_gen_toks=2048
fi

# Prompt budget = context window minus the generation budget.
max_ctx_length=$((max_length - max_gen_toks))

echo "max_length: ${max_length}"
echo "max_gen_toks: ${max_gen_toks}"
echo "max_ctx_length: ${max_ctx_length}"
| 101 | + |
84 | 102 | #FIXME: (yiliu30) remove these envs once we have fixed the pynccl issues |
85 | 103 | export NCCL_NVLS_ENABLE=0 |
86 | 104 | # export VLLM_DISABLE_PYNCCL=1 |
@@ -141,19 +159,109 @@ echo "Tensor parallelism size: ${TP_SIZE}" |
141 | 159 | echo "Batch size: ${BATCH_SIZE}" |
142 | 160 | echo "Output directory: ${OUTPUT_DIR}" |
143 | 161 |
|
144 | | -VLLM_WORKER_MULTIPROC_METHOD=spawn \ |
145 | | -VLLM_ENABLE_AR_EXT=$VLLM_ENABLE_AR_EXT \ |
146 | | -VLLM_AR_MXFP4_MODULAR_MOE=$VLLM_AR_MXFP4_MODULAR_MOE \ |
147 | | -VLLM_MXFP4_PRE_UNPACK_TO_FP8=$VLLM_MXFP4_PRE_UNPACK_TO_FP8 \ |
148 | | -VLLM_MXFP4_PRE_UNPACK_WEIGHTS=$VLLM_MXFP4_PRE_UNPACK_WEIGHTS \ |
149 | | -VLLM_ENABLE_STATIC_MOE=$VLLM_ENABLE_STATIC_MOE \ |
150 | | -VLLM_USE_DEEP_GEMM=$VLLM_USE_DEEP_GEMM \ |
151 | | -VLLM_ENABLE_V1_MULTIPROCESSING=1 \ |
152 | | -lm_eval --model vllm \ |
153 | | - --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \ |
154 | | - --tasks $TASK_NAME \ |
155 | | - --batch_size $BATCH_SIZE \ |
156 | | - --log_samples \ |
157 | | - --seed 42 \ |
158 | | - --output_path ${OUTPUT_DIR} \ |
159 | | - --show_config 2>&1 | tee ${OUTPUT_DIR}/log.txt |
| 162 | + |
| 163 | + |
# Export vLLM environment variables in one statement so they reach both
# the in-process lm_eval workers and the standalone `vllm serve` child.
# V1 multiprocessing is disabled here (server mode manages its own workers).
export \
    VLLM_WORKER_MULTIPROC_METHOD=spawn \
    VLLM_ENABLE_AR_EXT="${VLLM_ENABLE_AR_EXT}" \
    VLLM_AR_MXFP4_MODULAR_MOE="${VLLM_AR_MXFP4_MODULAR_MOE}" \
    VLLM_MXFP4_PRE_UNPACK_TO_FP8="${VLLM_MXFP4_PRE_UNPACK_TO_FP8}" \
    VLLM_MXFP4_PRE_UNPACK_WEIGHTS="${VLLM_MXFP4_PRE_UNPACK_WEIGHTS}" \
    VLLM_ENABLE_STATIC_MOE="${VLLM_ENABLE_STATIC_MOE}" \
    VLLM_USE_DEEP_GEMM="${VLLM_USE_DEEP_GEMM}" \
    VLLM_ENABLE_V1_MULTIPROCESSING=0
| 173 | + |
| 174 | + |
| 175 | + |
| 176 | +# Function to run standard lm-eval tasks |
# Run standard lm-eval tasks with the in-process vLLM backend.
# Reads: MODEL_PATH, TP_SIZE, KV_CACHE_DTYPE, TASK_NAME, BATCH_SIZE, OUTPUT_DIR.
# Output is teed to ${OUTPUT_DIR}/log.txt; returns lm_eval's exit status
# (PIPESTATUS[0]) so failures are not masked by tee — the file never
# sets `pipefail`, so the pipeline alone would report tee's status.
run_standard_eval() {
    lm_eval --model vllm \
        --model_args "pretrained=${MODEL_PATH},tensor_parallel_size=${TP_SIZE},max_model_len=8192,max_num_batched_tokens=32768,max_num_seqs=128,add_bos_token=True,gpu_memory_utilization=0.8,dtype=bfloat16,max_gen_toks=2048,enable_prefix_caching=False,kv_cache_dtype=${KV_CACHE_DTYPE}" \
        --tasks "${TASK_NAME}" \
        --batch_size "${BATCH_SIZE}" \
        --log_samples \
        --seed 42 \
        --output_path "${OUTPUT_DIR}" \
        --show_config 2>&1 | tee "${OUTPUT_DIR}/log.txt"
    return "${PIPESTATUS[0]}"
}
| 187 | + |
# Start a vLLM OpenAI-compatible server in the background.
# Reads: MODEL_PATH, SERVER_PORT, TP_SIZE, max_length, KV_CACHE_DTYPE, OUTPUT_DIR.
# Writes: VLLM_PID (background process id, used by wait/cleanup).
# Server output goes to ${OUTPUT_DIR}/vllm_server.log.
start_vllm_server() {
    echo "Starting vLLM server on port ${SERVER_PORT}..."
    vllm serve "${MODEL_PATH}" \
        --port "${SERVER_PORT}" \
        --tensor-parallel-size "${TP_SIZE}" \
        --max-model-len "${max_length}" \
        --gpu-memory-utilization 0.8 \
        --dtype bfloat16 \
        --kv-cache-dtype "${KV_CACHE_DTYPE}" \
        --disable-log-requests \
        > "${OUTPUT_DIR}/vllm_server.log" 2>&1 &

    VLLM_PID=$!
    echo "vLLM server started with PID: ${VLLM_PID}"
}
| 204 | + |
# Poll the server's /health endpoint until it answers or we give up.
# 300 retries x 5s sleep = up to ~25 minutes (long model loads with TP).
# Returns 0 when the server is ready, 1 on timeout.
wait_for_server() {
    local max_retries=300
    local retry_count=0

    echo "Waiting for vLLM server to be ready..."
    while (( retry_count < max_retries )); do
        # --max-time bounds each probe so a wedged server can't hang curl forever.
        if curl -s --max-time 5 "http://localhost:${SERVER_PORT}/health" > /dev/null 2>&1; then
            echo "vLLM server is ready!"
            return 0
        fi
        retry_count=$((retry_count + 1))
        echo "Waiting for server... (${retry_count}/${max_retries})"
        sleep 5
    done

    # Diagnostics go to stderr so they survive stdout redirection.
    echo "Error: vLLM server failed to start within expected time" >&2
    echo "Check ${OUTPUT_DIR}/vllm_server.log for details" >&2
    return 1
}
| 225 | + |
# Stop the background vLLM server; safe to call even if the server was
# never started (VLLM_PID unset) or has already exited. Intended for use
# as an EXIT/INT/TERM trap handler, so it must never fail.
cleanup_server() {
    echo "Shutting down vLLM server..."
    if [[ -n "${VLLM_PID:-}" ]]; then
        kill "${VLLM_PID}" 2>/dev/null || true
        wait "${VLLM_PID}" 2>/dev/null || true
    fi
    echo "Server stopped"
}
| 233 | + |
| 234 | + |
| 235 | + |
# Run LongBench evaluation against a locally started vLLM server.
# Starts the server, waits for /health, runs the long_bench_eval CLI,
# and propagates the CLI's exit status (previously any failure was
# silently followed by the "completed" message). Eval output is teed to
# ${OUTPUT_DIR}/longbench_eval.log to mirror the server-side logging.
run_longbench_eval() {
    start_vllm_server

    if ! wait_for_server; then
        kill "${VLLM_PID}" 2>/dev/null || true
        exit 1
    fi

    # Ensure the server is torn down on any exit path from here on.
    trap cleanup_server EXIT INT TERM

    echo "Running LongBench evaluation against vLLM server..."
    python -m long_bench_eval.cli \
        --api-key dummy \
        --base-url "http://localhost:${SERVER_PORT}/v1" \
        --model "${MODEL_PATH}" \
        --max-context-length "${max_ctx_length}" \
        --num-threads 512 \
        2>&1 | tee "${OUTPUT_DIR}/longbench_eval.log"
    # PIPESTATUS[0] is the CLI's status; tee's status would mask it otherwise.
    local eval_status=${PIPESTATUS[0]}

    if (( eval_status != 0 )); then
        echo "Error: LongBench evaluation failed (exit ${eval_status})" >&2
        exit "${eval_status}"
    fi

    echo "Evaluation completed! Results saved to ${OUTPUT_DIR}"
}
| 259 | + |
# Main evaluation logic: route to the LongBench server-based flow when
# the task name contains "longbench", otherwise run standard lm-eval.
case "$TASK_NAME" in
    *longbench*)
        echo "Running LongBench v2 evaluation..."
        run_longbench_eval
        ;;
    *)
        echo "Running standard lm-eval tasks..."
        run_standard_eval
        ;;
esac
0 commit comments