-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathsgcr_python.sh
More file actions
46 lines (35 loc) · 1.29 KB
/
sgcr_python.sh
File metadata and controls
46 lines (35 loc) · 1.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/bin/sh
# Batch job script: scores one split of the input dataset by running
# src/code/pretrain_dataset/sgcr_python.py with Llama-3.3-70B-Instruct.
# The split index is taken from the first script argument.
#
# Lines starting with "#$" are scheduler directives (resource requests, log
# locations, priority); they are not ordinary comments, so edit them with care.
#$ -cwd
#$ -l node_f=1
# NOTE(review): h_rt below has four colon-separated fields; confirm the
# scheduler accepts this form for the intended 24-hour limit.
#$ -l h_rt=0:24:00:00
# stdout (-o) and stderr (-e) are both written to the same per-job log file.
#$ -o outputs/llm-filter/llama-3.3-70b-instruct/$JOB_ID.log
#$ -e outputs/llm-filter/llama-3.3-70b-instruct/$JOB_ID.log
#$ -p -3
# priority: -5: normal, -4: high, -3: highest
# Usage (argument passed to the job script): sgcr_python.sh <split-index>
#   <split-index>  non-negative decimal integer; selects the input file
#                  $INPUT_DIR/split_<index zero-padded to 4 digits>.jsonl
# Exit status: 2 on a bad/missing argument, 1 on a missing prerequisite,
# otherwise the exit status of the python process.

# Load modules
module use /gs/fs/tga-NII-LLM/modules/modulefiles
module load ylab/cuda/12.4
module load ylab/cudnn/9.1.0
module load ylab/nccl/cuda-12.4/2.21.5
module load ylab/hpcx/2.17.1
module load ninja/1.11.1

# Activate the project virtualenv (path is relative to the working directory).
# "." is used rather than "source" because the shebang is /bin/sh and "source"
# is not guaranteed to exist there.
if [ ! -f .env/bin/activate ]; then
  echo "error: .env/bin/activate not found under $(pwd)" >&2
  exit 1
fi
. .env/bin/activate

INPUT_DIR="/gs/bs/tga-NII-LLM/datasets/raw/pretrain/swallow-code-v0.1-1-split-jsonl"
OUTPUT_DIR="/gs/bs/tga-NII-LLM/datasets/raw/pretrain/swallow-code-v0.3.1-jsonl"

# Validate the split index up front: without this, a missing or non-numeric
# argument makes printf emit "0000" and the job silently processes split 0000.
INDEX=${1-}
case "$INDEX" in
  '' | *[!0-9]*)
    echo "usage: $0 <split-index>  (non-negative integer, got '$INDEX')" >&2
    exit 2
    ;;
esac
# Drop leading zeros so printf does not parse the value as octal
# (e.g. "0010" would otherwise become 8, and "0008" would be rejected).
INDEX=${INDEX#"${INDEX%%[!0]*}"}
INDEX=${INDEX:-0}
FORMATTED_INDEX=$(printf '%04d' "$INDEX")

INPUT_PATH="$INPUT_DIR/split_$FORMATTED_INDEX.jsonl"
OUTPUT_PATH="$OUTPUT_DIR/python_scoring_Llama-3.3-70B-split_$FORMATTED_INDEX.jsonl"

# Fail fast on a missing split instead of after the model has been loaded.
if [ ! -f "$INPUT_PATH" ]; then
  echo "error: input split not found: $INPUT_PATH" >&2
  exit 1
fi

mkdir -p "$OUTPUT_DIR" || {
  echo "error: cannot create output directory: $OUTPUT_DIR" >&2
  exit 1
}

BATCH_SIZE=2048
echo "batch size: $BATCH_SIZE"

# Redirect temporary files to the shared cache directory.
export TMPDIR="/gs/bs/tge-gc24sp03/cache"
export TMP="/gs/bs/tge-gc24sp03/cache"

export VLLM_USE_V1=1
# Worker start method chosen per the linked vLLM issue:
# https://github.com/vllm-project/vllm/issues/6152#issuecomment-2211709345
export VLLM_WORKER_MULTIPROC_METHOD=spawn

# --resume is passed so a re-submitted job can reuse an existing output file
# (exact semantics are defined by sgcr_python.py).
python src/code/pretrain_dataset/sgcr_python.py \
  --model-path "/gs/bs/tga-NII-LLM/hf-checkpoints/Llama-3.3-70B-Instruct" \
  --jsonl-path "$INPUT_PATH" \
  --output-path "$OUTPUT_PATH" \
  --tensor-parallel 4 \
  --resume \
  --batch-size "$BATCH_SIZE"