Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions requirements_jureca.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Python dependencies for InSituML training on JURECA.
# FrEIA is pinned to an exact commit for reproducibility.
FrEIA @ git+https://github.com/VLL-HD/FrEIA.git@1779d1fba1e21000fda1927b59eeac0a6fcaa284
geomloss>=0.2.6
# Fork of openPMD-api; 'pic_env' branch — presumably carries PIConGPU
# streaming support -- TODO confirm why the upstream release is not usable.
openPMD-api @ git+https://github.com/franzpoeschel/openPMD-api.git@pic_env
torch
wandb
sympy
scipy
numpy
matplotlib
nflows
26 changes: 26 additions & 0 deletions scripts/job_jureca.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Single-GPU, single-task training job for JURECA.
# Fix: '#SBTACH -c 4' was misspelled, so SLURM silently ignored the
# 4-CPUs-per-task request.
#SBATCH -c 4
#SBATCH -p dc-gpu
#SBATCH --gres=gpu:1
#SBATCH -n 1
#SBATCH --ntasks=1
#SBATCH -A training2406
#SBATCH -t 2:0:0

# Location of the cINN training code and of the scratch project root.
CINN=/p/home/jusers/kelling1/jureca/git/InSituML/main/ModelHelpers/cINN
ROOT=/p/scratch/training2406/team_hechtlab_kelling/

# Load the module environment and the Python virtualenv.
. "$ROOT/env/profile"
. "$ROOT/env/insituml/bin/activate"

# torch.distributed rendezvous setup for a single task: derive a (likely
# free) port from the last 4 digits of the job id, prefixed with 1.
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_PORT="1$( echo -n "$SLURM_JOBID" | tail -c 4 )"
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr

echo HOSTNAME "$HOSTNAME"
nvidia-smi

export BATCH_SIZE=4
echo BATCH_SIZE "$BATCH_SIZE"
srun python "$CINN/ac_jr_fp_ks_openpmd-streaming-continual-learning.py" --io_config "$CINN/io_config_jureca.py" --model_config "$CINN/model_config.py" --runner srun
57 changes: 57 additions & 0 deletions scripts/job_jureca_scan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Hyper-parameter scan job for JURECA: each array task launches
# $SLURM_NTASKS independent trainings, one per GPU, each consuming one
# parameter line from $PARAM_LIST_FILE (columns: LAM_PREDICT LAM_LATENT
# LAM_REV LAM_KL LAM_RECON LR_REST LR_AE).
#SBATCH -p dc-gpu
#SBATCH -c 32
#SBATCH --gres=gpu:4
#SBATCH -n 1
#SBATCH --ntasks=4
#SBATCH -A training2406
#SBATCH -t 1:0:0

export PARAM_LIST_FILE="params.dat"
export OBJ_LIST_FILE="objective.dat"

# Number of concurrent trainings per array task (one per GPU).
SLURM_NTASKS=4

export CINN=/p/home/jusers/kelling1/jureca/git/InSituML/main/ModelHelpers/cINN
export ROOT=/p/scratch/training2406/team_hechtlab_kelling/

# Load the module environment and the Python virtualenv.
. "$ROOT/env/profile"
. "$ROOT/env/insituml/bin/activate"

echo HOSTNAME "$HOSTNAME"
nvidia-smi

# Index of the first parameter line consumed by this array task.
JOB_OFFSET=$(( SLURM_ARRAY_TASK_ID * SLURM_NTASKS ))

for p in $( seq 1 "$SLURM_NTASKS" ); do

	export TASK_ID=$(( p - 1 ))
	export CUDA_VISIBLE_DEVICES=$TASK_ID
	export WORLD_SIZE=1
	# Unique rendezvous port per task: "1" + last 3 job-id digits + task index.
	export MASTER_PORT="1$( echo -n "$SLURM_JOBID" | tail -c 3 )$TASK_ID"
	master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
	export MASTER_ADDR=$master_addr

	echo "$CUDA_VISIBLE_DEVICES" "$MASTER_ADDR" "$MASTER_PORT"

	if [ -f "$PARAM_LIST_FILE" ]; then
		LINE_NUM=$(( JOB_OFFSET + p ))
		# Stop once the parameter list has no line for this task.
		# Fixes two bugs: 'wc -l FILE' printed "count filename", which broke
		# the numeric test, and the comparison direction was inverted.
		if [ "$( wc -l < "$PARAM_LIST_FILE" )" -lt "$LINE_NUM" ]; then
			break
		fi
		# Load the LINE_NUM-th parameter line into $1..$7 ('--' guards
		# against values starting with '-').
		set -- $( head -n "$LINE_NUM" "$PARAM_LIST_FILE" | tail -n 1 )

		export LAM_PREDICT=$1
		export LAM_LATENT=$2
		export LAM_REV=$3
		export LAM_KL=$4
		export LAM_RECON=$5
		export LR_REST=$6
		export LR_AE=$7
	fi

	srun -n 1 python "$CINN/ac_jr_fp_ks_openpmd-streaming-continual-learning.py" --io_config "$CINN/io_config_jureca.py" --model_config "$CINN/model_config.py" --runner srun >& "slurm-$SLURM_JOBID-$SLURM_ARRAY_TASK_ID-${TASK_ID}.out" &

done

# Barrier: wait for all background trainings of this array task.
wait
32 changes: 32 additions & 0 deletions scripts/opt_jureca.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Post-processing / objective-collection job for the hyper-parameter scan.
#SBATCH -p dc-gpu
#SBATCH --gres=gpu:1
#SBATCH -n 1
#SBATCH --ntasks=1
#SBATCH -A training2406
#SBATCH -t 0:30:0

# Same file names as in job_jureca_scan.sh: params.dat holds the scanned
# parameter sets, objective.dat records the job id whose losses to collect.
PARAM_LIST_FILE="params.dat"
OBJ_LIST_FILE="objective.dat"

avg()
{
    # Average the n-th whitespace-separated field of stdin, skipping
    # non-numeric entries.
    # Arguments: $1 - field number (default: 1)
    # Outputs:   "mean sum count population-stddev" on one line; nothing
    #            when no numeric value was seen (fixes the previous fatal
    #            awk division-by-zero on empty/non-numeric input).
    local n
    if [ -z "$1" ]; then
        n=1
    else
        n=$1
    fi
    # Shell interpolates only the field number ($n); awk's own 'n' counts rows.
    awk "function isnum(x){return(x==x+0)} { if(isnum(\$$n)) { sum+=\$$n; sumsq+=\$$n*\$$n ; n+=1;} } END { if (n > 0) print sum/n, sum, n, sqrt(sumsq/n - sum*sum/n/n) }"
}


# Collect loss data from the run whose job id is recorded in $OBJ_LIST_FILE.
# Fixes: the existence test hardcoded "objective.dat" instead of using
# $OBJ_LIST_FILE, the first loop parsed 'ls' output, and 'cut -f' was
# missing its mandatory field list (so cut always failed with a usage error).
if [ -f "$OBJ_LIST_FILE" ]; then
    OBJ_JOBID=$( cat "$OBJ_LIST_FILE" )
    # List the output directories/files belonging to that job.
    for i in slurm-"$OBJ_JOBID"*; do
        echo "$i"
    done

    for i in slurm-"$OBJ_JOBID"*/loss_0.dat ; do
        # TODO(review): the intended field list for 'cut -f' is unknown;
        # passing all fields ('1-') until the target column is confirmed.
        tail -n 20 "$i" | cut -f 1-
    done
fi
52 changes: 52 additions & 0 deletions share/configs/io_config_jureca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from os import environ

# Directory pattern (filled with a run identifier) for saved models.
modelPathPattern = 'trained_models/{}'

#######################################
## openPMD data loader configuration ##
#######################################
ps_dims = 6 # Actually used in the model configuration by now
# ToDo: Use in StreamingLoader

number_of_particles = 4000


streamLoader_config = dict(
    t0 = 900,
    t1 = 901, # endpoint=false, t1 is not used in training
    # t0 = 1800,
    # t1 = 1810, # endpoint=false, t1 is not used in training
    streaming_config = None,
    pathpattern1 = "/p/scratch/training2406/team_hechtlab_kelling/04-01_1013/simOutput/openPMD/simData_%T.bp5", # files on hemera
    pathpattern2 = "/p/scratch/training2406/team_hechtlab_kelling/04-01_1013/simOutput/radiationOpenPMD/e_radAmplitudes_%T.bp5", # files on hemera
    amplitude_direction=2, # choose single direction along which the radiation signal is observed, max: N_observer-1, where N_observer is defined in PIConGPU's radiation plugin
    phase_space_variables = ["momentum", "force"], # allowed are "position", "momentum", and "force". If "force" is set, "momentum" needs to be set too.
    number_particles_per_gpu = 30000,
    verbose=False,
    ## offline training params
    num_epochs = 2
)

# Maximum number of openPMD steps buffered between the loader and trainer.
openPMD_queue_size=8

# Batch size is taken from the environment (exported by the job scripts);
# defaults to 4 when unset. environ.get keeps the original conditional's
# behavior in one idiomatic call.
batch_size = int(environ.get("BATCH_SIZE", 4))

trainBatchBuffer_config = dict(
    training_bs=batch_size,
    continual_bs=batch_size-1, # 7 is the max we can fit on P100 with our stupid chamfer's impl
    stall_loader=True,
    consume_size=1,
    min_tb_from_unchanged_now_bf = 4,
    #Train buffer.
    buffersize = 10,
    #long buffer
    cl_mem_size = 20*32*3, # 20% of data, but all:1, so 32 blocks go to one rank
)
modelTrainer_config = dict(
    checkpoint_interval = 1000,
    checkpoint_final = True,
    # Per-run output directory, matching the slurm-<jobid>-<arrayid>-<taskid>
    # naming used by the scan job script.
    out_prefix = "slurm-{}-{}-{}/".format(environ.get("SLURM_JOBID", ""), environ.get("SLURM_ARRAY_TASK_ID", ""), environ.get("TASK_ID", 0))
)

runner="mpirun"
type_streamer="streaming"
82 changes: 82 additions & 0 deletions share/configs/model_config_opt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from math import sqrt
from os import environ
import pathlib

#########################
## Model configuration ##
#########################

rad_dims = 512  # number of frequencies in the radiation data

latent_space_dims = 544

# Loss-weight overrides read from the environment (exported by the
# hyper-parameter scan job scripts); None means "use the built-in default".
l_predict = environ.get("LAM_PREDICT", None)
l_latent = environ.get("LAM_LATENT", None)
l_rev = environ.get("LAM_REV", None)
l_kl = environ.get("LAM_KL", None)
l_recon = environ.get("LAM_RECON", None)


def _weight(raw, fallback):
    # Parse an environment override string, or fall back to the default.
    return fallback if raw is None else float(raw)


lambd_IM = 0.001
lambd_predict = _weight(l_predict, 3.) * lambd_IM
lambd_latent = _weight(l_latent, 300.) * lambd_IM
lambd_rev = _weight(l_rev, 400.) * lambd_IM
lambd_AE = _weight(l_recon, 1.0)
lambd_kl = _weight(l_kl, 0.001) / lambd_AE
lambd_IM = 1

# Learning rates, also overridable from the environment.
lr = float(environ.get("LR_REST", 0.0001))
lr_ae = float(environ.get("LR_AE", 0.0005))

config = {
    'dim_input': 1024,
    'dim_condition': rad_dims,
    'num_coupling_layers': 4,
    'hidden_size': 256,
    'num_blocks_mat': 6,
    'activation': 'gelu',
    'lr': lr,
    'lrAEmult': (lr_ae / lr),
    'y_noise_scale': 1e-1,
    'zeros_noise_scale': 5e-2,
    'lambd_predict': lambd_predict,
    'lambd_latent': lambd_latent,
    'lambd_rev': lambd_rev,
    'lambd_kl': lambd_kl,
    'lambd_AE': lambd_AE,
    'lambd_IM': lambd_IM,
    'ndim_tot': 544,
    'ndim_x': 544,
    'ndim_y': 512,
    'ndim_z': 32,
    'load_model': None,  # 'inn_vae_latent_544_sim007_24k0zbm4/best_model_',
    'load_model_checkpoint': None,  # 'inn_vae_latent_544_sim014_859eopan/model_150', #'inn_vae_latent_544_sim014_859eopan/model_950',

    # Available loss functions:
    #   "earthmovers", "chamfersloss", "chamfersloss_d", "chamfersloss_o"
    ## for optimized chamfer distance
    'loss_function': 'chamfersloss',
    'loss_kwargs': {},

    ## for emd without peops library.
    # 'loss_function': 'earthmovers',
    # 'loss_kwargs': {},

    'betas': (0.8, 0.9),
    'eps': 1e-6,
    'weight_decay': 2e-5,
    'lr_annealingRate': None,
    'lr_scaling': (lambda x: sqrt(x)),
}

config_inn = {}

normalization_values = {
    'momentum_mean': 1.2091940752668797e-08,
    'momentum_std': 0.11923234769525472,
    'force_mean': -2.7682006649827533e-09,
    'force_std': 7.705477610810592e-05,
}
2 changes: 2 additions & 0 deletions share/env/profile_jureca
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Environment profile for JURECA, sourced by the job scripts.
# Enables MPI in openPMD-api and loads the tool chain modules.
export openPMD_USE_MPI=ON
module load Python/3.11.3 GCC/12.3.0 BullMPI/4.1.4 ADIOS2/2.9.2