Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions requirements_jureca.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Python dependencies for InSituML training on JURECA.
# FrEIA is pinned to an exact commit for reproducibility.
FrEIA @ git+https://github.com/VLL-HD/FrEIA.git@1779d1fba1e21000fda1927b59eeac0a6fcaa284
geomloss>=0.2.6
# Fork of openPMD-api; 'pic_env' branch — presumably carries PIConGPU
# streaming support -- TODO confirm why the upstream release is not usable.
openPMD-api @ git+https://github.com/franzpoeschel/openPMD-api.git@pic_env
torch
wandb
sympy
scipy
numpy
matplotlib
nflows
26 changes: 26 additions & 0 deletions scripts/job_jureca.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
# Single-GPU, single-task training job for JURECA.
# Fix: '#SBTACH -c 4' was misspelled, so SLURM silently ignored the
# 4-CPUs-per-task request.
#SBATCH -c 4
#SBATCH -p dc-gpu
#SBATCH --gres=gpu:1
#SBATCH -n 1
#SBATCH --ntasks=1
#SBATCH -A training2406
#SBATCH -t 2:0:0

# Location of the cINN training code and of the scratch project root.
CINN=/p/home/jusers/kelling1/jureca/git/InSituML/main/ModelHelpers/cINN
ROOT=/p/scratch/training2406/team_hechtlab_kelling/

# Load the module environment and the Python virtualenv.
. "$ROOT/env/profile"
. "$ROOT/env/insituml/bin/activate"

# torch.distributed rendezvous setup for a single task: derive a (likely
# free) port from the last 4 digits of the job id, prefixed with 1.
export WORLD_SIZE=$SLURM_NTASKS
export MASTER_PORT="1$( echo -n "$SLURM_JOBID" | tail -c 4 )"
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr

echo HOSTNAME "$HOSTNAME"
nvidia-smi

export BATCH_SIZE=4
echo BATCH_SIZE "$BATCH_SIZE"
srun python "$CINN/ac_jr_fp_ks_openpmd-streaming-continual-learning.py" --io_config "$CINN/io_config_jureca.py" --model_config "$CINN/model_config.py" --runner srun
57 changes: 57 additions & 0 deletions scripts/job_jureca_scan.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/bin/bash
# Hyper-parameter scan job for JURECA: each array task launches
# $SLURM_NTASKS independent trainings, one per GPU, each consuming one
# parameter line from $PARAM_LIST_FILE (columns: LAM_PREDICT LAM_LATENT
# LAM_REV LAM_KL LAM_RECON LR_REST LR_AE).
#SBATCH -p dc-gpu
#SBATCH -c 32
#SBATCH --gres=gpu:4
#SBATCH -n 1
#SBATCH --ntasks=4
#SBATCH -A training2406
#SBATCH -t 1:0:0

export PARAM_LIST_FILE="params.dat"
export OBJ_LIST_FILE="objective.dat"

# Number of concurrent trainings per array task (one per GPU).
SLURM_NTASKS=4

export CINN=/p/home/jusers/kelling1/jureca/git/InSituML/main/ModelHelpers/cINN
export ROOT=/p/scratch/training2406/team_hechtlab_kelling/

# Load the module environment and the Python virtualenv.
. "$ROOT/env/profile"
. "$ROOT/env/insituml/bin/activate"

echo HOSTNAME "$HOSTNAME"
nvidia-smi

# Index of the first parameter line consumed by this array task.
JOB_OFFSET=$(( SLURM_ARRAY_TASK_ID * SLURM_NTASKS ))

for p in $( seq 1 "$SLURM_NTASKS" ); do

	export TASK_ID=$(( p - 1 ))
	export CUDA_VISIBLE_DEVICES=$TASK_ID
	export WORLD_SIZE=1
	# Unique rendezvous port per task: "1" + last 3 job-id digits + task index.
	export MASTER_PORT="1$( echo -n "$SLURM_JOBID" | tail -c 3 )$TASK_ID"
	master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
	export MASTER_ADDR=$master_addr

	echo "$CUDA_VISIBLE_DEVICES" "$MASTER_ADDR" "$MASTER_PORT"

	if [ -f "$PARAM_LIST_FILE" ]; then
		LINE_NUM=$(( JOB_OFFSET + p ))
		# Stop once the parameter list has no line for this task.
		# Fixes two bugs: 'wc -l FILE' printed "count filename", which broke
		# the numeric test, and the comparison direction was inverted.
		if [ "$( wc -l < "$PARAM_LIST_FILE" )" -lt "$LINE_NUM" ]; then
			break
		fi
		# Load the LINE_NUM-th parameter line into $1..$7 ('--' guards
		# against values starting with '-').
		set -- $( head -n "$LINE_NUM" "$PARAM_LIST_FILE" | tail -n 1 )

		export LAM_PREDICT=$1
		export LAM_LATENT=$2
		export LAM_REV=$3
		export LAM_KL=$4
		export LAM_RECON=$5
		export LR_REST=$6
		export LR_AE=$7
	fi

	srun -n 1 python "$CINN/ac_jr_fp_ks_openpmd-streaming-continual-learning.py" --io_config "$CINN/io_config_jureca.py" --model_config "$CINN/model_config.py" --runner srun >& "slurm-$SLURM_JOBID-$SLURM_ARRAY_TASK_ID-${TASK_ID}.out" &

done

# Barrier: wait for all background trainings of this array task.
wait
32 changes: 32 additions & 0 deletions scripts/opt_jureca.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#!/bin/bash
# Post-processing / objective-collection job for the hyper-parameter scan.
#SBATCH -p dc-gpu
#SBATCH --gres=gpu:1
#SBATCH -n 1
#SBATCH --ntasks=1
#SBATCH -A training2406
#SBATCH -t 0:30:0

# Same file names as in job_jureca_scan.sh: params.dat holds the scanned
# parameter sets, objective.dat records the job id whose losses to collect.
PARAM_LIST_FILE="params.dat"
OBJ_LIST_FILE="objective.dat"

avg()
{
    # Average the n-th whitespace-separated field of stdin, skipping
    # non-numeric entries.
    # Arguments: $1 - field number (default: 1)
    # Outputs:   "mean sum count population-stddev" on one line; nothing
    #            when no numeric value was seen (fixes the previous fatal
    #            awk division-by-zero on empty/non-numeric input).
    local n
    if [ -z "$1" ]; then
        n=1
    else
        n=$1
    fi
    # Shell interpolates only the field number ($n); awk's own 'n' counts rows.
    awk "function isnum(x){return(x==x+0)} { if(isnum(\$$n)) { sum+=\$$n; sumsq+=\$$n*\$$n ; n+=1;} } END { if (n > 0) print sum/n, sum, n, sqrt(sumsq/n - sum*sum/n/n) }"
}


# Collect loss data from the run whose job id is recorded in $OBJ_LIST_FILE.
# Fixes: the existence test hardcoded "objective.dat" instead of using
# $OBJ_LIST_FILE, the first loop parsed 'ls' output, and 'cut -f' was
# missing its mandatory field list (so cut always failed with a usage error).
if [ -f "$OBJ_LIST_FILE" ]; then
    OBJ_JOBID=$( cat "$OBJ_LIST_FILE" )
    # List the output directories/files belonging to that job.
    for i in slurm-"$OBJ_JOBID"*; do
        echo "$i"
    done

    for i in slurm-"$OBJ_JOBID"*/loss_0.dat ; do
        # TODO(review): the intended field list for 'cut -f' is unknown;
        # passing all fields ('1-') until the target column is confirmed.
        tail -n 20 "$i" | cut -f 1-
    done
fi
52 changes: 52 additions & 0 deletions share/configs/io_config_jureca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
from os import environ

# Directory pattern (filled with a run identifier) for saved models.
modelPathPattern = 'trained_models/{}'

#######################################
## openPMD data loader configuration ##
#######################################
ps_dims = 6 # Actually used in the model configuration by now
# ToDo: Use in StreamingLoader

number_of_particles = 4000


streamLoader_config = dict(
    t0 = 900,
    t1 = 901, # endpoint=false, t1 is not used in training
    # t0 = 1800,
    # t1 = 1810, # endpoint=false, t1 is not used in training
    streaming_config = None,
    pathpattern1 = "/p/scratch/training2406/team_hechtlab_kelling/04-01_1013/simOutput/openPMD/simData_%T.bp5", # files on hemera
    pathpattern2 = "/p/scratch/training2406/team_hechtlab_kelling/04-01_1013/simOutput/radiationOpenPMD/e_radAmplitudes_%T.bp5", # files on hemera
    amplitude_direction=2, # choose single direction along which the radiation signal is observed, max: N_observer-1, where N_observer is defined in PIConGPU's radiation plugin
    phase_space_variables = ["momentum", "force"], # allowed are "position", "momentum", and "force". If "force" is set, "momentum" needs to be set too.
    number_particles_per_gpu = 30000,
    verbose=False,
    ## offline training params
    num_epochs = 2
)

# Maximum number of openPMD steps buffered between the loader and trainer.
openPMD_queue_size=8

# Batch size is taken from the environment (exported by the job scripts);
# defaults to 4 when unset. environ.get keeps the original conditional's
# behavior in one idiomatic call.
batch_size = int(environ.get("BATCH_SIZE", 4))

trainBatchBuffer_config = dict(
    training_bs=batch_size,
    continual_bs=batch_size-1, # 7 is the max we can fit on P100 with our stupid chamfer's impl
    stall_loader=True,
    consume_size=1,
    min_tb_from_unchanged_now_bf = 4,
    #Train buffer.
    buffersize = 10,
    #long buffer
    cl_mem_size = 20*32*3, # 20% of data, but all:1, so 32 blocks go to one rank
)
modelTrainer_config = dict(
    checkpoint_interval = 1000,
    checkpoint_final = True,
    # Per-run output directory, matching the slurm-<jobid>-<arrayid>-<taskid>
    # naming used by the scan job script.
    out_prefix = "slurm-{}-{}-{}/".format(environ.get("SLURM_JOBID", ""), environ.get("SLURM_ARRAY_TASK_ID", ""), environ.get("TASK_ID", 0))
)

runner="mpirun"
type_streamer="streaming"
82 changes: 82 additions & 0 deletions share/configs/model_config_opt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from math import sqrt
from os import environ
import pathlib

#########################
## Model configuration ##
#########################

rad_dims = 512  # number of frequencies in the radiation data

latent_space_dims = 544

# Loss-weight overrides read from the environment (exported by the
# hyper-parameter scan job scripts); None means "use the built-in default".
l_predict = environ.get("LAM_PREDICT", None)
l_latent = environ.get("LAM_LATENT", None)
l_rev = environ.get("LAM_REV", None)
l_kl = environ.get("LAM_KL", None)
l_recon = environ.get("LAM_RECON", None)


def _weight(raw, fallback):
    # Parse an environment override string, or fall back to the default.
    return fallback if raw is None else float(raw)


lambd_IM = 0.001
lambd_predict = _weight(l_predict, 3.) * lambd_IM
lambd_latent = _weight(l_latent, 300.) * lambd_IM
lambd_rev = _weight(l_rev, 400.) * lambd_IM
lambd_AE = _weight(l_recon, 1.0)
lambd_kl = _weight(l_kl, 0.001) / lambd_AE
lambd_IM = 1

# Learning rates, also overridable from the environment.
lr = float(environ.get("LR_REST", 0.0001))
lr_ae = float(environ.get("LR_AE", 0.0005))

config = {
    'dim_input': 1024,
    'dim_condition': rad_dims,
    'num_coupling_layers': 4,
    'hidden_size': 256,
    'num_blocks_mat': 6,
    'activation': 'gelu',
    'lr': lr,
    'lrAEmult': (lr_ae / lr),
    'y_noise_scale': 1e-1,
    'zeros_noise_scale': 5e-2,
    'lambd_predict': lambd_predict,
    'lambd_latent': lambd_latent,
    'lambd_rev': lambd_rev,
    'lambd_kl': lambd_kl,
    'lambd_AE': lambd_AE,
    'lambd_IM': lambd_IM,
    'ndim_tot': 544,
    'ndim_x': 544,
    'ndim_y': 512,
    'ndim_z': 32,
    'load_model': None,  # 'inn_vae_latent_544_sim007_24k0zbm4/best_model_',
    'load_model_checkpoint': None,  # 'inn_vae_latent_544_sim014_859eopan/model_150', #'inn_vae_latent_544_sim014_859eopan/model_950',

    # Available loss functions:
    #   "earthmovers", "chamfersloss", "chamfersloss_d", "chamfersloss_o"
    ## for optimized chamfer distance
    'loss_function': 'chamfersloss',
    'loss_kwargs': {},

    ## for emd without peops library.
    # 'loss_function': 'earthmovers',
    # 'loss_kwargs': {},

    'betas': (0.8, 0.9),
    'eps': 1e-6,
    'weight_decay': 2e-5,
    'lr_annealingRate': None,
    'lr_scaling': (lambda x: sqrt(x)),
}

config_inn = {}

normalization_values = {
    'momentum_mean': 1.2091940752668797e-08,
    'momentum_std': 0.11923234769525472,
    'force_mean': -2.7682006649827533e-09,
    'force_std': 7.705477610810592e-05,
}
2 changes: 2 additions & 0 deletions share/env/profile_jureca
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Environment profile for JURECA, sourced by the job scripts.
# Enables MPI in openPMD-api and loads the tool chain modules.
export openPMD_USE_MPI=ON
module load Python/3.11.3 GCC/12.3.0 BullMPI/4.1.4 ADIOS2/2.9.2