Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
### Project Specific ###
wandb
saved_models
lightning_logs
runs/
lightning_logs/
data
graphs
*.sif
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ This release introduces new features including GIF animation support, wandb run

### Changed

- Consolidate all training/evaluation run outputs (checkpoints, logger files, plots) into a single `runs/<run-name>/` directory instead of scattering across `saved_models/`, `lightning_logs/`, `wandb/`, and `mlruns/` [\#293](https://github.com/mllam/neural-lam/issues/293) @sudhansu-24
- Change the default ensemble-loading behavior in `WeatherDataset` / `WeatherDataModule` to use all ensemble members as independent samples for ensemble datastores (with matching ensemble-member selection for forcing when available); single-member behavior now requires explicitly opting in via `--load_single_member` [\#332](https://github.com/mllam/neural-lam/pull/332) @kshirajahere

- Refactor graph loading: move zero-indexing out of the model and update plotting in preparation for using the research-branch graph I/O [\#184](https://github.com/mllam/neural-lam/pull/184) @zweihuehner
Expand Down
9 changes: 6 additions & 3 deletions neural_lam/custom_loggers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Standard library
import os
import sys

# Third-party
Expand All @@ -15,10 +16,11 @@ class CustomMLFlowLogger(pl.loggers.MLFlowLogger):
of version `2.0.3` at least.
"""

def __init__(self, experiment_name, tracking_uri, run_name):
def __init__(self, experiment_name, tracking_uri, run_name, save_dir=None):
super().__init__(
experiment_name=experiment_name, tracking_uri=tracking_uri
)
self._save_dir = save_dir or "mlruns"

mlflow.start_run(run_id=self.run_id, log_system_metrics=True)
mlflow.set_tag("mlflow.runName", run_name)
Expand All @@ -35,7 +37,7 @@ def save_dir(self):
str
Path to the directory where the artifacts are saved.
"""
return "mlruns"
return self._save_dir

def log_image(self, key, images, step=None):
"""
Expand All @@ -57,7 +59,8 @@ def log_image(self, key, images, step=None):

# Need to save the image to a temporary file, then log that file
# mlflow.log_image should do this automatically, but it is buggy
temporary_image = f"{key}.png"
os.makedirs(self.save_dir, exist_ok=True)
temporary_image = os.path.join(self.save_dir, f"{key}.png")
images[0].savefig(temporary_image)

img = Image.open(temporary_image)
Expand Down
8 changes: 6 additions & 2 deletions neural_lam/train_model.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Standard library
import json
import os
import random
import time
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
Expand Down Expand Up @@ -338,12 +339,14 @@ def main(input_args=None):
f"{time.strftime('%m_%d_%H')}-{random_run_id:04d}"
)

run_dir = os.path.join("runs", run_name)

training_logger = utils.setup_training_logger(
datastore=datastore, args=args, run_name=run_name
datastore=datastore, args=args, run_name=run_name, run_dir=run_dir
)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
dirpath=f"saved_models/{run_name}",
dirpath=os.path.join(run_dir, "checkpoints"),
filename="min_val_loss",
monitor="val_mean_loss",
mode="min",
Expand All @@ -352,6 +355,7 @@ def main(input_args=None):
trainer = pl.Trainer(
max_epochs=args.epochs,
deterministic=True,
default_root_dir=run_dir,
strategy="auto",
accelerator=device_name,
num_nodes=args.num_nodes,
Expand Down
4 changes: 3 additions & 1 deletion neural_lam/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ def init_training_logger_metrics(training_logger, val_steps):


@rank_zero_only
def setup_training_logger(datastore, args, run_name):
def setup_training_logger(datastore, args, run_name, run_dir):
"""Set up the training logger (WandB or MLFlow).

Parameters
Expand Down Expand Up @@ -520,6 +520,7 @@ def setup_training_logger(datastore, args, run_name):
config=dict(training=vars(args), datastore=datastore._config),
resume=wandb_resume,
id=args.wandb_id,
save_dir=run_dir,
)
elif args.logger == "mlflow":
if args.wandb_id is not None:
Expand All @@ -536,6 +537,7 @@ def setup_training_logger(datastore, args, run_name):
experiment_name=args.logger_project,
tracking_uri=url,
run_name=run_name,
save_dir=run_dir,
)
training_logger.log_hyperparams(
dict(training=vars(args), datastore=datastore._config)
Expand Down
8 changes: 6 additions & 2 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ def test_wandb_logger_kwargs(
datastore = MagicMock()
datastore._config = {}

setup_training_logger(datastore, args, run_name="my-run")
setup_training_logger(
datastore, args, run_name="my-run", run_dir="runs/my-run"
)

_, kwargs = mock_wandb.call_args
assert kwargs["resume"] == expected_resume
Expand Down Expand Up @@ -111,7 +113,9 @@ def test_wandb_id_ignored_with_mlflow_warns():
),
patch("neural_lam.utils.logger") as mock_log,
):
setup_training_logger(datastore, args, run_name="my-run")
setup_training_logger(
datastore, args, run_name="my-run", run_dir="runs/my-run"
)

mock_log.warning.assert_called_once()
warning_msg = mock_log.warning.call_args[0][0]
Expand Down