import argparse

import numpy as np
import torch
from torch.utils.data import Dataset
from torch.nn import Module, ReLU, Linear
from avalanche.benchmarks.generators import dataset_benchmark, tensors_benchmark
from avalanche.benchmarks.utils import make_classification_dataset
from avalanche.training.supervised import EWC
from avalanche.evaluation.metrics import (
    forgetting_metrics,
    accuracy_metrics,
    loss_metrics,
    bwt_metrics,
)
from avalanche.logging import InteractiveLogger
from avalanche.training.plugins import EvaluationPlugin
# Torch dataset wrapping the numpy arrays; Avalanche's
# make_classification_dataset reads the labels from the `targets` attribute
class dataSet(Dataset):
    def __init__(self, x, y):
        self.x = torch.as_tensor(x)
        self.targets = torch.as_tensor(y)

    def __len__(self):
        return self.x.shape[0]

    def __getitem__(self, index):
        return self.x[index, :], self.targets[index]
# Four-layer MLP; returns raw logits (CrossEntropyLoss applies the softmax)
class MLP(Module):
    def __init__(self, in_num, out_num, hidden_num1, hidden_num2, hidden_num3):
        super().__init__()
        self.fc1 = Linear(in_features=in_num, out_features=hidden_num1)
        self.relu1 = ReLU()
        self.fc2 = Linear(in_features=hidden_num1, out_features=hidden_num2)
        self.relu2 = ReLU()
        self.fc3 = Linear(in_features=hidden_num2, out_features=hidden_num3)
        self.relu3 = ReLU()
        self.fc4 = Linear(in_features=hidden_num3, out_features=out_num)

    def forward(self, x):
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.relu3(self.fc3(x))
        return self.fc4(x)
def main(args):
    # --- CONFIG
    if args.dataset == "covid":
        path = "covid_dataset/"
    elif args.dataset == "diab":
        path = "diabetes_dataset/"
    in_num = 155       # number of input features
    hidden_num1 = 256  # neurons in the first hidden layer
    hidden_num2 = 128  # neurons in the second hidden layer
    hidden_num3 = 128  # neurons in the third hidden layer
    out_num = 3        # number of classes
    # Load training and test data for both domains
    x_train_1 = np.load(path + 'x_train_1.npy').astype(np.float32)
    x_test_1 = np.load(path + 'x_test_1.npy').astype(np.float32)
    x_train_2 = np.load(path + 'x_train_2.npy').astype(np.float32)
    x_test_2 = np.load(path + 'x_test_2.npy').astype(np.float32)
    # Labels are integer class indices (as expected by CrossEntropyLoss)
    y_train_1 = np.load(path + 'y_train_1.npy').astype(np.int_)
    y_test_1 = np.load(path + 'y_test_1.npy').astype(np.int_)
    y_train_2 = np.load(path + 'y_train_2.npy').astype(np.int_)
    y_test_2 = np.load(path + 'y_test_2.npy').astype(np.int_)
    # Wrap the arrays in torch datasets
    train_1 = dataSet(x_train_1, y_train_1)
    test_1 = dataSet(x_test_1, y_test_1)
    train_2 = dataSet(x_train_2, y_train_2)
    test_2 = dataSet(x_test_2, y_test_2)
    # All experiences share task label 0 (domain-incremental setting)
    train_1 = make_classification_dataset(train_1, task_labels=0)
    test_1 = make_classification_dataset(test_1, task_labels=0)
    train_2 = make_classification_dataset(train_2, task_labels=0)
    test_2 = make_classification_dataset(test_2, task_labels=0)
    # Check whether the selected GPU is available, otherwise use the CPU
    assert args.cuda == -1 or args.cuda >= 0, "cuda must be -1 or >= 0."
    device = torch.device(
        f"cuda:{args.cuda}"
        if torch.cuda.is_available() and args.cuda >= 0
        else "cpu"
    )
    print(f"Using device: {device}")
    # ---------
    # --- SCENARIO CREATION
    # generic_scenario = tensors_benchmark(
    #     train_tensors=[(x_train_1, y_train_1), (x_train_2, y_train_2)],
    #     test_tensors=[(x_test_1, y_test_1), (x_test_2, y_test_2)],
    #     task_labels=[0, 1]
    # )
    generic_scenario = dataset_benchmark([train_1, train_2], [test_1, test_2])
    # ---------
    # MODEL CREATION
    model = MLP(in_num, out_num, hidden_num1, hidden_num2, hidden_num3).to(device)
    if args.optim == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    elif args.optim == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()
    # DEFINE THE EVALUATION PLUGIN AND LOGGER
    interactive_logger = InteractiveLogger()
    eval_plugin = EvaluationPlugin(
        accuracy_metrics(minibatch=False, epoch=True, experience=True, stream=True),
        loss_metrics(minibatch=False, epoch=True, experience=True, stream=True),
        forgetting_metrics(experience=True, stream=True),
        bwt_metrics(experience=True, stream=True),
        loggers=[interactive_logger],
    )
    # decay_factor is only meaningful for the online mode
    if args.ewc_mode == 'separate':
        args.decay_factor = None
    # Create the EWC strategy
    strategy = EWC(
        model,
        optimizer,
        criterion,
        args.ewc_lambda,
        args.ewc_mode,
        decay_factor=args.decay_factor,
        train_epochs=args.epochs,
        device=device,
        train_mb_size=args.minibatch_size,
        evaluator=eval_plugin,
    )
    # Train on the scenario with the chosen strategy,
    # evaluating on the full test stream after each experience
    print("Starting experiment...")
    results = []
    for experience in generic_scenario.train_stream:
        print("Start training on experience", experience.current_experience)
        strategy.train(experience)
        print("End training on experience", experience.current_experience)
        print("Computing accuracy on the test stream")
        results.append(strategy.eval(generic_scenario.test_stream))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset",
type=str,
choices=["covid", "diab"],
default="covid",
help="Choose between covid and diab.",
)
parser.add_argument(
"--ewc_mode",
type=str,
choices=["separate", "online"],
default="separate",
help="Choose between EWC and online.",
)
parser.add_argument(
"--ewc_lambda",
type=float,
default=0.4,
help="Penalty hyperparameter for EWC",
)
parser.add_argument(
"--decay_factor",
type=float,
default=0.1,
help="Decay factor for importance " "when ewc_mode is online.",
)
parser.add_argument("--optim", type=str, choices=["sgd", "adam"], default="sgd", help="Optimizer.")
parser.add_argument("--lr", type=float, default=1e-3, help="Learning rate.")
parser.add_argument("--momentum", type=float, default=9e-1, help="Momentum.")
parser.add_argument(
"--epochs", type=int, default=300, help="Number of training epochs."
)
parser.add_argument(
"--minibatch_size", type=int, default=128, help="Minibatch size."
)
parser.add_argument(
"--cuda",
type=int,
default=0,
help="Specify GPU id to use. Use CPU if -1.",
)
args = parser.parse_args()
main(args)
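For reference, I run the script as, for example (the filename ewc_domain_incremental.py is a placeholder for my actual file):

python ewc_domain_incremental.py --dataset covid --ewc_mode online --ewc_lambda 0.4 --decay_factor 0.1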
🐛 Describe the bug
I prepare two pairs of training and test PyTorch datasets from two different domains and train on them with the EWC strategy to perform domain-incremental continual learning. However, the evaluation results show that no continual learning takes place: the results are identical to those of naive fine-tuning. The bug occurs with both dataset_benchmark and tensors_benchmark.
🐜 To Reproduce
Due to the confidentiality of my dataset, I cannot share the data here. The full script above is my working code, included for debugging reference.
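As a stand-in for the real data, the sketch below builds the same kind of two-domain scenario from synthetic tensors. The 155 features and 3 classes match my setup, but the values are random, not my actual dataset; it should exercise the same code path as the script above:

import torch
from avalanche.benchmarks.generators import tensors_benchmark

n_train, n_test, in_num, out_num = 1000, 200, 155, 3
# Domain 1: standard normal features; Domain 2: a mean-shifted variant
x_train_1 = torch.randn(n_train, in_num)
y_train_1 = torch.randint(0, out_num, (n_train,))
x_test_1 = torch.randn(n_test, in_num)
y_test_1 = torch.randint(0, out_num, (n_test,))
x_train_2 = torch.randn(n_train, in_num) + 1.0
y_train_2 = torch.randint(0, out_num, (n_train,))
x_test_2 = torch.randn(n_test, in_num) + 1.0
y_test_2 = torch.randint(0, out_num, (n_test,))

generic_scenario = tensors_benchmark(
    train_tensors=[(x_train_1, y_train_1), (x_train_2, y_train_2)],
    test_tensors=[(x_test_1, y_test_1), (x_test_2, y_test_2)],
    task_labels=[0, 0],  # same task label on both experiences: domain-incremental
)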
🐝 Expected behavior
I expect the EWC strategy to perform domain-incremental CL on my self-defined datasets from the two domains when the scenario is built with dataset_benchmark, i.e. to show noticeably less forgetting on the first experience than naive fine-tuning does.
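One sanity check I would suggest (a rough sketch, not part of the script above; it assumes the model, strategy, and generic_scenario objects from the script are in scope): snapshot the model before the second experience and measure how far its parameters drift while training on it. If the EWC penalty were active, a large ewc_lambda should clearly shrink this drift relative to naive fine-tuning:

import copy

def param_drift(before, after):
    # Total L2 distance between two parameter snapshots
    sq = sum(
        (p1.detach() - p0.detach()).pow(2).sum().item()
        for p0, p1 in zip(before.parameters(), after.parameters())
    )
    return sq ** 0.5

snapshot = copy.deepcopy(model)  # taken right after training on experience 0
strategy.train(generic_scenario.train_stream[1])
print("L2 parameter drift on experience 1:", param_drift(snapshot, model))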
🐞 Screenshots
(Three screenshots, omitted here: results from EWC, results from naive fine-tuning, and results from my own CL strategy.)