EveryVoiceTTS · joanise · Mar 24, 2026 · Mar 18, 2026 · Mar 18, 2026 · Mar 18, 2026
diff --git a/README.md b/README.md
@@ -149,6 +149,7 @@ Many thanks to:
 ## Tests
 
 There are many ways to run the unit tests, if you installed EveryVoice from source:
+ - Run all the tests with the most concise output: `pytest`
  - Run all the dev tests: `everyvoice/run_tests.py dev` or `everyvoice test dev`
  - Run the tests with verbose logs: `everyvoice/run_tests.py --verbose dev`
  - Show the names of the other suites you can run: `everyvoice/run_tests.py -h`

diff --git a/everyvoice/.schema/everyvoice-shared-data-0.4.json b/everyvoice/.schema/everyvoice-shared-data-0.4.json
@@ -146,6 +146,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The file-loader function to use to load your dataset's filelist.",
           "title": "Filelist Loader",
           "type": "string"

diff --git a/everyvoice/.schema/everyvoice-shared-text-0.4.json b/everyvoice/.schema/everyvoice-shared-text-0.4.json
@@ -215,8 +215,13 @@
       "type": "object"
     },
     "cleaners": {
+      "default": [
+        "everyvoice.utils.collapse_whitespace",
+        "everyvoice.utils.strip_text"
+      ],
       "description": "List of cleaners to apply to all datasets and run-time data. Superceded by language_cleaners when processing text in a language which has language-specific cleaners, which are in turn superceded by dataset_cleaners when processing a dataset which has dataset-specific cleaners.",
       "items": {
+        "default": "everyvoice.utils.generic_psv_filelist_reader",
         "type": "string"
       },
       "title": "Global cleaners",
@@ -225,6 +230,7 @@
     "language_cleaners": {
       "additionalProperties": {
         "items": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "type": "string"
         },
         "type": "array"
@@ -237,6 +243,7 @@
     "dataset_cleaners": {
       "additionalProperties": {
         "items": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "type": "string"
         },
         "type": "array"

diff --git a/everyvoice/.schema/everyvoice-spec-to-wav-0.4.json b/everyvoice/.schema/everyvoice-spec-to-wav-0.4.json
@@ -269,6 +269,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The file-loader function to use to load your dataset's filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -378,6 +379,7 @@
           "type": "array"
         },
         "activation_function": {
+          "default": "everyvoice.utils.original_hifigan_leaky_relu",
           "description": "The activation function to use.",
           "title": "Activation Function",
           "type": "string"
@@ -543,6 +545,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The function to use to load the filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -638,6 +641,7 @@
           "type": "string"
         },
         "sub_dir_callable": {
+          "default": "everyvoice.utils.get_current_time",
           "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be <name> / <version> / <sub_dir> where <sub_dir> is a timestamp.",
           "title": "Sub Dir Callable",
           "type": "string"

diff --git a/everyvoice/.schema/everyvoice-text-to-spec-0.4.json b/everyvoice/.schema/everyvoice-text-to-spec-0.4.json
@@ -213,6 +213,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The file-loader function to use to load your dataset's filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -432,6 +433,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The function to use to load the filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -584,6 +586,7 @@
           "type": "string"
         },
         "sub_dir_callable": {
+          "default": "everyvoice.utils.get_current_time",
           "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be <name> / <version> / <sub_dir> where <sub_dir> is a timestamp.",
           "title": "Sub Dir Callable",
           "type": "string"
@@ -925,8 +928,13 @@
           "type": "object"
         },
         "cleaners": {
+          "default": [
+            "everyvoice.utils.collapse_whitespace",
+            "everyvoice.utils.strip_text"
+          ],
           "description": "List of cleaners to apply to all datasets and run-time data. Superceded by language_cleaners when processing text in a language which has language-specific cleaners, which are in turn superceded by dataset_cleaners when processing a dataset which has dataset-specific cleaners.",
           "items": {
+            "default": "everyvoice.utils.generic_psv_filelist_reader",
             "type": "string"
           },
           "title": "Global cleaners",
@@ -935,6 +943,7 @@
         "language_cleaners": {
           "additionalProperties": {
             "items": {
+              "default": "everyvoice.utils.generic_psv_filelist_reader",
               "type": "string"
             },
             "type": "array"
@@ -947,6 +956,7 @@
         "dataset_cleaners": {
           "additionalProperties": {
             "items": {
+              "default": "everyvoice.utils.generic_psv_filelist_reader",
               "type": "string"
             },
             "type": "array"

diff --git a/everyvoice/.schema/everyvoice-text-to-wav-0.4.json b/everyvoice/.schema/everyvoice-text-to-wav-0.4.json
@@ -313,6 +313,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The file-loader function to use to load your dataset's filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -447,6 +448,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The function to use to load the filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -812,6 +814,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The function to use to load the filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -1218,6 +1221,7 @@
           "type": "array"
         },
         "activation_function": {
+          "default": "everyvoice.utils.original_hifigan_leaky_relu",
           "description": "The activation function to use.",
           "title": "Activation Function",
           "type": "string"
@@ -1383,6 +1387,7 @@
           "type": "string"
         },
         "filelist_loader": {
+          "default": "everyvoice.utils.generic_psv_filelist_reader",
           "description": "Advanced. The function to use to load the filelist.",
           "title": "Filelist Loader",
           "type": "string"
@@ -1496,6 +1501,7 @@
           "type": "string"
         },
         "sub_dir_callable": {
+          "default": "everyvoice.utils.get_current_time",
           "description": "The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be <name> / <version> / <sub_dir> where <sub_dir> is a timestamp.",
           "title": "Sub Dir Callable",
           "type": "string"
@@ -1874,8 +1880,13 @@
           "type": "object"
         },
         "cleaners": {
+          "default": [
+            "everyvoice.utils.collapse_whitespace",
+            "everyvoice.utils.strip_text"
+          ],
           "description": "List of cleaners to apply to all datasets and run-time data. Superceded by language_cleaners when processing text in a language which has language-specific cleaners, which are in turn superceded by dataset_cleaners when processing a dataset which has dataset-specific cleaners.",
           "items": {
+            "default": "everyvoice.utils.generic_psv_filelist_reader",
             "type": "string"
           },
           "title": "Global cleaners",
@@ -1884,6 +1895,7 @@
         "language_cleaners": {
           "additionalProperties": {
             "items": {
+              "default": "everyvoice.utils.generic_psv_filelist_reader",
               "type": "string"
             },
             "type": "array"
@@ -1896,6 +1908,7 @@
         "dataset_cleaners": {
           "additionalProperties": {
             "items": {
+              "default": "everyvoice.utils.generic_psv_filelist_reader",
               "type": "string"
             },
             "type": "array"

diff --git a/everyvoice/config/preprocessing_config.py b/everyvoice/config/preprocessing_config.py
@@ -12,10 +12,7 @@
     PossiblySerializedCallable,
     load_partials,
 )
-from everyvoice.utils import (
-    generic_psv_filelist_reader,
-    load_config_from_json_or_yaml_path,
-)
+from everyvoice.utils import load_config_from_json_or_yaml_path
 
 
 class AudioSpecTypeEnum(str, Enum):
@@ -104,15 +101,18 @@ class Dataset(PartialLoadConfig):
         validate_default=True,
     )
     data_dir: PossiblyRelativePath = Field(
-        default="/please/create/a/path/to/your/dataset/data",  # type: ignore[assignment]
+        default="/please/create/a/path/to/your/dataset/data",
+        validate_default=True,
         description="The path to the directory with your audio files.",
     )
     filelist: PossiblyRelativePath = Field(
-        default="/please/create/a/path/to/your/dataset/filelist",  # type: ignore[assignment]
+        default="/please/create/a/path/to/your/dataset/filelist",
+        validate_default=True,
         description="The path to your dataset's filelist.",
     )
     filelist_loader: PossiblySerializedCallable = Field(
-        default=generic_psv_filelist_reader,
+        default="everyvoice.utils.generic_psv_filelist_reader",
+        validate_default=True,
         description="Advanced. The file-loader function to use to load your dataset's filelist.",
     )
     sox_effects: list = Field(
@@ -141,6 +141,7 @@ class PreprocessingConfig(PartialLoadConfig):
     )
     save_dir: PossiblyRelativePathMustExist = Field(
         default="preprocessed/YourDataSet",  # type: ignore[assignment]
+        validate_default=False,  # don't set to True, it causes spurious dir creation
         description="The directory to save preprocessed files to.",
     )
     audio: AudioConfig = Field(

diff --git a/everyvoice/config/shared_types.py b/everyvoice/config/shared_types.py
@@ -25,7 +25,6 @@
     PossiblyRelativePathMustExist,
     PossiblySerializedCallable,
 )
-from everyvoice.utils import generic_psv_filelist_reader, get_current_time
 
 _init_context_var = ContextVar("_init_context_var", default=None)
 
@@ -158,13 +157,14 @@ class LoggerConfig(PartialLoadConfig):
     )
 
     save_dir: PossiblyRelativePathMustExist = Field(
-        # default for Paths must be str for correctly generating schemas on all platforms
         default="logs_and_checkpoints",  # type: ignore[assignment]
+        validate_default=False,  # keep False: True causes dir creation when we don't want it
         description="The directory to save your checkpoints and logs to.",
     )
 
     sub_dir_callable: PossiblySerializedCallable = Field(
-        default=get_current_time,
+        default="everyvoice.utils.get_current_time",
+        validate_default=True,
         description="The function that generates a string to call your runs - by default this is a timestamp. The structure of your logs will be <name> / <version> / <sub_dir> where <sub_dir> is a timestamp.",
     )
 
@@ -219,15 +219,18 @@ class BaseTrainingConfig(PartialLoadConfig):
         description="Automatically resume training from a checkpoint loaded from this path.",
     )
     training_filelist: PossiblyRelativePath = Field(
-        default="path/to/your/preprocessed/training_filelist.psv",  # type: ignore[assignment]
+        default="path/to/your/preprocessed/training_filelist.psv",
+        validate_default=True,
         description="The path to a filelist containing samples belonging to your training set.",
     )
     validation_filelist: PossiblyRelativePath = Field(
-        default="path/to/your/preprocessed/validation_filelist.psv",  # type: ignore[assignment]
+        default="path/to/your/preprocessed/validation_filelist.psv",
+        validate_default=True,
         description="The path to a filelist containing samples belonging to your validation set.",
     )
     filelist_loader: PossiblySerializedCallable = Field(
-        default=generic_psv_filelist_reader,
+        default="everyvoice.utils.generic_psv_filelist_reader",
+        validate_default=True,
         description="Advanced. The function to use to load the filelist.",
     )
     logger: LoggerConfig = Field(

diff --git a/everyvoice/config/text_config.py b/everyvoice/config/text_config.py
@@ -8,13 +8,10 @@
 
 from everyvoice.config.shared_types import ConfigModel, init_context
 from everyvoice.config.utils import PossiblySerializedCallable
+from everyvoice.config.validation_helpers import string_to_callable
 from everyvoice.text.phonemizer import G2PCallable
 from everyvoice.text.utils import normalize_text_helper
-from everyvoice.utils import (
-    collapse_whitespace,
-    load_config_from_json_or_yaml_path,
-    strip_text,
-)
+from everyvoice.utils import load_config_from_json_or_yaml_path
 
 
 class Punctuation(BaseModel):
@@ -198,7 +195,13 @@ def load_custom_g2p_engine(lang_id: str, qualified_g2p_func_name: str) -> G2PCal
     return validate_g2p_engine_signature(getattr(module, function_name))
 
 
-DEFAULT_CLEANERS: list[PossiblySerializedCallable] = [collapse_whitespace, strip_text]
+DEFAULT_CLEANERS_S = [
+    "everyvoice.utils.collapse_whitespace",
+    "everyvoice.utils.strip_text",
+]
+DEFAULT_CLEANERS: list[PossiblySerializedCallable] = [
+    string_to_callable(cleaner) for cleaner in DEFAULT_CLEANERS_S
+]
 
 
 class TextConfig(ConfigModel):
@@ -219,7 +222,8 @@ class TextConfig(ConfigModel):
         description="Map from dataset label to replacement maps. Supercedes both the global text replacements and language_to_replace when defined for a given dataset.",
     )
     cleaners: list[PossiblySerializedCallable] = Field(
-        default=DEFAULT_CLEANERS,
+        default=DEFAULT_CLEANERS_S,
+        validate_default=True,
         title="Global cleaners",
         description="List of cleaners to apply to all datasets and run-time data. Superceded by language_cleaners when processing text in a language which has language-specific cleaners, which are in turn superceded by dataset_cleaners when processing a dataset which has dataset-specific cleaners.",
     )

diff --git a/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning b/everyvoice/model/vocoder/HiFiGAN_iSTFT_lightning
diff --git a/everyvoice/tests/regression/prep-datasets.sh b/everyvoice/tests/regression/prep-datasets.sh
@@ -10,6 +10,7 @@ SGILE_DATASET_ROOT=${SGILE_DATASET_ROOT:-$HOME/sgile/data}
 
 DURATIONS="20 60 180 full"
 
+# Source: https://keithito.com/LJ-Speech-Dataset
 LJ_SPEECH_DATASET=$SGILE_DATASET_ROOT/LJSpeech-1.1
 
 for duration in $DURATIONS; do
@@ -35,6 +36,7 @@ cp "$EVERYVOICE_REGRESS_ROOT"/run-demo-app-lj-full.sh regress-lj-full/run-demo-a
 cp "$EVERYVOICE_REGRESS_ROOT"/test-demo-app-lj-full.py regress-lj-full/test-demo-app.py
 cp "$EVERYVOICE_REGRESS_ROOT"/wait-for-demo-app.py "$dir"/wait-for-demo-app.py
 
+# Source: https://openslr.org/30
 SinhalaTTS=$SGILE_DATASET_ROOT/SinhalaTTS
 for duration in $DURATIONS; do
     dir=regress-si-$duration
@@ -59,6 +61,7 @@ for duration in $DURATIONS; do
     echo "අක-ෂර" > "$dir"/test2.txt
 done
 
+# Source: https://openslr.org/32
 isiXhosa=$SGILE_DATASET_ROOT/OpenSLR32-four-South-Afican-languages/xh_za/za/xho
 for duration in $DURATIONS; do
     dir=regress-xh-$duration

diff --git a/everyvoice/tests/stubs.py b/everyvoice/tests/stubs.py
@@ -366,9 +366,10 @@ def silence_c_stdout():
     address our narrow needs, namely to silence stdout in a context manager.
 
     Warning: disabled on Windows, where it causes silent crashes.
+    Warning: disabled when pytest is loaded, because it is not compatible with pytest.
     """
 
-    if not VERBOSE_OVERRIDE and os.name != "nt":
+    if not VERBOSE_OVERRIDE and os.name != "nt" and "pytest" not in sys.modules:
         stdout_fileno = sys.stdout.fileno()
         stdout_save = os.dup(stdout_fileno)
         stdout_fd = os.open(os.devnull, os.O_RDWR)
@@ -394,9 +395,10 @@ def silence_c_stderr():
     address our narrow needs, namely to silence stderr in a context manager.
 
     Warning: disabled on Windows, where it causes silent crashes.
+    Warning: disabled when pytest is loaded, because it is not compatible with pytest.
     """
 
-    if not VERBOSE_OVERRIDE and os.name != "nt":
+    if not VERBOSE_OVERRIDE and os.name != "nt" and "pytest" not in sys.modules:
         stderr_fileno = sys.stderr.fileno()
         stderr_save = os.dup(stderr_fileno)
         stderr_fd = os.open(os.devnull, os.O_RDWR)