jupyter · zmbc · Jan 31, 2026 · Feb 12, 2026
diff --git a/docs/source/config.rst b/docs/source/config.rst
@@ -34,6 +34,8 @@ The current output of `nbdime --config` is:
       Ignore: {}
       attachments: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       metadata: null
       outputs: null
@@ -45,6 +47,8 @@ The current output of `nbdime --config` is:
       base_url: "/"
       browser: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       ip: "127.0.0.1"
       metadata: null
@@ -58,6 +62,8 @@ The current output of `nbdime --config` is:
       Ignore: {}
       attachments: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       ignore_transients: true
       input_strategy: null
@@ -73,6 +79,8 @@ The current output of `nbdime --config` is:
       base_url: "/"
       browser: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       ignore_transients: true
       input_strategy: null
@@ -106,6 +114,8 @@ The current output of `nbdime --config` is:
       Ignore: {}
       attachments: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       metadata: null
       outputs: null
@@ -115,6 +125,8 @@ The current output of `nbdime --config` is:
       Ignore: {}
       attachments: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       metadata: null
       outputs: null
@@ -126,6 +138,8 @@ The current output of `nbdime --config` is:
       base_url: "/"
       browser: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       ip: "127.0.0.1"
       metadata: null
@@ -139,6 +153,8 @@ The current output of `nbdime --config` is:
       Ignore: {}
       attachments: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       ignore_transients: true
       input_strategy: null
@@ -154,6 +170,8 @@ The current output of `nbdime --config` is:
       base_url: "/"
       browser: null
       color_words: false
+      text_similarity_ignore_whitespace: true
+      text_similarity_threshold: 0.3
       details: null
       ignore_transients: true
       input_strategy: null

diff --git a/nbdime/args.py b/nbdime/args.py
@@ -14,6 +14,7 @@
     get_defaults_for_argparse, build_config, entrypoint_configurables,
     Namespace
 )
+from .diffing.generic import set_text_similarity_options
 from .diffing.notebooks import set_notebook_diff_targets, set_notebook_diff_ignores
 from .gitfiles import is_gitref
 from .ignorables import diff_ignorables
@@ -338,6 +339,25 @@ def add_diff_args(parser):
         action=IgnorableAction,
         help="process/ignore details not covered by other options.")
 
+    similarity = parser.add_argument_group(
+        title='similarity',
+        description='Control how text similarity is estimated when aligning cells.')
+    similarity.add_argument(
+        '--text-similarity-threshold',
+        dest='text_similarity_threshold',
+        metavar='RATIO',
+        type=float,
+        default=0.3,
+        help='minimum ratio (0-1) required to consider two text blocks similar',
+    )
+    similarity.add_argument(
+        '--no-text-similarity-ignore-whitespace',
+        dest='text_similarity_ignore_whitespace',
+        action='store_false',
+        default=True,
+        help='do not drop whitespace-only lines before computing similarity',
+    )
+
 
 def add_diff_cli_args(parser):
     """Adds a set of arguments for CLI diff commands (i.e. not web).
@@ -405,6 +425,11 @@ def process_diff_flags(args):
             args.sources, args.outputs, args.attachments, args.metadata,
             args.id, args.details)
 
+    set_text_similarity_options(
+        threshold=getattr(args, 'text_similarity_threshold', None),
+        ignore_whitespace_lines=getattr(args, 'text_similarity_ignore_whitespace', None),
+    )
+
 
 def resolve_diff_args(args):
     """Resolve ambiguity of path vs base/remote for git:

diff --git a/nbdime/config.py b/nbdime/config.py
@@ -3,7 +3,7 @@
 
 from jupyter_core.paths import jupyter_config_path
 
-from traitlets import Unicode, Enum, Integer, Bool, HasTraits, Dict, TraitError
+from traitlets import Unicode, Enum, Integer, Bool, Float, HasTraits, Dict, TraitError
 from traitlets.config.loader import JSONFileConfigLoader, ConfigFileNotFound
 
 from .merging.notebooks import (
@@ -221,6 +221,19 @@ class _Diffing(_Ignorables):
               "to git diff"),
     ).tag(config=True)
 
+    text_similarity_threshold = Float(
+        0.3,
+        min=0.0,
+        max=1.0,
+        help=("minimum ratio (0-1) for considering two text blocks similar "
+              "when aligning cells and outputs"),
+    ).tag(config=True)
+
+    text_similarity_ignore_whitespace = Bool(
+        True,
+        help=("ignore whitespace-only lines when estimating text similarity"),
+    ).tag(config=True)
+
 
 class Diff(_Diffing):
     pass

diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py
@@ -6,6 +6,7 @@
 import operator
 from collections import defaultdict
 import difflib
+from typing import Optional, Union
 
 from ..diff_format import SequenceDiffBuilder, MappingDiffBuilder, validate_diff
 from ..diff_utils import count_consumed_symbols
@@ -17,6 +18,41 @@
 __all__ = ["diff"]
 
 
+_text_similarity_settings = {  # module-wide defaults; mutated in place by set_text_similarity_options()
+    "threshold": 0.3,  # minimum difflib ratio (0-1) for two texts to count as similar
+    "ignore_whitespace_lines": True,  # drop whitespace-only lines before computing similarity
+}
+
+
+def set_text_similarity_options(threshold: Optional[Union[int, float]]=None, ignore_whitespace_lines: Optional[bool]=None) -> None:
+    """Update the module-wide defaults used for approximate text comparison.
+
+    Parameters
+    ----------
+    threshold: int or float in [0, 1], optional
+        Minimum difflib ratio for two texts to count as similar; ``None`` keeps the current value.
+    ignore_whitespace_lines: bool, optional
+        Whether whitespace-only lines are dropped before comparing; ``None`` keeps the current value.
+
+    Raises TypeError if threshold is not a real number (bool included), ValueError if outside [0, 1].
+    """
+    if threshold is not None:
+        if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):  # bool is an int subclass
+            raise TypeError("text similarity threshold must be a number")
+        if not (0.0 <= threshold <= 1.0):  # written so that NaN is rejected too
+            raise ValueError("text similarity threshold must be between 0 and 1")
+        _text_similarity_settings["threshold"] = float(threshold)
+
+    if ignore_whitespace_lines is not None:
+        _text_similarity_settings["ignore_whitespace_lines"] = bool(ignore_whitespace_lines)
+
+
+def get_text_similarity_options() -> dict:
+    """Return a shallow copy of the active similarity settings, so callers
+    can read or modify the result without touching the module-level state."""
+    return dict(_text_similarity_settings)
+
+
 def default_predicates():
     return defaultdict(lambda: (operator.__eq__,))
 
@@ -25,10 +61,8 @@ def default_differs():
     return defaultdict(lambda: diff)
 
 
-def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
-    "Compare to strings with approximate heuristics."
-    # TODO: Add configuration framework
-    # TODO: Tune threshold with realistic sources
+def compare_strings_approximate(x: str, y: str, threshold: float=0.7, maxlen: Optional[int]=None, min_divergence_to_be_unsimilar: Optional[int]=None, min_match_length_to_be_similar: Optional[int]=None):
+    "Compare two strings with approximate heuristics."
 
     # Fast cutoff when one is empty
     if bool(x) != bool(y):
@@ -38,6 +72,12 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
     # and lists of strings also works fine
     if len(x) == len(y) and x == y:
         return True
+
+    if min_divergence_to_be_unsimilar is not None and len(x) <= min_divergence_to_be_unsimilar and len(y) <= min_divergence_to_be_unsimilar:
+        return True
+
+    if min_match_length_to_be_similar is not None and (len(x) < min_match_length_to_be_similar or len(y) < min_match_length_to_be_similar):
+        return False
 
     # TODO: Investigate performance and quality of this difflib ratio approach,
     # possibly one of the weakest links of the notebook diffing algorithm.
@@ -57,6 +97,9 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
     # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False)
     s = difflib.SequenceMatcher(None, x, y, autojunk=False)
 
+    if min_divergence_to_be_unsimilar is not None:
+        threshold = max(threshold, min_divergence_to_be_unsimilar / max(len(x), len(y)))
+
     # Use only the fast ratio approximations first
     if s.real_quick_ratio() < threshold:
         return False
@@ -67,7 +110,14 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
         # We know from above that there is not an exact similarity
         return False
 
-    return s.ratio() > threshold
+    if not s.ratio() > threshold:
+        return False
+
+    if min_match_length_to_be_similar is not None:
+        longest = max((m.size for m in s.get_matching_blocks()), default=0)
+        return longest >= min_match_length_to_be_similar
+    else:
+        return True
 
 
 def diff(a, b, path="", config=None):

diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py
@@ -14,14 +14,15 @@
 import re
 import copy
 from functools import lru_cache
+from typing import Union
 
 from ..diff_format import MappingDiffBuilder, DiffOp
 from ..utils import defaultdict2
 
 from .config import DiffConfig
 from .generic import (
     diff, diff_sequence_multilevel, compare_strings_approximate,
-    diff_string_lines,
+    diff_string_lines, get_text_similarity_options,
 )
 
 __all__ = ["diff_notebooks"]
@@ -37,6 +38,7 @@
 
 TEXT_MIMEDATA_MAX_COMPARE_LENGTH = 10000
 STREAM_MAX_COMPARE_LENGTH = 1000
+MIN_MATCH_LENGTH = 5
 
 
 # List of mimes we can diff recursively
@@ -55,40 +57,39 @@ def _is_base64(test_string, min_len=64):
     return _base64.match(''.join(test_string.splitlines()))
 
 
-# TODO: Maybe cleaner to make the split between strict/approximate
-#       an argument instead of separate functions.
+def _prepare_text_for_similarity(value: "Union[str, list[str]]", ignore_whitespace_lines: bool) -> str:
+    """Return `value` as one string; optionally drop whitespace-only lines."""
+    # The annotation is quoted because subscripting `list` fails before Python 3.9.
+    text = "".join(value) if isinstance(value, list) else value
+    if ignore_whitespace_lines:
+        # keepends=True so that surviving lines keep their original line endings
+        text = "".join(ln for ln in text.splitlines(True) if ln.strip())
+    return text
 
 
 @lru_cache(maxsize=1024, typed=False)
 def compare_text_approximate(x, y, maxlen=None):
-    # Fast cutoff when one is empty
-    if bool(x) != bool(y):
-        return False
+    settings = get_text_similarity_options()  # NOTE(review): not part of the lru_cache key, so results cached before an option change are reused
 
-    if isinstance(x, list):
-        x = "".join(x)
-    if isinstance(y, list):
-        y = "".join(y)
-
-    # TODO: Review whether this is wanted.
-    #       The motivation is to align tiny
-    #       strings in outputs such as a single number.
-    # Allow aligning short strings without comparison
-    nx = len(x)
-    ny = len(y)
-    shortlen = 10  # TODO: Add this to configuration framework
-    if nx < shortlen and ny < shortlen:
-        return True
+    x_norm = _prepare_text_for_similarity(x, settings["ignore_whitespace_lines"])
+    y_norm = _prepare_text_for_similarity(y, settings["ignore_whitespace_lines"])
 
-    return compare_strings_approximate(x, y, threshold=0.7, maxlen=maxlen)
+    max_len = max(len(x_norm), len(y_norm))
+    min_match_length = min(MIN_MATCH_LENGTH, max_len - 1)  # never demand a match longer than the text allows
+
+    return compare_strings_approximate(
+        x_norm, y_norm,  # compare the normalized text; passing raw x, y would bypass ignore_whitespace_lines
+        threshold=settings["threshold"],
+        min_divergence_to_be_unsimilar=10,
+        min_match_length_to_be_similar=min_match_length,
+        maxlen=maxlen,
+    )
 
 
 def compare_text_strict(x, y, maxlen=None):
     # TODO: Doesn't have to be 100% equal here?
-    if isinstance(x, list):
-        x = "".join(x)
-    if isinstance(y, list):
-        y = "".join(y)
+    x = _prepare_text_for_similarity(x, False)  # False: only join list input, keep whitespace-only lines
+    y = _prepare_text_for_similarity(y, False)
     if len(x) == len(y) and x == y:
         return True
     return compare_strings_approximate(x, y, threshold=0.95, maxlen=maxlen)

diff --git a/nbdime/tests/conftest.py b/nbdime/tests/conftest.py
@@ -339,7 +339,7 @@ def __getitem__(self, name):
 _db = NBTestDataBase()
 
 
-def _any_nb_name():
+def _any_nb_names():
     return _db.names
 
 
@@ -370,30 +370,36 @@ def _matching_nb_triplet_names():
                     triplets.append((basename, names[i], names[j]))
     return triplets
 
+def _params_and_ids_from_names(names):
+    """Build the ``params``/``ids`` keyword arguments passed to ``fixture``."""
+    # A plain string is its own id; a tuple of names is joined with "__".
+    ids = ["__".join(entry) if not isinstance(entry, str) else entry for entry in names]
+    return {'params': names, 'ids': ids}
+
 
 @fixture
 def db():
     return _db
 
 
-@fixture(params=_any_nb_name())
+@fixture(**_params_and_ids_from_names(_any_nb_names()))
 def any_nb(request):
     return _db[request.param]
 
 
-@fixture(params=_any_nb_pair_names())
+@fixture(**_params_and_ids_from_names(_any_nb_pair_names()))
 def any_nb_pair(request):
     a, b = request.param
     return _db[a], _db[b]
 
 
-@fixture(params=_matching_nb_pair_names())
+@fixture(**_params_and_ids_from_names(_matching_nb_pair_names()))
 def matching_nb_pairs(request):
     a, b = request.param
     return _db[a], _db[b]
 
 
-@fixture(params=_matching_nb_triplet_names())
+@fixture(**_params_and_ids_from_names(_matching_nb_triplet_names()))
 def matching_nb_triplets(request):
     a, b, c = request.param
     print(a, b, c)