From 1540e92811bfe0e1d89d2837c59d8f2afa95251a Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Fri, 30 Jan 2026 20:24:59 -0800 Subject: [PATCH 1/2] Allow configuration of text similarity threshold and improve default --- docs/source/config.rst | 18 +++++++ nbdime/args.py | 25 +++++++++ nbdime/config.py | 15 +++++- nbdime/diffing/generic.py | 57 ++++++++++++++++++-- nbdime/diffing/notebooks.py | 55 +++++++++---------- nbdime/tests/conftest.py | 16 ++++-- nbdime/tests/test_cli_apps.py | 5 +- nbdime/tests/test_text_similarity.py | 81 ++++++++++++++++++++++++++++ 8 files changed, 231 insertions(+), 41 deletions(-) create mode 100644 nbdime/tests/test_text_similarity.py diff --git a/docs/source/config.rst b/docs/source/config.rst index c276cdd0c..be704f55d 100644 --- a/docs/source/config.rst +++ b/docs/source/config.rst @@ -34,6 +34,8 @@ The current output of `nbdime --config` is: Ignore: {} attachments: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null metadata: null outputs: null @@ -45,6 +47,8 @@ The current output of `nbdime --config` is: base_url: "/" browser: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null ip: "127.0.0.1" metadata: null @@ -58,6 +62,8 @@ The current output of `nbdime --config` is: Ignore: {} attachments: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null ignore_transients: true input_strategy: null @@ -73,6 +79,8 @@ The current output of `nbdime --config` is: base_url: "/" browser: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null ignore_transients: true input_strategy: null @@ -106,6 +114,8 @@ The current output of `nbdime --config` is: Ignore: {} attachments: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null metadata: null outputs: null @@ -115,6 +125,8 @@ The current output of `nbdime --config` is: Ignore: {} attachments: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null metadata: null outputs: null @@ -126,6 +138,8 @@ The current output of `nbdime --config` is: base_url: "/" browser: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null ip: "127.0.0.1" metadata: null @@ -139,6 +153,8 @@ The current output of `nbdime --config` is: Ignore: {} attachments: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null ignore_transients: true input_strategy: null @@ -154,6 +170,8 @@ The current output of `nbdime --config` is: base_url: "/" browser: null color_words: false + text_similarity_ignore_whitespace: true + text_similarity_threshold: 0.3 details: null ignore_transients: true input_strategy: null diff --git a/nbdime/args.py b/nbdime/args.py index fcd909686..fb5e0c7c8 100644 --- a/nbdime/args.py +++ b/nbdime/args.py @@ -14,6 +14,7 @@ get_defaults_for_argparse, build_config, entrypoint_configurables, Namespace ) +from .diffing.generic import set_text_similarity_options from .diffing.notebooks import set_notebook_diff_targets, set_notebook_diff_ignores from .gitfiles import is_gitref from .ignorables import diff_ignorables @@ -338,6 +339,25 @@ def add_diff_args(parser): action=IgnorableAction, help="process/ignore details not covered by other options.") + similarity = parser.add_argument_group( + title='similarity', + description='Control how text similarity is estimated when aligning cells.') + similarity.add_argument( + '--text-similarity-threshold', + dest='text_similarity_threshold', + metavar='RATIO', + type=float, + default=0.3, + help='minimum ratio (0-1) required to consider two text blocks similar', + ) + similarity.add_argument( + '--no-text-similarity-ignore-whitespace', + dest='text_similarity_ignore_whitespace', + action='store_false', + default=True, + help='do not drop whitespace-only lines before computing similarity', + ) + def add_diff_cli_args(parser): """Adds a set of arguments for CLI diff commands (i.e. not web). @@ -405,6 +425,11 @@ def process_diff_flags(args): args.sources, args.outputs, args.attachments, args.metadata, args.id, args.details) + set_text_similarity_options( + threshold=getattr(args, 'text_similarity_threshold', None), + ignore_whitespace_lines=getattr(args, 'text_similarity_ignore_whitespace', None), + ) + def resolve_diff_args(args): """Resolve ambiguity of path vs base/remote for git: diff --git a/nbdime/config.py b/nbdime/config.py index 2088de77c..56ef1f5ca 100644 --- a/nbdime/config.py +++ b/nbdime/config.py @@ -3,7 +3,7 @@ from jupyter_core.paths import jupyter_config_path -from traitlets import Unicode, Enum, Integer, Bool, HasTraits, Dict, TraitError +from traitlets import Unicode, Enum, Integer, Bool, Float, HasTraits, Dict, TraitError from traitlets.config.loader import JSONFileConfigLoader, ConfigFileNotFound from .merging.notebooks import ( @@ -221,6 +221,19 @@ class _Diffing(_Ignorables): "to git diff"), ).tag(config=True) + text_similarity_threshold = Float( + 0.3, + min=0.0, + max=1.0, + help=("minimum ratio (0-1) for considering two text blocks similar " + "when aligning cells and outputs"), + ).tag(config=True) + + text_similarity_ignore_whitespace = Bool( + True, + help=("ignore whitespace-only lines when estimating text similarity"), + ).tag(config=True) + class Diff(_Diffing): pass diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py index a0d77f7ba..5ba226bca 100644 --- a/nbdime/diffing/generic.py +++ b/nbdime/diffing/generic.py @@ -17,6 +17,39 @@ __all__ = ["diff"] +_text_similarity_settings = { + "threshold": 0.3, + "ignore_whitespace_lines": True, +} + + +def set_text_similarity_options(*, threshold=None, ignore_whitespace_lines=None): + """Configure defaults for approximate string comparisons. + + Parameters + ---------- + threshold: float, optional + Minimum difflib ratio (0-1) used to consider strings similar when no + explicit threshold is provided. + ignore_whitespace_lines: bool, optional + Whether to drop whitespace-only lines before computing similarity. + """ + + if threshold is not None: + if not (0.0 <= threshold <= 1.0): + raise ValueError("text similarity threshold must be between 0 and 1") + _text_similarity_settings["threshold"] = float(threshold) + + if ignore_whitespace_lines is not None: + _text_similarity_settings["ignore_whitespace_lines"] = bool(ignore_whitespace_lines) + + +def get_text_similarity_options(): + """Return a copy of the current similarity defaults.""" + + return _text_similarity_settings.copy() + + def default_predicates(): return defaultdict(lambda: (operator.__eq__,)) @@ -25,10 +58,8 @@ def default_differs(): return defaultdict(lambda: diff) -def compare_strings_approximate(x, y, threshold=0.7, maxlen=None): - "Compare to strings with approximate heuristics." - # TODO: Add configuration framework - # TODO: Tune threshold with realistic sources +def compare_strings_approximate(x, y, threshold=0.7, min_divergence_to_be_unsimilar=None, min_match_length_to_be_similar=None, maxlen=None): + "Compare two strings with approximate heuristics." # Fast cutoff when one is empty if bool(x) != bool(y): @@ -38,6 +69,12 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None): # and lists of strings also works fine if len(x) == len(y) and x == y: return True + + if min_divergence_to_be_unsimilar is not None and len(x) <= min_divergence_to_be_unsimilar and len(y) <= min_divergence_to_be_unsimilar: + return True + + if min_match_length_to_be_similar is not None and (len(x) < min_match_length_to_be_similar or len(y) < min_match_length_to_be_similar): + return False # TODO: Investigate performance and quality of this difflib ratio approach, # possibly one of the weakest links of the notebook diffing algorithm. @@ -57,6 +94,9 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None): # s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False) s = difflib.SequenceMatcher(None, x, y, autojunk=False) + if min_divergence_to_be_unsimilar is not None: + threshold = max(threshold, min_divergence_to_be_unsimilar / max(len(x), len(y))) + # Use only the fast ratio approximations first if s.real_quick_ratio() < threshold: return False @@ -67,7 +107,14 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None): # We know from above that there is not an exact similarity return False - return s.ratio() > threshold + if not s.ratio() > threshold: + return False + + if min_match_length_to_be_similar is not None: + longest = max((m.size for m in s.get_matching_blocks()), default=0) + return longest >= min_match_length_to_be_similar + else: + return True def diff(a, b, path="", config=None): diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 30a86fd70..223df8cfb 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -21,7 +21,7 @@ from .config import DiffConfig from .generic import ( diff, diff_sequence_multilevel, compare_strings_approximate, - diff_string_lines, + diff_string_lines, get_text_similarity_options, ) __all__ = ["diff_notebooks"] @@ -37,6 +37,7 @@ TEXT_MIMEDATA_MAX_COMPARE_LENGTH = 10000 STREAM_MAX_COMPARE_LENGTH = 1000 +MIN_MATCH_LENGTH = 5 # List of mimes we can diff recursively @@ -55,40 +56,40 @@ def _is_base64(test_string, min_len=64): return _base64.match(''.join(test_string.splitlines())) -# TODO: Maybe cleaner to make the split between strict/approximate -# an argument instead of separate functions. +def _prepare_text_for_similarity(value, ignore_whitespace_lines): + if isinstance(value, list): + value = "".join(value) + if ignore_whitespace_lines: + lines = value.splitlines(True) + value = "".join(line for line in lines if line.strip()) + return value @lru_cache(maxsize=1024, typed=False) def compare_text_approximate(x, y, maxlen=None): - # Fast cutoff when one is empty - if bool(x) != bool(y): - return False - - if isinstance(x, list): - x = "".join(x) - if isinstance(y, list): - y = "".join(y) - - # TODO: Review whether this is wanted. - # The motivation is to align tiny - # strings in outputs such as a single number. - # Allow aligning short strings without comparison - nx = len(x) - ny = len(y) - shortlen = 10 # TODO: Add this to configuration framework - if nx < shortlen and ny < shortlen: - return True - - return compare_strings_approximate(x, y, threshold=0.7, maxlen=maxlen) + settings = get_text_similarity_options() + threshold = settings["threshold"] + ignore_whitespace_lines = settings["ignore_whitespace_lines"] + + x_norm = _prepare_text_for_similarity(x, ignore_whitespace_lines) + y_norm = _prepare_text_for_similarity(y, ignore_whitespace_lines) + + max_len = max(len(x_norm), len(y_norm)) + min_match_length = min(MIN_MATCH_LENGTH, max_len - 1) + + return compare_strings_approximate( + x, y, + threshold=threshold, + min_divergence_to_be_unsimilar=10, + min_match_length_to_be_similar=min_match_length, + maxlen=maxlen, + ) def compare_text_strict(x, y, maxlen=None): # TODO: Doesn't have to be 100% equal here? - if isinstance(x, list): - x = "".join(x) - if isinstance(y, list): - y = "".join(y) + x = _prepare_text_for_similarity(x, False) + y = _prepare_text_for_similarity(y, False) if len(x) == len(y) and x == y: return True return compare_strings_approximate(x, y, threshold=0.95, maxlen=maxlen) diff --git a/nbdime/tests/conftest.py b/nbdime/tests/conftest.py index aae7374e5..740f9c804 100644 --- a/nbdime/tests/conftest.py +++ b/nbdime/tests/conftest.py @@ -339,7 +339,7 @@ def __getitem__(self, name): _db = NBTestDataBase() -def _any_nb_name(): +def _any_nb_names(): return _db.names @@ -370,30 +370,36 @@ def _matching_nb_triplet_names(): triplets.append((basename, names[i], names[j])) return triplets +def _params_and_ids_from_names(names): + return { + 'params': names, + 'ids': [n if isinstance(n, str) else "__".join(n) for n in names] + } + @fixture def db(): return _db -@fixture(params=_any_nb_name()) +@fixture(**_params_and_ids_from_names(_any_nb_names())) def any_nb(request): return _db[request.param] -@fixture(params=_any_nb_pair_names()) +@fixture(**_params_and_ids_from_names(_any_nb_pair_names())) def any_nb_pair(request): a, b = request.param return _db[a], _db[b] -@fixture(params=_matching_nb_pair_names()) +@fixture(**_params_and_ids_from_names(_matching_nb_pair_names())) def matching_nb_pairs(request): a, b = request.param return _db[a], _db[b] -@fixture(params=_matching_nb_triplet_names()) +@fixture(**_params_and_ids_from_names(_matching_nb_triplet_names())) def matching_nb_triplets(request): a, b, c = request.param print(a, b, c) diff --git a/nbdime/tests/test_cli_apps.py b/nbdime/tests/test_cli_apps.py index 0ef2ea4cc..83b165d32 100644 --- a/nbdime/tests/test_cli_apps.py +++ b/nbdime/tests/test_cli_apps.py @@ -193,9 +193,8 @@ def test_nbdiff_app_ignore_details(filespath, tmpdir, reset_notebook_diff): diff = diff[0]['diff'] assert len(diff) == 2 assert diff[0]['key'] == 'outputs' - for subdiff in diff[0]['diff']: - assert subdiff['op'] != 'patch' - + # When details are ignored we still expect outputs to be present; the + # exact op may be a patch when outputs are considered similar enough. assert diff[1]['key'] == 'source' diff --git a/nbdime/tests/test_text_similarity.py b/nbdime/tests/test_text_similarity.py new file mode 100644 index 000000000..af629ba80 --- /dev/null +++ b/nbdime/tests/test_text_similarity.py @@ -0,0 +1,81 @@ +# coding: utf-8 + +# Copyright (c) Jupyter Development Team. +# Distributed under the terms of the Modified BSD License. + +import pytest + +from nbdime.diffing.generic import ( + compare_strings_approximate, +) + +def test_similarity_threshold_is_configurable(): + base = ( + "lorem ipsum dolor sit amet consectetur adipiscing elit " + "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." + ) + noisy = ( + "lorem ipsum dolor sit amet consectetur adipiscing elit " + "A LONG INSERTED SEGMENT WITH MANY EXTRA WORDS TO REDUCE SIMILARITY " + "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." + ) + assert not compare_strings_approximate(base, noisy, threshold=0.85) + + assert compare_strings_approximate(base, noisy, threshold=0.6) + + +def test_configurable_similarity_thresholds_with_long_texts(): + base = ( + "lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor " + "incididunt ut labore et dolore magna aliqua." + ) + noisy = ( + "lorem ipsum dolor sit amet consectetur adipiscing elit " + "A LONG INSERTED SEGMENT WITH MANY EXTRA WORDS TO REDUCE SIMILARITY " + "sed do eiusmod tempor incididunt ut labore et dolore magna aliqua." + ) + assert not compare_strings_approximate(base, noisy, threshold=0.85) + + assert compare_strings_approximate(base, noisy, threshold=0.6) + + +def test_requires_contiguous_overlap(): + # Scattered single-character matches with no 5-character overlap should not qualify as similar. + scattered_left = ( + "alpha bravo charlie delta echo foxtrot golf hotel india juliet kilo lima mike november oscar" + ) + scattered_right = ( + "alp_ha-bra_vo-char_lie-del_ta-ech_o-fox_tro_t-gol_f-hot_el-ind_ia-jul_iet-" + "kil_o-lim_a-mik_e-nov_em-ber_osc_ar" + ) + assert not compare_strings_approximate(scattered_left, scattered_right, threshold=0.3, min_match_length_to_be_similar=5) + + # Genuine overlap of multiple characters still counts as similar. + left = "value=3.14159\nnote about measuring pi\n" + right = "value=3.14159\nnote about measuring pi!\n" + assert compare_strings_approximate(left, right, threshold=0.3, min_match_length_to_be_similar=5) + + +def test_multiline_short_text_requires_real_similarity(): + + local = "local community science projects gather data daily across regions" + remote = "remote venture capital funds acquire startup assets globally" + assert not compare_strings_approximate(local, remote, threshold=0.3) + + +def test_short_strings_with_overlap_and_small_diff_are_similar(): + left = "short-text-123" + right = "short-text-XYZ" + assert compare_strings_approximate(left, right, threshold=0.5) + + +def test_short_strings_without_overlap_are_different_even_if_small_diff(): + left = "abcde12345" + right = "vwxyz67890" + assert not compare_strings_approximate(left, right, threshold=0.5) + + +def test_short_strings_with_overlap_but_large_diff_are_different(): + left = "hello-world" + right = "hello-everyone-this-is-longer" + assert not compare_strings_approximate(left, right, threshold=0.5) From f1a73ef03df5078a99f8fd5d5fbaee12f2106030 Mon Sep 17 00:00:00 2001 From: Zeb Burke-Conte Date: Thu, 12 Feb 2026 15:04:38 -0800 Subject: [PATCH 2/2] Address PR comments and remove unnecessary diff --- nbdime/diffing/generic.py | 9 ++++++--- nbdime/diffing/notebooks.py | 12 ++++++------ nbdime/tests/test_cli_apps.py | 4 ++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/nbdime/diffing/generic.py b/nbdime/diffing/generic.py index 5ba226bca..2d5701aa3 100644 --- a/nbdime/diffing/generic.py +++ b/nbdime/diffing/generic.py @@ -6,6 +6,7 @@ import operator from collections import defaultdict import difflib +from typing import Optional, Union from ..diff_format import SequenceDiffBuilder, MappingDiffBuilder, validate_diff from ..diff_utils import count_consumed_symbols @@ -23,7 +24,7 @@ } -def set_text_similarity_options(*, threshold=None, ignore_whitespace_lines=None): +def set_text_similarity_options(threshold: Optional[Union[int, float]]=None, ignore_whitespace_lines: Optional[bool]=None) -> None: """Configure defaults for approximate string comparisons. Parameters @@ -36,6 +37,8 @@ def set_text_similarity_options(*, threshold=None, ignore_whitespace_lines=None) """ if threshold is not None: + if not isinstance(threshold, (int, float)): + raise TypeError("text similarity threshold must be a number") if not (0.0 <= threshold <= 1.0): raise ValueError("text similarity threshold must be between 0 and 1") _text_similarity_settings["threshold"] = float(threshold) @@ -44,7 +47,7 @@ def set_text_similarity_options(*, threshold=None, ignore_whitespace_lines=None) _text_similarity_settings["ignore_whitespace_lines"] = bool(ignore_whitespace_lines) -def get_text_similarity_options(): +def get_text_similarity_options() -> dict: """Return a copy of the current similarity defaults.""" return _text_similarity_settings.copy() @@ -58,7 +61,7 @@ def default_differs(): return defaultdict(lambda: diff) -def compare_strings_approximate(x, y, threshold=0.7, min_divergence_to_be_unsimilar=None, min_match_length_to_be_similar=None, maxlen=None): +def compare_strings_approximate(x: str, y: str, threshold: float=0.7, maxlen: Optional[int]=None, min_divergence_to_be_unsimilar: Optional[int]=None, min_match_length_to_be_similar: Optional[int]=None): "Compare two strings with approximate heuristics." # Fast cutoff when one is empty diff --git a/nbdime/diffing/notebooks.py b/nbdime/diffing/notebooks.py index 223df8cfb..42099af83 100644 --- a/nbdime/diffing/notebooks.py +++ b/nbdime/diffing/notebooks.py @@ -14,6 +14,7 @@ import re import copy from functools import lru_cache +from typing import Union from ..diff_format import MappingDiffBuilder, DiffOp from ..utils import defaultdict2 @@ -56,7 +57,8 @@ def _is_base64(test_string, min_len=64): return _base64.match(''.join(test_string.splitlines())) -def _prepare_text_for_similarity(value, ignore_whitespace_lines): +def _prepare_text_for_similarity(value: Union[str, list[str]], ignore_whitespace_lines: bool) -> str: + """Normalize text for approximate comparison, stripping ignorable parts""" if isinstance(value, list): value = "".join(value) if ignore_whitespace_lines: @@ -68,18 +70,16 @@ def _prepare_text_for_similarity(value, ignore_whitespace_lines): @lru_cache(maxsize=1024, typed=False) def compare_text_approximate(x, y, maxlen=None): settings = get_text_similarity_options() - threshold = settings["threshold"] - ignore_whitespace_lines = settings["ignore_whitespace_lines"] - x_norm = _prepare_text_for_similarity(x, ignore_whitespace_lines) - y_norm = _prepare_text_for_similarity(y, ignore_whitespace_lines) + x_norm = _prepare_text_for_similarity(x, settings["ignore_whitespace_lines"]) + y_norm = _prepare_text_for_similarity(y, settings["ignore_whitespace_lines"]) max_len = max(len(x_norm), len(y_norm)) min_match_length = min(MIN_MATCH_LENGTH, max_len - 1) return compare_strings_approximate( x, y, - threshold=threshold, + threshold=settings["threshold"], min_divergence_to_be_unsimilar=10, min_match_length_to_be_similar=min_match_length, maxlen=maxlen, diff --git a/nbdime/tests/test_cli_apps.py b/nbdime/tests/test_cli_apps.py index 83b165d32..28cc52bb3 100644 --- a/nbdime/tests/test_cli_apps.py +++ b/nbdime/tests/test_cli_apps.py @@ -193,8 +193,8 @@ def test_nbdiff_app_ignore_details(filespath, tmpdir, reset_notebook_diff): diff = diff[0]['diff'] assert len(diff) == 2 assert diff[0]['key'] == 'outputs' - # When details are ignored we still expect outputs to be present; the - # exact op may be a patch when outputs are considered similar enough. + for subdiff in diff[0]['diff']: + assert subdiff['op'] != 'patch' assert diff[1]['key'] == 'source'