Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions docs/source/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ The current output of `nbdime --config` is:
Ignore: {}
attachments: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
metadata: null
outputs: null
Expand All @@ -45,6 +47,8 @@ The current output of `nbdime --config` is:
base_url: "/"
browser: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
ip: "127.0.0.1"
metadata: null
Expand All @@ -58,6 +62,8 @@ The current output of `nbdime --config` is:
Ignore: {}
attachments: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
ignore_transients: true
input_strategy: null
Expand All @@ -73,6 +79,8 @@ The current output of `nbdime --config` is:
base_url: "/"
browser: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
ignore_transients: true
input_strategy: null
Expand Down Expand Up @@ -106,6 +114,8 @@ The current output of `nbdime --config` is:
Ignore: {}
attachments: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
metadata: null
outputs: null
Expand All @@ -115,6 +125,8 @@ The current output of `nbdime --config` is:
Ignore: {}
attachments: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
metadata: null
outputs: null
Expand All @@ -126,6 +138,8 @@ The current output of `nbdime --config` is:
base_url: "/"
browser: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
ip: "127.0.0.1"
metadata: null
Expand All @@ -139,6 +153,8 @@ The current output of `nbdime --config` is:
Ignore: {}
attachments: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
ignore_transients: true
input_strategy: null
Expand All @@ -154,6 +170,8 @@ The current output of `nbdime --config` is:
base_url: "/"
browser: null
color_words: false
text_similarity_ignore_whitespace: true
text_similarity_threshold: 0.3
details: null
ignore_transients: true
input_strategy: null
Expand Down
25 changes: 25 additions & 0 deletions nbdime/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
get_defaults_for_argparse, build_config, entrypoint_configurables,
Namespace
)
from .diffing.generic import set_text_similarity_options
from .diffing.notebooks import set_notebook_diff_targets, set_notebook_diff_ignores
from .gitfiles import is_gitref
from .ignorables import diff_ignorables
Expand Down Expand Up @@ -338,6 +339,25 @@ def add_diff_args(parser):
action=IgnorableAction,
help="process/ignore details not covered by other options.")

similarity = parser.add_argument_group(
title='similarity',
description='Control how text similarity is estimated when aligning cells.')
similarity.add_argument(
'--text-similarity-threshold',
dest='text_similarity_threshold',
metavar='RATIO',
type=float,
default=0.3,
help='minimum ratio (0-1) required to consider two text blocks similar',
)
similarity.add_argument(
'--no-text-similarity-ignore-whitespace',
dest='text_similarity_ignore_whitespace',
action='store_false',
default=True,
help='do not drop whitespace-only lines before computing similarity',
)


def add_diff_cli_args(parser):
"""Adds a set of arguments for CLI diff commands (i.e. not web).
Expand Down Expand Up @@ -405,6 +425,11 @@ def process_diff_flags(args):
args.sources, args.outputs, args.attachments, args.metadata,
args.id, args.details)

set_text_similarity_options(
threshold=getattr(args, 'text_similarity_threshold', None),
ignore_whitespace_lines=getattr(args, 'text_similarity_ignore_whitespace', None),
)


def resolve_diff_args(args):
"""Resolve ambiguity of path vs base/remote for git:
Expand Down
15 changes: 14 additions & 1 deletion nbdime/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from jupyter_core.paths import jupyter_config_path

from traitlets import Unicode, Enum, Integer, Bool, HasTraits, Dict, TraitError
from traitlets import Unicode, Enum, Integer, Bool, Float, HasTraits, Dict, TraitError
from traitlets.config.loader import JSONFileConfigLoader, ConfigFileNotFound

from .merging.notebooks import (
Expand Down Expand Up @@ -221,6 +221,19 @@ class _Diffing(_Ignorables):
"to git diff"),
).tag(config=True)

text_similarity_threshold = Float(
0.3,
min=0.0,
max=1.0,
help=("minimum ratio (0-1) for considering two text blocks similar "
"when aligning cells and outputs"),
).tag(config=True)

text_similarity_ignore_whitespace = Bool(
True,
help=("ignore whitespace-only lines when estimating text similarity"),
).tag(config=True)


class Diff(_Diffing):
pass
Expand Down
60 changes: 55 additions & 5 deletions nbdime/diffing/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import operator
from collections import defaultdict
import difflib
from typing import Optional, Union

from ..diff_format import SequenceDiffBuilder, MappingDiffBuilder, validate_diff
from ..diff_utils import count_consumed_symbols
Expand All @@ -17,6 +18,41 @@
__all__ = ["diff"]


# Process-wide defaults used for approximate text comparison. Written only by
# set_text_similarity_options() and read via get_text_similarity_options().
_text_similarity_settings = {
    "threshold": 0.3,
    "ignore_whitespace_lines": True,
}


def set_text_similarity_options(threshold: Optional[Union[int, float]]=None, ignore_whitespace_lines: Optional[bool]=None) -> None:
    """Configure defaults for approximate string comparisons.

    Arguments left as ``None`` keep their current value.

    Parameters
    ----------
    threshold: float, optional
        Minimum difflib ratio (0-1) used to consider strings similar when no
        explicit threshold is provided. Stored as a ``float``.
    ignore_whitespace_lines: bool, optional
        Whether to drop whitespace-only lines before computing similarity.
        Stored as ``bool(ignore_whitespace_lines)``.

    Raises
    ------
    TypeError
        If ``threshold`` is not an ``int`` or ``float`` (``bool`` is rejected).
    ValueError
        If ``threshold`` is outside the closed interval [0, 1] (this also
        rejects NaN, which fails both comparisons).
    """

    if threshold is not None:
        # bool is a subclass of int, so it must be excluded explicitly;
        # otherwise True/False would silently become a 1.0/0.0 threshold.
        if isinstance(threshold, bool) or not isinstance(threshold, (int, float)):
            raise TypeError("text similarity threshold must be a number")
        if not (0.0 <= threshold <= 1.0):
            raise ValueError("text similarity threshold must be between 0 and 1")
        _text_similarity_settings["threshold"] = float(threshold)

    if ignore_whitespace_lines is not None:
        _text_similarity_settings["ignore_whitespace_lines"] = bool(ignore_whitespace_lines)


def get_text_similarity_options() -> dict:
    """Return a copy of the current similarity defaults.

    The returned dict has the keys ``"threshold"`` and
    ``"ignore_whitespace_lines"``; mutating it does not affect the stored
    settings.
    """

    return _text_similarity_settings.copy()


def default_predicates():
return defaultdict(lambda: (operator.__eq__,))

Expand All @@ -25,10 +61,8 @@ def default_differs():
return defaultdict(lambda: diff)


def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
"Compare to strings with approximate heuristics."
# TODO: Add configuration framework
# TODO: Tune threshold with realistic sources
def compare_strings_approximate(x: str, y: str, threshold: float=0.7, maxlen: Optional[int]=None, min_divergence_to_be_unsimilar: Optional[int]=None, min_match_length_to_be_similar: Optional[int]=None):
"Compare two strings with approximate heuristics."

# Fast cutoff when one is empty
if bool(x) != bool(y):
Expand All @@ -38,6 +72,12 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
# and lists of strings also works fine
if len(x) == len(y) and x == y:
return True

if min_divergence_to_be_unsimilar is not None and len(x) <= min_divergence_to_be_unsimilar and len(y) <= min_divergence_to_be_unsimilar:
return True

if min_match_length_to_be_similar is not None and (len(x) < min_match_length_to_be_similar or len(y) < min_match_length_to_be_similar):
return False

# TODO: Investigate performance and quality of this difflib ratio approach,
# possibly one of the weakest links of the notebook diffing algorithm.
Expand All @@ -57,6 +97,9 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
# s = difflib.SequenceMatcher(lambda c: c in (" ", "\t"), x, y, autojunk=False)
s = difflib.SequenceMatcher(None, x, y, autojunk=False)

if min_divergence_to_be_unsimilar is not None:
threshold = max(threshold, min_divergence_to_be_unsimilar / max(len(x), len(y)))

# Use only the fast ratio approximations first
if s.real_quick_ratio() < threshold:
return False
Expand All @@ -67,7 +110,14 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
# We know from above that there is not an exact similarity
return False

return s.ratio() > threshold
if not s.ratio() > threshold:
return False

if min_match_length_to_be_similar is not None:
longest = max((m.size for m in s.get_matching_blocks()), default=0)
return longest >= min_match_length_to_be_similar
else:
return True


def diff(a, b, path="", config=None):
Expand Down
51 changes: 26 additions & 25 deletions nbdime/diffing/notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,15 @@
import re
import copy
from functools import lru_cache
from typing import Union

from ..diff_format import MappingDiffBuilder, DiffOp
from ..utils import defaultdict2

from .config import DiffConfig
from .generic import (
diff, diff_sequence_multilevel, compare_strings_approximate,
diff_string_lines,
diff_string_lines, get_text_similarity_options,
)

__all__ = ["diff_notebooks"]
Expand All @@ -37,6 +38,7 @@

TEXT_MIMEDATA_MAX_COMPARE_LENGTH = 10000
STREAM_MAX_COMPARE_LENGTH = 1000
MIN_MATCH_LENGTH = 5


# List of mimes we can diff recursively
Expand All @@ -55,40 +57,39 @@ def _is_base64(test_string, min_len=64):
return _base64.match(''.join(test_string.splitlines()))


# TODO: Maybe cleaner to make the split between strict/approximate
# an argument instead of separate functions.
def _prepare_text_for_similarity(value: "Union[str, list[str]]", ignore_whitespace_lines: bool) -> str:
    """Normalize text for approximate comparison, stripping ignorable parts.

    A list is concatenated without any separator. When
    ``ignore_whitespace_lines`` is true, lines that are empty or contain only
    whitespace are removed; the remaining lines keep their line endings.
    """
    # NOTE: the parameter annotation is a string on purpose. Subscripting the
    # builtin ``list`` is evaluated at definition time and raises TypeError on
    # Python < 3.9.
    if isinstance(value, list):
        value = "".join(value)
    if ignore_whitespace_lines:
        # keepends=True so surviving lines are re-joined byte-for-byte.
        value = "".join(
            line for line in value.splitlines(True) if line.strip()
        )
    return value


@lru_cache(maxsize=1024, typed=False)
def _compare_text_approximate_cached(x, y, maxlen, threshold, ignore_whitespace_lines):
    """Cached worker for compare_text_approximate.

    The similarity settings are explicit arguments so that they are part of
    the cache key; a result computed under one configuration is never reused
    under another.
    """
    # Fast cutoff when one is empty
    if bool(x) != bool(y):
        return False

    x_norm = _prepare_text_for_similarity(x, ignore_whitespace_lines)
    y_norm = _prepare_text_for_similarity(y, ignore_whitespace_lines)

    # Never demand a match longer than the texts can contain: clamp the
    # required match length to one less than the longer normalized text.
    max_len = max(len(x_norm), len(y_norm))
    min_match_length = min(MIN_MATCH_LENGTH, max_len - 1)

    # Compare the *normalized* texts, so that ignoring whitespace-only lines
    # actually influences the similarity verdict.
    return compare_strings_approximate(
        x_norm, y_norm,
        threshold=threshold,
        min_divergence_to_be_unsimilar=10,
        min_match_length_to_be_similar=min_match_length,
        maxlen=maxlen,
    )


def compare_text_approximate(x, y, maxlen=None):
    """Return whether two texts are similar enough to be aligned.

    The threshold and whitespace handling are read from
    get_text_similarity_options() on every call. ``maxlen`` is forwarded
    unchanged to compare_strings_approximate. Results are memoized per
    (x, y, maxlen, threshold, ignore_whitespace_lines).
    """
    settings = get_text_similarity_options()
    return _compare_text_approximate_cached(
        x, y, maxlen,
        settings["threshold"],
        settings["ignore_whitespace_lines"],
    )


# Keep the lru_cache management helpers reachable under the public name.
compare_text_approximate.cache_info = _compare_text_approximate_cached.cache_info
compare_text_approximate.cache_clear = _compare_text_approximate_cached.cache_clear


def compare_text_strict(x, y, maxlen=None):
    """Compare two texts using a high (0.95) similarity threshold.

    Lists are joined into a single string first; whitespace-only lines are
    kept. Identical texts short-circuit to True.
    """
    # TODO: Doesn't have to be 100% equal here?
    left = _prepare_text_for_similarity(x, False)
    right = _prepare_text_for_similarity(y, False)
    identical = len(left) == len(right) and left == right
    if identical:
        return True
    return compare_strings_approximate(left, right, threshold=0.95, maxlen=maxlen)
Expand Down
16 changes: 11 additions & 5 deletions nbdime/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,7 +339,7 @@ def __getitem__(self, name):
_db = NBTestDataBase()


def _any_nb_names():
    """Return the ``names`` attribute of the module-level test database."""
    return _db.names


Expand Down Expand Up @@ -370,30 +370,36 @@ def _matching_nb_triplet_names():
triplets.append((basename, names[i], names[j]))
return triplets

def _params_and_ids_from_names(names):
    """Build the ``params``/``ids`` keyword arguments for a fixture.

    Each entry of ``names`` is either a string, used as its own id, or an
    iterable of strings, whose id is the parts joined with ``"__"``.

    Returns a dict with the keys ``"params"`` and ``"ids"``, suitable for
    unpacking with ``**`` into a fixture decorator.
    """
    # Materialize once: ``names`` is needed both as the params and to derive
    # the ids, so a one-shot iterable must not be consumed by the first use.
    params = list(names)
    ids = [n if isinstance(n, str) else "__".join(n) for n in params]
    return {
        'params': params,
        'ids': ids,
    }


@fixture
def db():
    """Provide the module-level notebook test database."""
    database = _db
    return database


@fixture(**_params_and_ids_from_names(_any_nb_names()))
def any_nb(request):
    """Yield each notebook in the test database, one per parametrized run."""
    name = request.param
    return _db[name]


@fixture(**_params_and_ids_from_names(_any_nb_pair_names()))
def any_nb_pair(request):
    """Provide a pair of notebooks looked up by the two names in the param."""
    first, second = request.param
    return _db[first], _db[second]


@fixture(**_params_and_ids_from_names(_matching_nb_pair_names()))
def matching_nb_pairs(request):
    """Provide a pair of notebooks looked up by the two names in the param."""
    first, second = request.param
    return _db[first], _db[second]


@fixture(params=_matching_nb_triplet_names())
@fixture(**_params_and_ids_from_names(_matching_nb_triplet_names()))
def matching_nb_triplets(request):
a, b, c = request.param
print(a, b, c)
Expand Down
Loading