Skip to content

Commit af78dcc

Browse files
committed
edits: batch parents-dict and neighbor cross-chunk (cx) edge updates to reduce I/O latency; add cache hit/miss logging and a hierarchical profiler; initial stitch mode
1 parent 59a1d14 commit af78dcc

File tree

10 files changed

+552
-229
lines changed

10 files changed

+552
-229
lines changed

pychunkedgraph/__init__.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
__version__ = "3.1.6"


import sys
import warnings
import logging as stdlib_logging  # alias: avoid clashing with pychunkedgraph.logging

# Suppress a noisy warning emitted by the python_jsonschema_objects dependency.
warnings.filterwarnings(
    "ignore", message="Schema id not specified", module="python_jsonschema_objects"
)

# Re-export the stdlib logging levels for caller convenience
# (pychunkedgraph.DEBUG, .INFO, .WARNING, .ERROR).
DEBUG = stdlib_logging.DEBUG
INFO = stdlib_logging.INFO
WARNING = stdlib_logging.WARNING
ERROR = stdlib_logging.ERROR

# Library best practice (Python logging HOWTO): attach a NullHandler so the
# library stays silent unless the application opts in, e.g. by calling
# configure_logging() below or doing its own logging setup.
stdlib_logging.getLogger(__name__).addHandler(stdlib_logging.NullHandler())


def configure_logging(level=stdlib_logging.INFO, format_str=None, stream=None):
    """
    Configure logging for pychunkedgraph. Call this to enable log output.

    Works in Jupyter notebooks and scripts. Safe to call repeatedly: any
    StreamHandler attached by a previous call is replaced, not duplicated.

    Args:
        level: Logging level (default: INFO). Use pychunkedgraph.DEBUG, .INFO, .WARNING, .ERROR
        format_str: Custom format string (optional)
        stream: Output stream (default: sys.stdout for Jupyter compatibility)

    Returns:
        The configured ``pychunkedgraph`` package logger.

    Example:
        import pychunkedgraph
        pychunkedgraph.configure_logging()  # Enable INFO level logging
        pychunkedgraph.configure_logging(pychunkedgraph.DEBUG)  # Enable DEBUG level
    """
    if format_str is None:
        format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    if stream is None:
        stream = sys.stdout

    # Root logger for the pychunkedgraph package.
    logger = stdlib_logging.getLogger(__name__)
    logger.setLevel(level)

    # Remove any StreamHandler attached earlier so reconfiguring with a
    # different level/format does not produce duplicate log lines.
    # NullHandler is NOT a StreamHandler subclass, so the library's silent
    # default handler survives this sweep untouched (the previous extra
    # `not isinstance(h, NullHandler)` check was dead code).
    for existing in list(logger.handlers):
        if isinstance(existing, stdlib_logging.StreamHandler):
            logger.removeHandler(existing)

    handler = stdlib_logging.StreamHandler(stream)
    handler.setLevel(level)
    handler.setFormatter(stdlib_logging.Formatter(format_str))
    logger.addHandler(handler)

    return logger


# NOTE(review): a previous revision called configure_logging() right here, at
# import time. That forced a stdout handler onto every consumer and defeated
# the NullHandler opt-in above, so the call was removed; applications should
# call configure_logging() themselves when they want log output.

pychunkedgraph/debug/profiler.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

import os
import time
from collections import defaultdict
from contextlib import contextmanager


class HierarchicalProfiler:
    """
    Hierarchical profiler for detailed timing breakdowns.
    Tracks timing at multiple levels and prints a breakdown at the end.
    """

    def __init__(self, enabled: bool = True):
        # When False, profile() is a no-op and print_report() prints nothing.
        self.enabled = enabled
        # Wall-clock samples keyed by dotted path, e.g. "outer.inner".
        self.timings: Dict[str, List[float]] = defaultdict(list)
        self.call_counts: Dict[str, int] = defaultdict(int)
        # NOTE(review): `stack` is never pushed to anywhere in this module;
        # kept (and cleared in reset) in case external code reads it.
        self.stack: List[Tuple[str, float]] = []
        # Names of the profile() blocks currently open, outermost first.
        self.current_path: List[str] = []

    @contextmanager
    def profile(self, name: str):
        """Context manager for profiling a code block.

        Nested uses build dotted paths ("outer.inner"). Timing and call
        counts are recorded even if the wrapped block raises.
        """
        if not self.enabled:
            yield
            return

        full_path = ".".join(self.current_path + [name])
        self.current_path.append(name)
        start_time = time.perf_counter()

        try:
            yield
        finally:
            # Record in `finally` so partial (exception-raising) runs still
            # show up and the path stack stays balanced.
            elapsed = time.perf_counter() - start_time
            self.timings[full_path].append(elapsed)
            self.call_counts[full_path] += 1
            self.current_path.pop()

    def print_report(self, operation_id=None):
        """Print a detailed timing breakdown (no-op when disabled or empty)."""
        if not self.enabled or not self.timings:
            return

        print("\n" + "=" * 80)
        # Bug fix: compare against None so operation_id=0 is still displayed.
        print(
            f"PROFILER REPORT{f' (operation_id={operation_id})' if operation_id is not None else ''}"
        )
        print("=" * 80)

        # Group by depth level (number of dots in the path).
        by_depth: Dict[int, List[Tuple[str, float, int]]] = defaultdict(list)
        for path, times in self.timings.items():
            depth = path.count(".")
            total_time = sum(times)
            count = self.call_counts[path]
            by_depth[depth].append((path, total_time, count))

        # Sort each level by total time, slowest first.
        for depth in sorted(by_depth.keys()):
            items = sorted(by_depth[depth], key=lambda x: -x[1])
            for path, total_time, count in items:
                indent = "  " * depth
                avg_time = total_time / count if count > 0 else 0
                if count > 1:
                    print(
                        f"{indent}{path}: {total_time*1000:.2f}ms total "
                        f"({count} calls, {avg_time*1000:.2f}ms avg)"
                    )
                else:
                    print(f"{indent}{path}: {total_time*1000:.2f}ms")

        # Summary: only dot-free paths are top-level.
        print("-" * 80)
        top_level_total = sum(
            sum(times) for path, times in self.timings.items() if "." not in path
        )
        print(f"Total top-level time: {top_level_total*1000:.2f}ms")

        # Top 10 slowest operations across all depths.
        print("\nTop 10 slowest operations:")
        all_ops = [
            (path, sum(times), self.call_counts[path])
            for path, times in self.timings.items()
        ]
        all_ops.sort(key=lambda x: -x[1])
        for i, (path, total_time, count) in enumerate(all_ops[:10]):
            pct = (total_time / top_level_total * 100) if top_level_total > 0 else 0
            print(f"  {i+1}. {path}: {total_time*1000:.2f}ms ({pct:.1f}%)")

        print("=" * 80 + "\n")

    def reset(self):
        """Reset all timing data."""
        self.timings.clear()
        self.call_counts.clear()
        self.stack.clear()
        self.current_path.clear()


# Global profiler instance. NOTE: profiling is enabled by DEFAULT; set
# PCG_PROFILER_ENABLED to anything other than "1" (e.g. "0") to disable.
PROFILER_ENABLED = os.environ.get("PCG_PROFILER_ENABLED", "1") == "1"
_profiler: Optional[HierarchicalProfiler] = None  # lazily created by get_profiler()


def get_profiler() -> HierarchicalProfiler:
    """Get or create the global profiler instance."""
    global _profiler
    if _profiler is None:
        _profiler = HierarchicalProfiler(enabled=PROFILER_ENABLED)
    return _profiler


def reset_profiler():
    """Reset the global profiler, if one has been created."""
    global _profiler
    if _profiler is not None:
        _profiler.reset()

pychunkedgraph/debug/utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,3 +56,20 @@ def update_graph_id(cg, new_graph_id:str):
5656
new_gc = GraphConfig(**old_gc)
5757
new_meta = ChunkedGraphMeta(new_gc, cg.meta.data_source, cg.meta.custom_data)
5858
cg.update_meta(new_meta, overwrite=True)
59+
60+
61+
def get_random_l1_ids(cg, n_chunks=100, n_per_chunk=10, seed=None):
    """Generate random node IDs drawn from randomly selected chunks.

    Args:
        cg: ChunkedGraph instance.
        n_chunks: Number of random chunks to sample.
        n_per_chunk: Number of random segment IDs drawn per chunk.
        seed: Optional seed for reproducible sampling.

    Returns:
        np.ndarray of uint64 node IDs. Chunks holding fewer than two
        segments are skipped, so fewer than ``n_chunks * n_per_chunk``
        IDs may be returned.

    NOTE(review): the name says "l1" but chunk coordinates come from
    ``layer_chunk_bounds[2]`` and IDs are built with ``layer=2`` --
    confirm which layer is actually intended.
    """
    # Bug fix: `if seed:` silently ignored a seed of 0; compare to None.
    if seed is not None:
        np.random.seed(seed)
    bounds = cg.meta.layer_chunk_bounds[2]
    ids = []
    for _ in range(n_chunks):
        cx, cy, cz = [np.random.randint(0, b) for b in bounds]
        chunk_id = cg.get_chunk_id(layer=2, x=cx, y=cy, z=cz)
        max_seg = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id))
        if max_seg < 2:
            # Chunk is (nearly) empty; nothing useful to sample.
            continue
        for seg in np.random.randint(1, max_seg + 1, n_per_chunk):
            ids.append(cg.get_node_id(np.uint64(seg), np.uint64(chunk_id)))
    return np.array(ids, dtype=np.uint64)

pychunkedgraph/graph/cache.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
"""
33
Cache nodes, parents, children and cross edges.
44
"""
5+
import traceback
6+
from collections import defaultdict
57
from sys import maxsize
68
from datetime import datetime
79

@@ -40,6 +42,30 @@ def __init__(self, cg):
4042
self.children_cache = LRUCache(maxsize=maxsize)
4143
self.cross_chunk_edges_cache = LRUCache(maxsize=maxsize)
4244

45+
# Stats tracking for cache hits/misses
46+
self.stats = {
47+
"parents": {"hits": 0, "misses": 0, "calls": 0},
48+
"children": {"hits": 0, "misses": 0, "calls": 0},
49+
"cross_chunk_edges": {"hits": 0, "misses": 0, "calls": 0},
50+
}
51+
# Track where calls/misses come from
52+
self.call_sources = defaultdict(lambda: defaultdict(lambda: {"calls": 0, "misses": 0}))
53+
54+
def _get_caller(self, skip_frames=2):
55+
"""Get caller info (filename:line:function)."""
56+
stack = traceback.extract_stack()
57+
# Skip frames: _get_caller, the cache method, and go to actual caller
58+
if len(stack) > skip_frames:
59+
frame = stack[-(skip_frames + 1)]
60+
return f"{frame.filename.split('/')[-1]}:{frame.lineno}:{frame.name}"
61+
return "unknown"
62+
63+
def _record_call(self, cache_type, misses=0):
64+
"""Record a call and its source."""
65+
caller = self._get_caller(skip_frames=3)
66+
self.call_sources[cache_type][caller]["calls"] += 1
67+
self.call_sources[cache_type][caller]["misses"] += misses
68+
4369
def __len__(self):
4470
return (
4571
len(self.parents_cache)
@@ -52,14 +78,53 @@ def clear(self):
5278
self.children_cache.clear()
5379
self.cross_chunk_edges_cache.clear()
5480

81+
def get_stats(self):
    """Return per-cache counters augmented with total, hit rate and call sites."""
    summary = {}
    for cache_name, counters in self.stats.items():
        lookups = counters["hits"] + counters["misses"]
        rate = counters["hits"] / lookups if lookups > 0 else 0
        summary[cache_name] = dict(
            counters,
            total=lookups,
            hit_rate=f"{rate:.1%}",
            sources=dict(self.call_sources[cache_name]),
        )
    return summary
94+
95+
def reset_stats(self):
    """Zero every hit/miss/call counter and forget recorded call sites."""
    for counters in self.stats.values():
        for key in ("hits", "misses", "calls"):
            counters[key] = 0
    self.call_sources.clear()
101+
55102
def parent(self, node_id: np.uint64, *, time_stamp: datetime = None):
    """Return the parent of ``node_id``, served from ``parents_cache`` when possible.

    Hit/miss accounting is done up front with an explicit membership probe,
    so the stats reflect the cache state as seen before the cached lookup
    below executes.
    """
    self.stats["parents"]["calls"] += 1
    is_cached = node_id in self.parents_cache
    miss_count = 0 if is_cached else 1
    if is_cached:
        self.stats["parents"]["hits"] += 1
    else:
        self.stats["parents"]["misses"] += 1
    self._record_call("parents", misses=miss_count)

    # The cachetools-decorated closure is rebuilt on every call so it can
    # capture the current ``time_stamp``; the cache object itself
    # (self.parents_cache) is shared across calls, so memoization still works.
    @cached(cache=self.parents_cache, key=lambda node_id: node_id)
    def parent_decorated(node_id):
        # NOTE(review): presumably raw_only=True makes the underlying lookup
        # bypass this cache (avoiding recursion) -- confirm against
        # ChunkedGraph.get_parent.
        return self._cg.get_parent(node_id, raw_only=True, time_stamp=time_stamp)

    return parent_decorated(node_id)
61117

62118
def children(self, node_id):
119+
self.stats["children"]["calls"] += 1
120+
is_cached = node_id in self.children_cache
121+
miss_count = 0 if is_cached else 1
122+
if is_cached:
123+
self.stats["children"]["hits"] += 1
124+
else:
125+
self.stats["children"]["misses"] += 1
126+
self._record_call("children", misses=miss_count)
127+
63128
@cached(cache=self.children_cache, key=lambda node_id: node_id)
64129
def children_decorated(node_id):
65130
children = self._cg.get_children(node_id, raw_only=True)
@@ -69,6 +134,15 @@ def children_decorated(node_id):
69134
return children_decorated(node_id)
70135

71136
def cross_chunk_edges(self, node_id, *, time_stamp: datetime = None):
137+
self.stats["cross_chunk_edges"]["calls"] += 1
138+
is_cached = node_id in self.cross_chunk_edges_cache
139+
miss_count = 0 if is_cached else 1
140+
if is_cached:
141+
self.stats["cross_chunk_edges"]["hits"] += 1
142+
else:
143+
self.stats["cross_chunk_edges"]["misses"] += 1
144+
self._record_call("cross_chunk_edges", misses=miss_count)
145+
72146
@cached(cache=self.cross_chunk_edges_cache, key=lambda node_id: node_id)
73147
def cross_edges_decorated(node_id):
74148
edges = self._cg.get_cross_chunk_edges(
@@ -82,7 +156,13 @@ def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None)
82156
node_ids = np.array(node_ids, dtype=NODE_ID, copy=False)
83157
if not node_ids.size:
84158
return node_ids
159+
self.stats["parents"]["calls"] += 1
85160
mask = np.in1d(node_ids, np.fromiter(self.parents_cache.keys(), dtype=NODE_ID))
161+
hits = int(np.sum(mask))
162+
misses = len(node_ids) - hits
163+
self.stats["parents"]["hits"] += hits
164+
self.stats["parents"]["misses"] += misses
165+
self._record_call("parents", misses=misses)
86166
parents = node_ids.copy()
87167
parents[mask] = self._parent_vec(node_ids[mask])
88168
parents[~mask] = self._cg.get_parents(
@@ -96,7 +176,13 @@ def children_multiple(self, node_ids: np.ndarray, *, flatten=False):
96176
node_ids = np.array(node_ids, dtype=NODE_ID, copy=False)
97177
if not node_ids.size:
98178
return result
179+
self.stats["children"]["calls"] += 1
99180
mask = np.in1d(node_ids, np.fromiter(self.children_cache.keys(), dtype=NODE_ID))
181+
hits = int(np.sum(mask))
182+
misses = len(node_ids) - hits
183+
self.stats["children"]["hits"] += hits
184+
self.stats["children"]["misses"] += misses
185+
self._record_call("children", misses=misses)
100186
cached_children_ = self._children_vec(node_ids[mask])
101187
result.update({id_: c_ for id_, c_ in zip(node_ids[mask], cached_children_)})
102188
result.update(self._cg.get_children(node_ids[~mask], raw_only=True))
@@ -114,9 +200,15 @@ def cross_chunk_edges_multiple(
114200
node_ids = np.array(node_ids, dtype=NODE_ID, copy=False)
115201
if not node_ids.size:
116202
return result
203+
self.stats["cross_chunk_edges"]["calls"] += 1
117204
mask = np.in1d(
118205
node_ids, np.fromiter(self.cross_chunk_edges_cache.keys(), dtype=NODE_ID)
119206
)
207+
hits = int(np.sum(mask))
208+
misses = len(node_ids) - hits
209+
self.stats["cross_chunk_edges"]["hits"] += hits
210+
self.stats["cross_chunk_edges"]["misses"] += misses
211+
self._record_call("cross_chunk_edges", misses=misses)
120212
cached_edges_ = self._cross_chunk_edges_vec(node_ids[mask])
121213
result.update(
122214
{id_: edges_ for id_, edges_ in zip(node_ids[mask], cached_edges_)}

0 commit comments

Comments
 (0)