diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 1e048a94a..2a9dad726 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.18.3 +current_version = 3.0.10 commit = True tag = True diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 899f0431f..fd20bf4b7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -4,9 +4,11 @@ on: push: branches: - "main" + - "pcgv3" pull_request: branches: - "main" + - "pcgv3" jobs: unit-tests: diff --git a/pychunkedgraph/__init__.py b/pychunkedgraph/__init__.py index 819c3f307..84994dc59 100644 --- a/pychunkedgraph/__init__.py +++ b/pychunkedgraph/__init__.py @@ -1 +1 @@ -__version__ = "2.18.3" +__version__ = "3.0.10" diff --git a/pychunkedgraph/app/__init__.py b/pychunkedgraph/app/__init__.py index 3e938628b..262849258 100644 --- a/pychunkedgraph/app/__init__.py +++ b/pychunkedgraph/app/__init__.py @@ -105,6 +105,8 @@ def configure_app(app): with app.app_context(): from ..ingest.rq_cli import init_rq_cmds from ..ingest.cli import init_ingest_cmds + from ..ingest.cli_upgrade import init_upgrade_cmds init_rq_cmds(app) init_ingest_cmds(app) + init_upgrade_cmds(app) diff --git a/pychunkedgraph/debug/cross_edge_test.py b/pychunkedgraph/debug/cross_edge_test.py deleted file mode 100644 index 25bacfa0b..000000000 --- a/pychunkedgraph/debug/cross_edge_test.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -from datetime import datetime -import numpy as np - -from pychunkedgraph.graph import chunkedgraph -from pychunkedgraph.graph import attributes - -#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/svenmd/.cloudvolume/secrets/google-secret.json" - -layer = 2 -n_chunks = 1000 -n_segments_per_chunk = 200 -# timestamp = datetime.datetime.fromtimestamp(1588875769) -timestamp = datetime.utcnow() - -cg = chunkedgraph.ChunkedGraph(graph_id="pinky_nf_v2") - -np.random.seed(42) - -node_ids = [] -for _ in range(n_chunks): - c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) - c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) - c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) - - chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) - - max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) - - if max_segment_id < 10: - continue - - segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) - - for segment_id in segment_ids: - node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) - -rows = cg.client.read_nodes(node_ids=node_ids, end_time=timestamp, - properties=attributes.Hierarchy.Parent) -valid_node_ids = [] -non_valid_node_ids = [] -for k in rows.keys(): - if len(rows[k]) > 0: - valid_node_ids.append(k) - else: - non_valid_node_ids.append(k) - -cc_edges = cg.get_atomic_cross_edges(valid_node_ids) -cc_ids = np.unique(np.concatenate([np.concatenate(list(v.values())) for v in list(cc_edges.values()) if len(v.values())])) - -roots = cg.get_roots(cc_ids) -root_dict = dict(zip(cc_ids, roots)) -root_dict_vec = np.vectorize(root_dict.get) - -for k in cc_edges: - if len(cc_edges[k]) == 0: - continue - local_ids = np.unique(np.concatenate(list(cc_edges[k].values()))) - - assert len(np.unique(root_dict_vec(local_ids))) \ No newline at end of file diff --git a/pychunkedgraph/debug/existence_test.py b/pychunkedgraph/debug/existence_test.py deleted file mode 100644 index 757d3d542..000000000 --- a/pychunkedgraph/debug/existence_test.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -from 
datetime import datetime -import numpy as np - -from pychunkedgraph.graph import chunkedgraph -from pychunkedgraph.graph import attributes - -#os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/svenmd/.cloudvolume/secrets/google-secret.json" - -layer = 2 -n_chunks = 100 -n_segments_per_chunk = 200 -# timestamp = datetime.datetime.fromtimestamp(1588875769) -timestamp = datetime.utcnow() - -cg = chunkedgraph.ChunkedGraph(graph_id="pinky_nf_v2") - -np.random.seed(42) - -node_ids = [] -for _ in range(n_chunks): - c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) - c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) - c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) - - chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) - - max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) - - if max_segment_id < 10: - continue - - segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) - - for segment_id in segment_ids: - node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) - -rows = cg.client.read_nodes(node_ids=node_ids, end_time=timestamp, - properties=attributes.Hierarchy.Parent) -valid_node_ids = [] -non_valid_node_ids = [] -for k in rows.keys(): - if len(rows[k]) > 0: - valid_node_ids.append(k) - else: - non_valid_node_ids.append(k) - -roots = cg.get_roots(valid_node_ids, time_stamp=timestamp) - -roots = [] -try: - roots = cg.get_roots(valid_node_ids) - assert len(roots) == len(valid_node_ids) - print(f"ALL {len(roots)} have been successful!") -except: - print("At least one node failed. Checking nodes one by one now") - -if len(roots) != len(valid_node_ids): - log_dict = {} - success_dict = {} - for node_id in valid_node_ids: - try: - root = cg.get_root(node_id, time_stamp=timestamp) - print(f"Success: {node_id} from chunk {cg.get_chunk_id(node_id)}") - success_dict[node_id] = True - except Exception as e: - print(f"{node_id} from chunk {cg.get_chunk_id(node_id)} failed with {e}") - success_dict[node_id] = False - - t_id = node_id - - while t_id is not None: - last_working_chunk = cg.get_chunk_id(t_id) - t_id = cg.get_parent(t_id) - - print(f"Failed on layer {cg.get_chunk_layer(last_working_chunk)} in chunk {last_working_chunk}") - log_dict[node_id] = last_working_chunk - diff --git a/pychunkedgraph/debug/family_test.py b/pychunkedgraph/debug/family_test.py deleted file mode 100644 index 198351e74..000000000 --- a/pychunkedgraph/debug/family_test.py +++ /dev/null @@ -1,54 +0,0 @@ -import os -from datetime import datetime -import numpy as np - -from pychunkedgraph.graph import chunkedgraph -from pychunkedgraph.graph import attributes - -# os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/svenmd/.cloudvolume/secrets/google-secret.json" - -layers = [2, 3, 4, 5, 6, 7] -n_chunks = 10 -n_segments_per_chunk = 200 -# timestamp = datetime.datetime.fromtimestamp(1588875769) -timestamp = datetime.utcnow() - -cg = chunkedgraph.ChunkedGraph(graph_id="pinky_nf_v2") - -np.random.seed(42) - -node_ids = [] - -for layer in layers: - for _ in range(n_chunks): - c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) - c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) - c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) - - chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) - - max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) - - if max_segment_id < 10: - continue - - segment_ids = np.random.randint(1, max_segment_id, 
n_segments_per_chunk) - - for segment_id in segment_ids: - node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) - -rows = cg.client.read_nodes(node_ids=node_ids, end_time=timestamp, - properties=attributes.Hierarchy.Parent) -valid_node_ids = [] -non_valid_node_ids = [] -for k in rows.keys(): - if len(rows[k]) > 0: - valid_node_ids.append(k) - else: - non_valid_node_ids.append(k) - -parents = cg.get_parents(valid_node_ids, time_stamp=timestamp) -children_dict = cg.get_children(parents) - -for child, parent in zip(valid_node_ids, parents): - assert child in children_dict[parent] \ No newline at end of file diff --git a/pychunkedgraph/debug/utils.py b/pychunkedgraph/debug/utils.py index 179f50aef..130d85500 100644 --- a/pychunkedgraph/debug/utils.py +++ b/pychunkedgraph/debug/utils.py @@ -1,7 +1,8 @@ +# pylint: disable=invalid-name, missing-docstring, bare-except, unidiomatic-typecheck + import numpy as np -from ..graph import ChunkedGraph -from ..graph.utils.basetypes import NODE_ID +from pychunkedgraph.graph.meta import ChunkedGraphMeta, GraphConfig def print_attrs(d): @@ -16,24 +17,19 @@ def print_attrs(d): print(v) -def print_node( - cg: ChunkedGraph, - node: NODE_ID, - indent: int = 0, - stop_layer: int = 2, -) -> None: +def print_node(cg, node: np.uint64, indent: int = 0, stop_layer: int = 2) -> None: children = cg.get_children(node) print(f"{' ' * indent}{node}[{len(children)}]") if cg.get_chunk_layer(node) <= stop_layer: return for child in children: - print_node(cg, child, indent=indent + 1, stop_layer=stop_layer) + print_node(cg, child, indent=indent + 4, stop_layer=stop_layer) -def get_l2children(cg: ChunkedGraph, node: NODE_ID) -> np.ndarray: - nodes = np.array([node], dtype=NODE_ID) +def get_l2children(cg, node: np.uint64) -> np.ndarray: + nodes = np.array([node], dtype=np.uint64) layers = cg.get_chunk_layers(nodes) - assert np.all(layers > 2), "nodes must be at layers > 2" + assert np.all(layers >= 2), "nodes must be at layers >= 2" l2children = [] while nodes.size: children = cg.get_children(nodes, flatten=True) @@ -41,3 +37,35 @@ def get_l2children(cg: ChunkedGraph, node: NODE_ID) -> np.ndarray: l2children.append(children[layers == 2]) nodes = children[layers > 2] return np.concatenate(l2children) + + +def sanity_check(cg, new_roots, operation_id): + """ + Check for duplicates in hierarchy, useful for debugging. + """ + # print(f"{len(new_roots)} new ids from {operation_id}") + l2c_d = {} + for new_root in new_roots: + l2c_d[new_root] = get_l2children(cg, new_root) + success = True + for k, v in l2c_d.items(): + success = success and (len(v) == np.unique(v).size) + # print(f"{k}: {np.unique(v).size}, {len(v)}") + if not success: + raise RuntimeError("Some ids are not valid.") + + +def sanity_check_single(cg, node, operation_id): + v = get_l2children(cg, node) + msg = f"invalid node {node}:" + msg += f" found {len(v)} l2 ids, must be {np.unique(v).size}" + assert np.unique(v).size == len(v), f"{msg}, from {operation_id}." 
+ return v + + +def update_graph_id(cg, new_graph_id:str): + old_gc = cg.meta.graph_config._asdict() + old_gc["ID"] = new_graph_id + new_gc = GraphConfig(**old_gc) + new_meta = ChunkedGraphMeta(new_gc, cg.meta.data_source, cg.meta.custom_data) + cg.update_meta(new_meta, overwrite=True) diff --git a/pychunkedgraph/graph/attributes.py b/pychunkedgraph/graph/attributes.py index 3e48d204a..b431a159b 100644 --- a/pychunkedgraph/graph/attributes.py +++ b/pychunkedgraph/graph/attributes.py @@ -1,6 +1,9 @@ +# pylint: disable=invalid-name, missing-docstring, protected-access, raise-missing-from + # TODO design to use these attributes across different clients # `family_id` is specific to bigtable +from enum import Enum from typing import NamedTuple from .utils import serializers @@ -101,7 +104,7 @@ class Connectivity: serializer=serializers.NumPyArray(dtype=basetypes.EDGE_AREA), ) - CrossChunkEdge = _AttributeArray( + AtomicCrossChunkEdge = _AttributeArray( pattern=b"atomic_cross_edges_%d", family_id="3", serializer=serializers.NumPyArray( @@ -109,12 +112,26 @@ class Connectivity: ), ) - FakeEdges = _Attribute( + CrossChunkEdge = _AttributeArray( + pattern=b"cross_edges_%d", + family_id="4", + serializer=serializers.NumPyArray( + dtype=basetypes.NODE_ID, shape=(-1, 2), compression_level=22 + ), + ) + + FakeEdgesCF3 = _Attribute( key=b"fake_edges", family_id="3", serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), ) + FakeEdges = _Attribute( + key=b"fake_edges", + family_id="4", + serializer=serializers.NumPyArray(dtype=basetypes.NODE_ID, shape=(-1, 2)), + ) + class Hierarchy: Child = _Attribute( @@ -157,8 +174,6 @@ class GraphVersion: class OperationLogs: key = b"ioperations" - from enum import Enum - class StatusCodes(Enum): SUCCESS = 0 # all is well, new changes persisted CREATED = 1 # log record created in storage diff --git a/pychunkedgraph/graph/cache.py b/pychunkedgraph/graph/cache.py index f60b6ca92..13fa962ae 100644 --- a/pychunkedgraph/graph/cache.py +++ b/pychunkedgraph/graph/cache.py @@ -1,3 +1,4 @@ +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel """ Cache nodes, parents, children and cross edges. 
""" @@ -30,26 +31,26 @@ def __init__(self, cg): self._parent_vec = np.vectorize(self.parent, otypes=[np.uint64]) self._children_vec = np.vectorize(self.children, otypes=[np.ndarray]) - self._atomic_cross_edges_vec = np.vectorize( - self.atomic_cross_edges, otypes=[dict] + self._cross_chunk_edges_vec = np.vectorize( + self.cross_chunk_edges, otypes=[dict] ) # no limit because we don't want to lose new IDs self.parents_cache = LRUCache(maxsize=maxsize) self.children_cache = LRUCache(maxsize=maxsize) - self.atomic_cx_edges_cache = LRUCache(maxsize=maxsize) + self.cross_chunk_edges_cache = LRUCache(maxsize=maxsize) def __len__(self): return ( len(self.parents_cache) + len(self.children_cache) - + len(self.atomic_cx_edges_cache) + + len(self.cross_chunk_edges_cache) ) def clear(self): self.parents_cache.clear() self.children_cache.clear() - self.atomic_cx_edges_cache.clear() + self.cross_chunk_edges_cache.clear() def parent(self, node_id: np.uint64, *, time_stamp: datetime = None): @cached(cache=self.parents_cache, key=lambda node_id: node_id) @@ -67,17 +68,18 @@ def children_decorated(node_id): return children_decorated(node_id) - def atomic_cross_edges(self, node_id): - @cached(cache=self.atomic_cx_edges_cache, key=lambda node_id: node_id) - def atomic_cross_edges_decorated(node_id): - edges = self._cg.get_atomic_cross_edges( - np.array([node_id], dtype=NODE_ID), raw_only=True + def cross_chunk_edges(self, node_id, *, time_stamp: datetime = None): + @cached(cache=self.cross_chunk_edges_cache, key=lambda node_id: node_id) + def cross_edges_decorated(node_id): + edges = self._cg.get_cross_chunk_edges( + np.array([node_id], dtype=NODE_ID), raw_only=True, time_stamp=time_stamp ) return edges[node_id] - return atomic_cross_edges_decorated(node_id) + return cross_edges_decorated(node_id) def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None): + node_ids = np.array(node_ids, dtype=NODE_ID, copy=False) if not node_ids.size: return node_ids mask = np.in1d(node_ids, np.fromiter(self.parents_cache.keys(), dtype=NODE_ID)) @@ -91,6 +93,7 @@ def parents_multiple(self, node_ids: np.ndarray, *, time_stamp: datetime = None) def children_multiple(self, node_ids: np.ndarray, *, flatten=False): result = {} + node_ids = np.array(node_ids, dtype=NODE_ID, copy=False) if not node_ids.size: return result mask = np.in1d(node_ids, np.fromiter(self.children_cache.keys(), dtype=NODE_ID)) @@ -104,20 +107,27 @@ def children_multiple(self, node_ids: np.ndarray, *, flatten=False): return np.concatenate([*result.values()]) return result - def atomic_cross_edges_multiple(self, node_ids: np.ndarray): + def cross_chunk_edges_multiple( + self, node_ids: np.ndarray, *, time_stamp: datetime = None + ): result = {} + node_ids = np.array(node_ids, dtype=NODE_ID, copy=False) if not node_ids.size: return result mask = np.in1d( - node_ids, np.fromiter(self.atomic_cx_edges_cache.keys(), dtype=NODE_ID) + node_ids, np.fromiter(self.cross_chunk_edges_cache.keys(), dtype=NODE_ID) ) - cached_edges_ = self._atomic_cross_edges_vec(node_ids[mask]) + cached_edges_ = self._cross_chunk_edges_vec(node_ids[mask]) result.update( {id_: edges_ for id_, edges_ in zip(node_ids[mask], cached_edges_)} ) - result.update(self._cg.get_atomic_cross_edges(node_ids[~mask], raw_only=True)) + result.update( + self._cg.get_cross_chunk_edges( + node_ids[~mask], raw_only=True, time_stamp=time_stamp + ) + ) update( - self.atomic_cx_edges_cache, + self.cross_chunk_edges_cache, node_ids[~mask], [result[k] for k in node_ids[~mask]], ) diff --git 
a/pychunkedgraph/graph/chunkedgraph.py b/pychunkedgraph/graph/chunkedgraph.py index 210bff50b..7823695db 100644 --- a/pychunkedgraph/graph/chunkedgraph.py +++ b/pychunkedgraph/graph/chunkedgraph.py @@ -1,8 +1,10 @@ -# pylint: disable=invalid-name, missing-docstring, too-many-lines, import-outside-toplevel +# pylint: disable=invalid-name, missing-docstring, too-many-lines, import-outside-toplevel, unsupported-binary-operation import time import typing import datetime +from itertools import chain +from functools import reduce import numpy as np from pychunkedgraph import __version__ @@ -19,11 +21,14 @@ from .meta import ChunkedGraphMeta from .utils import basetypes from .utils import id_helpers +from .utils import serializers from .utils import generic as misc_utils from .edges import Edges from .edges import utils as edge_utils from .chunks import utils as chunk_utils from .chunks import hierarchy as chunk_hierarchy +from .subgraph import get_subgraph_nodes +from .subgraph import get_subgraph_edges_and_leaves class ChunkedGraph: @@ -74,7 +79,7 @@ def version(self) -> str: return self.client.read_graph_version() @property - def client(self) -> base.SimpleClient: + def client(self) -> BigTableClient: return self._client @property @@ -112,13 +117,15 @@ def range_read_chunk( """Read all nodes in a chunk.""" layer = self.get_chunk_layer(chunk_id) root_chunk = layer == self.meta.layer_count - max_node_id = self.id_client.get_max_node_id(chunk_id=chunk_id, root_chunk=root_chunk) + max_id = self.id_client.get_max_node_id( + chunk_id=chunk_id, root_chunk=root_chunk + ) if layer == 1: - max_node_id = chunk_id | self.get_segment_id_limit(chunk_id) # pylint: disable=unsupported-binary-operation + max_id = chunk_id | self.get_segment_id_limit(chunk_id) return self.client.read_nodes( start_id=self.get_node_id(np.uint64(0), chunk_id=chunk_id), - end_id=max_node_id, + end_id=max_id, end_id_inclusive=True, properties=properties, end_time=time_stamp, @@ -283,97 +290,79 @@ def _get_children_multiple( node_ids=node_ids, properties=attributes.Hierarchy.Child ) return { - x: node_children_d[x][0].value - if x in node_children_d - else types.empty_1d.copy() + x: ( + node_children_d[x][0].value + if x in node_children_d + else types.empty_1d.copy() + ) for x in node_ids } return self.cache.children_multiple(node_ids) - def get_atomic_cross_edges( - self, l2_ids: typing.Iterable, *, raw_only=False - ) -> typing.Dict[np.uint64, typing.Dict[int, typing.Iterable]]: - """Returns cross edges for level 2 IDs.""" + def get_atomic_cross_edges(self, l2_ids: typing.Iterable) -> typing.Dict: + """ + Returns atomic cross edges for level 2 IDs. + A dict of the form `{l2id: {layer: atomic_cross_edges}}`. + """ + node_edges_d_d = self.client.read_nodes( + node_ids=l2_ids, + properties=[ + attributes.Connectivity.AtomicCrossChunkEdge[l] + for l in range(2, max(3, self.meta.layer_count)) + ], + ) + result = {} + for id_ in l2_ids: + try: + result[id_] = { + prop.index: val[0].value.copy() + for prop, val in node_edges_d_d[id_].items() + } + except KeyError: + result[id_] = {} + return result + + def get_cross_chunk_edges( + self, + node_ids: typing.Iterable, + *, + raw_only=False, + all_layers=True, + time_stamp: typing.Optional[datetime.datetime] = None, + ) -> typing.Dict: + """ + Returns cross edges for `node_ids`. + A dict of the form `{node_id: {layer: cross_edges}}`. 
+ """ + time_stamp = misc_utils.get_valid_timestamp(time_stamp) if raw_only or not self.cache: + result = {} + node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) + if node_ids.size == 0: + return result + layers = range(2, max(3, self.meta.layer_count)) + attrs = [attributes.Connectivity.CrossChunkEdge[l] for l in layers] node_edges_d_d = self.client.read_nodes( - node_ids=l2_ids, - properties=[ - attributes.Connectivity.CrossChunkEdge[l] - for l in range(2, max(3, self.meta.layer_count)) - ], + node_ids=node_ids, + properties=attrs, + end_time=time_stamp, + end_time_inclusive=True, ) - result = {} - for id_ in l2_ids: + layers = self.get_chunk_layers(node_ids) + valid_layer = lambda x, y: x >= y + if not all_layers: + valid_layer = lambda x, y: x == y + for layer, id_ in zip(layers, node_ids): try: result[id_] = { prop.index: val[0].value.copy() for prop, val in node_edges_d_d[id_].items() + if valid_layer(prop.index, layer) } except KeyError: result[id_] = {} return result - return self.cache.atomic_cross_edges_multiple(l2_ids) - - def get_cross_chunk_edges( - self, node_ids: typing.Iterable, uplift=True, all_layers=False - ) -> typing.Dict[np.uint64, typing.Dict[int, typing.Iterable]]: - """ - Cross chunk edges for `node_id` at `node_layer`. - The edges are between node IDs at the `node_layer`, not atomic cross edges. - Returns dict {layer_id: cross_edges} - The first layer (>= `node_layer`) with atleast one cross chunk edge. - For current use-cases, other layers are not relevant. - - For performance, only children that lie along chunk boundary are considered. - Cross edges that belong to inner level 2 IDs are subsumed within the chunk. - This is because cross edges are stored only in level 2 IDs. - """ - result = {} - node_ids = np.array(node_ids, dtype=basetypes.NODE_ID) - if not node_ids.size: - return result - - node_l2ids_d = {} - layers_ = self.get_chunk_layers(node_ids) - for l in set(layers_): - node_l2ids_d.update(self._get_bounding_l2_children(node_ids[layers_ == l])) - l2_edges_d_d = self.get_atomic_cross_edges( - np.concatenate(list(node_l2ids_d.values())) - ) - for node_id in node_ids: - l2_edges_ds = [l2_edges_d_d[l2_id] for l2_id in node_l2ids_d[node_id]] - if all_layers: - result[node_id] = edge_utils.concatenate_cross_edge_dicts(l2_edges_ds) - else: - result[node_id] = self._get_min_layer_cross_edges( - node_id, l2_edges_ds, uplift=uplift - ) - return result - - def _get_min_layer_cross_edges( - self, - node_id: basetypes.NODE_ID, - l2id_atomic_cross_edges_ds: typing.Iterable, - uplift=True, - ) -> typing.Dict[int, typing.Iterable]: - """ - Find edges at relevant min_layer >= node_layer. - `l2id_atomic_cross_edges_ds` is a list of atomic cross edges of - level 2 IDs that are descendants of `node_id`. 
- """ - min_layer, edges = edge_utils.filter_min_layer_cross_edges_multiple( - self.meta, l2id_atomic_cross_edges_ds, self.get_chunk_layer(node_id) - ) - if self.get_chunk_layer(node_id) < min_layer: - # cross edges irrelevant - return {self.get_chunk_layer(node_id): types.empty_2d} - if not uplift: - return {min_layer: edges} - node_root_id = node_id - node_root_id = self.get_root(node_id, stop_layer=min_layer, ceil=False) - edges[:, 0] = node_root_id - edges[:, 1] = self.get_roots(edges[:, 1], stop_layer=min_layer, ceil=False) - return {min_layer: np.unique(edges, axis=0) if edges.size else types.empty_2d} + return self.cache.cross_chunk_edges_multiple(node_ids, time_stamp=time_stamp) def get_roots( self, @@ -384,6 +373,7 @@ def get_roots( stop_layer: int = None, ceil: bool = True, fail_to_zero: bool = False, + raw_only=False, n_tries: int = 1, ) -> typing.Union[np.ndarray, typing.Dict[int, np.ndarray]]: """ @@ -407,7 +397,10 @@ def get_roots( filtered_ids = parent_ids[layer_mask] unique_ids, inverse = np.unique(filtered_ids, return_inverse=True) temp_ids = self.get_parents( - unique_ids, time_stamp=time_stamp, fail_to_zero=fail_to_zero + unique_ids, + time_stamp=time_stamp, + fail_to_zero=fail_to_zero, + raw_only=raw_only, ) if not temp_ids.size: break @@ -462,6 +455,7 @@ def get_root( get_all_parents: bool = False, stop_layer: int = None, ceil: bool = True, + raw_only: bool = False, n_tries: int = 1, ) -> typing.Union[typing.List[np.uint64], np.uint64]: """Takes a node id and returns the associated agglomeration ids.""" @@ -479,7 +473,9 @@ def get_root( for _ in range(n_tries): parent_id = node_id for _ in range(self.get_chunk_layer(node_id), int(stop_layer + 1)): - temp_parent_id = self.get_parent(parent_id, time_stamp=time_stamp) + temp_parent_id = self.get_parent( + parent_id, time_stamp=time_stamp, raw_only=raw_only + ) if temp_parent_id is None: break else: @@ -551,17 +547,18 @@ def get_subgraph( node_id_or_ids: typing.Union[np.uint64, typing.Iterable], bbox: typing.Optional[typing.Sequence[typing.Sequence[int]]] = None, bbox_is_coordinate: bool = False, - return_layers: typing.List = [2], + return_layers: typing.List = None, nodes_only: bool = False, edges_only: bool = False, leaves_only: bool = False, return_flattened: bool = False, - ) -> typing.Tuple[typing.Dict, typing.Dict, Edges]: + ) -> typing.Tuple[typing.Dict, typing.Tuple[Edges]]: """ Generic subgraph method. """ - from .subgraph import get_subgraph_nodes - from .subgraph import get_subgraph_edges_and_leaves + + if return_layers is None: + return_layers = [2] if nodes_only: return get_subgraph_nodes( @@ -581,7 +578,7 @@ def get_subgraph_nodes( node_id_or_ids: typing.Union[np.uint64, typing.Iterable], bbox: typing.Optional[typing.Sequence[typing.Sequence[int]]] = None, bbox_is_coordinate: bool = False, - return_layers: typing.List = [2], + return_layers: typing.List = None, serializable: bool = False, return_flattened: bool = False, ) -> typing.Tuple[typing.Dict, typing.Dict, Edges]: @@ -589,7 +586,8 @@ def get_subgraph_nodes( Get the children of `node_ids` that are at each of return_layers within the specified bounding box. """ - from .subgraph import get_subgraph_nodes + if return_layers is None: + return_layers = [2] return get_subgraph_nodes( self, @@ -610,8 +608,6 @@ def get_subgraph_edges( """ Get the atomic edges of the `node_ids` within the specified bounding box. 
""" - from .subgraph import get_subgraph_edges_and_leaves - return get_subgraph_edges_and_leaves( self, node_id_or_ids, bbox, bbox_is_coordinate, True, False ) @@ -625,8 +621,6 @@ def get_subgraph_leaves( """ Get the supervoxels of the `node_ids` within the specified bounding box. """ - from .subgraph import get_subgraph_edges_and_leaves - return get_subgraph_edges_and_leaves( self, node_id_or_ids, bbox, bbox_is_coordinate, False, True ) @@ -644,20 +638,37 @@ def get_fake_edges( ) for id_, val in fake_edges_d.items(): edges = np.concatenate( - [np.array(e.value, dtype=basetypes.NODE_ID) for e in val] + [np.array(e.value, dtype=basetypes.NODE_ID, copy=False) for e in val] ) - result[id_] = Edges(edges[:, 0], edges[:, 1], fake_edges=True) + result[id_] = Edges(edges[:, 0], edges[:, 1]) return result + def copy_fake_edges(self, chunk_id: np.uint64) -> None: + _edges = self.client.read_node( + node_id=chunk_id, + properties=attributes.Connectivity.FakeEdgesCF3, + end_time_inclusive=True, + fake_edges=True, + ) + mutations = [] + _id = serializers.serialize_uint64(chunk_id, fake_edges=True) + for e in _edges: + val_dict = {attributes.Connectivity.FakeEdges: e.value} + row = self.client.mutate_row(_id, val_dict, time_stamp=e.timestamp) + mutations.append(row) + self.client.write(mutations) + def get_l2_agglomerations( - self, level2_ids: np.ndarray, edges_only: bool = False - ) -> typing.Tuple[typing.Dict[int, types.Agglomeration], np.ndarray]: + self, + level2_ids: np.ndarray, + edges_only: bool = False, + active: bool = False, + time_stamp: typing.Optional[datetime.datetime] = None, + ) -> typing.Tuple[typing.Dict[int, types.Agglomeration], typing.Tuple[Edges]]: """ Children of Level 2 Node IDs and edges. Edges are read from cloud storage. """ - from itertools import chain - from functools import reduce from .misc import get_agglomerations chunk_ids = np.unique(self.get_chunk_ids_from_node_ids(level2_ids)) @@ -674,6 +685,8 @@ def get_l2_agglomerations( chain(edges_d.values(), fake_edges.values()), Edges([], []), ) + if self.mock_edges is not None: + all_chunk_edges += self.mock_edges if edges_only: if self.mock_edges is not None: @@ -689,12 +702,18 @@ def get_l2_agglomerations( sv_parent_d = {} for l2id in l2id_children_d: svs = l2id_children_d[l2id] + for sv in svs: + if sv in sv_parent_d: + raise ValueError("Found conflicting parents.") sv_parent_d.update(dict(zip(svs.tolist(), [l2id] * len(svs)))) + if active: + all_chunk_edges = edge_utils.filter_inactive_cross_edges( + self, all_chunk_edges, time_stamp=time_stamp + ) + in_edges, out_edges, cross_edges = edge_utils.categorize_edges_v2( - self.meta, - all_chunk_edges, - sv_parent_d + self.meta, all_chunk_edges, sv_parent_d ) agglomeration_d = get_agglomerations( @@ -702,13 +721,15 @@ def get_l2_agglomerations( ) return ( agglomeration_d, - (self.mock_edges,) - if self.mock_edges is not None - else (in_edges, out_edges, cross_edges), + ( + (self.mock_edges,) + if self.mock_edges is not None + else (in_edges, out_edges, cross_edges) + ), ) def get_node_timestamps( - self, node_ids: typing.Sequence[np.uint64], return_numpy=True + self, node_ids: typing.Sequence[np.uint64], return_numpy=True, normalize=False ) -> typing.Iterable: """ The timestamp of the children column can be assumed @@ -722,17 +743,22 @@ def get_node_timestamps( if return_numpy: return np.array([], dtype=np.datetime64) return [] + result = [] + earliest_ts = self.get_earliest_timestamp() + for n in node_ids: + ts = children[n][0].timestamp + if normalize: + ts = earliest_ts 
if ts < earliest_ts else ts + result.append(ts) if return_numpy: - return np.array( - [children[x][0].timestamp for x in node_ids], dtype=np.datetime64 - ) - return [children[x][0].timestamp for x in node_ids] + return np.array(result, dtype=np.datetime64) + return result # OPERATIONS def add_edges( self, user_id: str, - atomic_edges: typing.Sequence[np.uint64], + atomic_edges: typing.Sequence[typing.Sequence[np.uint64]], *, affinities: typing.Sequence[np.float32] = None, source_coords: typing.Sequence[int] = None, @@ -842,82 +868,7 @@ def redo_operation( multicut_as_split=True, ).execute() - # PRIVATE - - def _get_bounding_chunk_ids( - self, - parent_chunk_ids: typing.Iterable, - unique: bool = False, - ) -> typing.Dict: - """ - Returns bounding chunk IDs at layers < parent_layer for all chunk IDs. - Dict[parent_chunk_id] = np.array(bounding_chunk_ids) - """ - parent_chunk_coords = self.get_chunk_coordinates_multiple(parent_chunk_ids) - parents_layer = self.get_chunk_layer(parent_chunk_ids[0]) - chunk_id_bchunk_ids_d = {} - for i, chunk_id in enumerate(parent_chunk_ids): - if chunk_id in chunk_id_bchunk_ids_d: - # `parent_chunk_ids` can have duplicates - # avoid redundant calculations - continue - parent_coord = parent_chunk_coords[i] - chunk_ids = [types.empty_1d] - for child_layer in range(2, parents_layer): - bcoords = chunk_utils.get_bounding_children_chunks( - self.meta, - parents_layer, - parent_coord, - child_layer, - return_unique=False, - ) - bchunks_ids = chunk_utils.get_chunk_ids_from_coords( - self.meta, child_layer, bcoords - ) - chunk_ids.append(bchunks_ids) - chunk_ids = np.concatenate(chunk_ids) - if unique: - chunk_ids = np.unique(chunk_ids) - chunk_id_bchunk_ids_d[chunk_id] = chunk_ids - return chunk_id_bchunk_ids_d - - def _get_bounding_l2_children(self, parents: typing.Iterable) -> typing.Dict: - parent_chunk_ids = self.get_chunk_ids_from_node_ids(parents) - chunk_id_bchunk_ids_d = self._get_bounding_chunk_ids( - parent_chunk_ids, unique=len(parents) >= 200 - ) - - parent_descendants_d = { - _id: np.array([_id], dtype=basetypes.NODE_ID) for _id in parents - } - descendants_all = np.concatenate(list(parent_descendants_d.values())) - descendants_layers = self.get_chunk_layers(descendants_all) - layer_mask = descendants_layers > 2 - descendants_all = descendants_all[layer_mask] - - while descendants_all.size: - descendant_children_d = self.get_children(descendants_all) - for i, parent_id in enumerate(parents): - _descendants = parent_descendants_d[parent_id] - _layers = self.get_chunk_layers(_descendants) - _l2mask = _layers == 2 - descendants = [_descendants[_l2mask]] - for child in _descendants[~_l2mask]: - descendants.append(descendant_children_d[child]) - descendants = np.concatenate(descendants) - chunk_ids = self.get_chunk_ids_from_node_ids(descendants) - bchunk_ids = chunk_id_bchunk_ids_d[parent_chunk_ids[i]] - bounding_descendants = descendants[np.in1d(chunk_ids, bchunk_ids)] - parent_descendants_d[parent_id] = bounding_descendants - - descendants_all = np.concatenate(list(parent_descendants_d.values())) - descendants_layers = self.get_chunk_layers(descendants_all) - layer_mask = descendants_layers > 2 - descendants_all = descendants_all[layer_mask] - return parent_descendants_d - # HELPERS / WRAPPERS - def is_root(self, node_id: basetypes.NODE_ID) -> bool: return self.get_chunk_layer(node_id) == self.meta.layer_count @@ -955,7 +906,9 @@ def get_chunk_coordinates(self, node_or_chunk_id: basetypes.NODE_ID): return chunk_utils.get_chunk_coordinates(self.meta, 
node_or_chunk_id) def get_chunk_coordinates_multiple(self, node_or_chunk_ids: typing.Sequence): - node_or_chunk_ids = np.array(node_or_chunk_ids, dtype=basetypes.NODE_ID) + node_or_chunk_ids = np.array( + node_or_chunk_ids, dtype=basetypes.NODE_ID, copy=False + ) layers = self.get_chunk_layers(node_or_chunk_ids) assert np.all(layers == layers[0]), "All IDs must have the same layer." return chunk_utils.get_chunk_coordinates_multiple(self.meta, node_or_chunk_ids) @@ -1020,3 +973,14 @@ def get_earliest_timestamp(self): _, timestamp = self.client.read_log_entry(op_id) if timestamp is not None: return timestamp - timedelta(milliseconds=500) + + def get_operation_ids(self, node_ids: typing.Sequence): + response = self.client.read_nodes(node_ids=node_ids) + result = {} + for node in node_ids: + try: + operations = response[node][attributes.OperationLogs.OperationID] + result[node] = [(x.value, x.timestamp) for x in operations] + except KeyError: + ... + return result diff --git a/pychunkedgraph/graph/chunks/atomic.py b/pychunkedgraph/graph/chunks/atomic.py index e3de065ff..b609f4cfb 100644 --- a/pychunkedgraph/graph/chunks/atomic.py +++ b/pychunkedgraph/graph/chunks/atomic.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring + from typing import List from typing import Sequence from itertools import product @@ -6,8 +8,6 @@ from .utils import get_bounding_children_chunks from ..meta import ChunkedGraphMeta -from ..utils.generic import get_valid_timestamp -from ..utils import basetypes def get_touching_atomic_chunks( @@ -27,7 +27,7 @@ def get_touching_atomic_chunks( chunk_offset = chunk_coords * atomic_chunk_count mid = (atomic_chunk_count // 2) - 1 - # TODO (akhileshh) convert this for loop to numpy + # TODO (akhileshh) convert this for loop to numpy; # relevant chunks along touching planes at center for axis_1, axis_2 in product(*[range(atomic_chunk_count)] * 2): # x-y plane diff --git a/pychunkedgraph/graph/chunks/utils.py b/pychunkedgraph/graph/chunks/utils.py index dc895bde4..4d01258bd 100644 --- a/pychunkedgraph/graph/chunks/utils.py +++ b/pychunkedgraph/graph/chunks/utils.py @@ -8,6 +8,7 @@ import numpy as np + def get_chunks_boundary(voxel_boundary, chunk_size) -> np.ndarray: """returns number of chunks in each dimension""" return np.ceil((voxel_boundary / chunk_size)).astype(int) @@ -43,7 +44,7 @@ def normalize_bounding_box( def get_chunk_layer(meta, node_or_chunk_id: np.uint64) -> int: - """ Extract Layer from Node ID or Chunk ID """ + """Extract Layer from Node ID or Chunk ID""" return int(int(node_or_chunk_id) >> 64 - meta.graph_config.LAYER_ID_BITS) @@ -75,9 +76,9 @@ def get_chunk_coordinates(meta, node_or_chunk_id: np.uint64) -> np.ndarray: y_offset = x_offset - bits_per_dim z_offset = y_offset - bits_per_dim - x = int(node_or_chunk_id) >> x_offset & 2 ** bits_per_dim - 1 - y = int(node_or_chunk_id) >> y_offset & 2 ** bits_per_dim - 1 - z = int(node_or_chunk_id) >> z_offset & 2 ** bits_per_dim - 1 + x = int(node_or_chunk_id) >> x_offset & 2**bits_per_dim - 1 + y = int(node_or_chunk_id) >> y_offset & 2**bits_per_dim - 1 + z = int(node_or_chunk_id) >> z_offset & 2**bits_per_dim - 1 return np.array([x, y, z]) @@ -86,7 +87,7 @@ def get_chunk_coordinates_multiple(meta, ids: np.ndarray) -> np.ndarray: Array version of get_chunk_coordinates. Assumes all given IDs are in same layer. 
""" - if not len(ids): + if len(ids) == 0: return np.array([]) layer = get_chunk_layer(meta, ids[0]) bits_per_dim = meta.bitmasks[layer] @@ -95,10 +96,10 @@ def get_chunk_coordinates_multiple(meta, ids: np.ndarray) -> np.ndarray: y_offset = x_offset - bits_per_dim z_offset = y_offset - bits_per_dim - ids = np.array(ids, dtype=int) - X = ids >> x_offset & 2 ** bits_per_dim - 1 - Y = ids >> y_offset & 2 ** bits_per_dim - 1 - Z = ids >> z_offset & 2 ** bits_per_dim - 1 + ids = np.array(ids, dtype=int, copy=False) + X = ids >> x_offset & 2**bits_per_dim - 1 + Y = ids >> y_offset & 2**bits_per_dim - 1 + Z = ids >> z_offset & 2**bits_per_dim - 1 return np.column_stack((X, Y, Z)) @@ -142,14 +143,15 @@ def get_chunk_ids_from_coords(meta, layer: int, coords: np.ndarray): def get_chunk_ids_from_node_ids(meta, ids: Iterable[np.uint64]) -> np.ndarray: - """ Extract Chunk IDs from Node IDs""" + """Extract Chunk IDs from Node IDs""" if len(ids) == 0: return np.array([], dtype=np.uint64) bits_per_dims = np.array([meta.bitmasks[l] for l in get_chunk_layers(meta, ids)]) offsets = 64 - meta.graph_config.LAYER_ID_BITS - 3 * bits_per_dims - cids1 = np.array((np.array(ids, dtype=int) >> offsets) << offsets, dtype=np.uint64) + ids = np.array(ids, dtype=int, copy=False) + cids1 = np.array((ids >> offsets) << offsets, dtype=np.uint64) # cids2 = np.vectorize(get_chunk_id)(meta, ids) # assert np.all(cids1 == cids2) return cids1 @@ -164,7 +166,7 @@ def _compute_chunk_id( ) -> np.uint64: s_bits_per_dim = meta.bitmasks[layer] if not ( - x < 2 ** s_bits_per_dim and y < 2 ** s_bits_per_dim and z < 2 ** s_bits_per_dim + x < 2**s_bits_per_dim and y < 2**s_bits_per_dim and z < 2**s_bits_per_dim ): raise ValueError( f"Coordinate is out of range \ diff --git a/pychunkedgraph/graph/client/base.py b/pychunkedgraph/graph/client/base.py index a66602a6a..953734670 100644 --- a/pychunkedgraph/graph/client/base.py +++ b/pychunkedgraph/graph/client/base.py @@ -13,7 +13,7 @@ def create_graph(self) -> None: """Initialize the graph and store associated meta.""" @abstractmethod - def add_graph_version(self, version): + def add_graph_version(self, version: str, overwrite: bool = False): """Add a version to the graph.""" @abstractmethod diff --git a/pychunkedgraph/graph/client/bigtable/client.py b/pychunkedgraph/graph/client/bigtable/client.py index 5b86826bd..9195fb397 100644 --- a/pychunkedgraph/graph/client/bigtable/client.py +++ b/pychunkedgraph/graph/client/bigtable/client.py @@ -1,11 +1,11 @@ -# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel, line-too-long, protected-access, arguments-differ, arguments-renamed, logging-fstring-interpolation +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel, line-too-long, protected-access, arguments-differ, arguments-renamed, logging-fstring-interpolation, too-many-arguments import sys import time import typing import logging -import datetime from datetime import datetime +from datetime import timedelta import numpy as np from multiwrapper import multiprocessing_utils as mu @@ -15,11 +15,12 @@ from google.api_core.exceptions import Aborted from google.api_core.exceptions import DeadlineExceeded from google.api_core.exceptions import ServiceUnavailable +from google.cloud.bigtable.column_family import MaxAgeGCRule +from google.cloud.bigtable.column_family import MaxVersionsGCRule from google.cloud.bigtable.table import Table from google.cloud.bigtable.row_set import RowSet -from google.cloud.bigtable.row_data import PartialRowData +from 
google.cloud.bigtable.row_data import DEFAULT_RETRY_READ_ROWS, PartialRowData from google.cloud.bigtable.row_filters import RowFilter -from google.cloud.bigtable.column_family import MaxVersionsGCRule from . import utils from . import BigTableConfig @@ -71,6 +72,18 @@ def __init__( self._version = None self._max_row_key_count = config.MAX_ROW_KEY_COUNT + def _create_column_families(self): + f = self._table.column_family("0") + f.create() + f = self._table.column_family("1", gc_rule=MaxVersionsGCRule(1)) + f.create() + f = self._table.column_family("2") + f.create() + f = self._table.column_family("3", gc_rule=MaxAgeGCRule(timedelta(days=365))) + f.create() + f = self._table.column_family("4") + f.create() + @property def graph_meta(self): return self._graph_meta @@ -84,8 +97,9 @@ def create_graph(self, meta: ChunkedGraphMeta, version: str) -> None: self.add_graph_version(version) self.update_graph_meta(meta) - def add_graph_version(self, version: str): - assert self.read_graph_version() is None, "Graph has already been versioned." + def add_graph_version(self, version: str, overwrite: bool = False): + if not overwrite: + assert self.read_graph_version() is None, self.read_graph_version() self._version = version row = self.mutate_row( attributes.GraphVersion.key, @@ -137,6 +151,7 @@ def read_nodes( end_time=None, end_time_inclusive: bool = False, fake_edges: bool = False, + attr_keys: bool = True, ): """ Read nodes and their properties. @@ -147,26 +162,38 @@ def read_nodes( # when all IDs in a block are within a range node_ids = np.sort(node_ids) rows = self._read_byte_rows( - start_key=serialize_uint64(start_id, fake_edges=fake_edges) - if start_id is not None - else None, - end_key=serialize_uint64(end_id, fake_edges=fake_edges) - if end_id is not None - else None, + start_key=( + serialize_uint64(start_id, fake_edges=fake_edges) + if start_id is not None + else None + ), + end_key=( + serialize_uint64(end_id, fake_edges=fake_edges) + if end_id is not None + else None + ), end_key_inclusive=end_id_inclusive, row_keys=( - serialize_uint64(node_id, fake_edges=fake_edges) for node_id in node_ids - ) - if node_ids is not None - else None, + ( + serialize_uint64(node_id, fake_edges=fake_edges) + for node_id in node_ids + ) + if node_ids is not None + else None + ), columns=properties, start_time=start_time, end_time=end_time, end_time_inclusive=end_time_inclusive, user_id=user_id, ) + if attr_keys: + return { + deserialize_uint64(row_key, fake_edges=fake_edges): data + for (row_key, data) in rows.items() + } return { - deserialize_uint64(row_key, fake_edges=fake_edges): data + deserialize_uint64(row_key, fake_edges=fake_edges): {k.key:v for k,v in data.items()} for (row_key, data) in rows.items() } @@ -628,16 +655,6 @@ def get_compatible_timestamp( return utils.get_google_compatible_time_stamp(time_stamp, round_up=round_up) # PRIVATE METHODS - def _create_column_families(self): - f = self._table.column_family("0") - f.create() - f = self._table.column_family("1", gc_rule=MaxVersionsGCRule(1)) - f.create() - f = self._table.column_family("2") - f.create() - f = self._table.column_family("3") - f.create() - def _get_ids_range(self, key: bytes, size: int) -> typing.Tuple: """Returns a range (min, max) of IDs for a given `key`.""" column = attributes.Concurrency.Counter @@ -816,7 +833,8 @@ def _execute_read_thread(self, args: typing.Tuple[Table, RowSet, RowFilter]): # Check for everything falsy, because Bigtable considers even empty # lists of row_keys as no upper/lower bound! 
return {} - range_read = table.read_rows(row_set=row_set, filter_=row_filter) + retry = DEFAULT_RETRY_READ_ROWS.with_timeout(180) + range_read = table.read_rows(row_set=row_set, filter_=row_filter, retry=retry) res = {v.row_key: utils.partial_row_data_to_column_dict(v) for v in range_read} return res diff --git a/pychunkedgraph/graph/connectivity/search.py b/pychunkedgraph/graph/connectivity/search.py deleted file mode 100644 index bd3faf227..000000000 --- a/pychunkedgraph/graph/connectivity/search.py +++ /dev/null @@ -1,47 +0,0 @@ -import random -from typing import List - -import numpy as np -from graph_tool.search import bfs_search -from graph_tool.search import BFSVisitor -from graph_tool.search import StopSearch - -from ..utils.basetypes import NODE_ID - - -class TargetVisitor(BFSVisitor): - def __init__(self, target, reachable): - self.target = target - self.reachable = reachable - - def discover_vertex(self, u): - if u == self.target: - self.reachable[u] = 1 - raise StopSearch - - -def check_reachability(g, sv1s: np.ndarray, sv2s: np.ndarray, original_ids: np.ndarray) -> np.ndarray: - """ - g: graph tool Graph instance with ids 0 to N-1 where N = vertex count - original_ids: sorted ChunkedGraph supervoxel ids - (to identify corresponding ids in graph tool) - for each pair (sv1, sv2) check if a path exists (BFS) - """ - # mapping from original ids to graph tool ids - original_ids_d = { - sv_id: index for sv_id, index in zip(original_ids, range(len(original_ids))) - } - reachable = g.new_vertex_property("int", val=0) - - def _check_reachability(source, target): - bfs_search(g, source, TargetVisitor(target, reachable)) - return reachable[target] - - return np.array( - [ - _check_reachability(original_ids_d[source], original_ids_d[target]) - for source, target in zip(sv1s, sv2s) - ], - dtype=bool, - ) - diff --git a/pychunkedgraph/graph/edges/__init__.py b/pychunkedgraph/graph/edges/__init__.py index b0e488d05..430ab9fa7 100644 --- a/pychunkedgraph/graph/edges/__init__.py +++ b/pychunkedgraph/graph/edges/__init__.py @@ -2,10 +2,14 @@ Classes and types for edges """ -from typing import Optional from collections import namedtuple +from os import environ +from typing import Optional import numpy as np +import tensorstore as ts +import zstandard as zstd +from graph_tool import Graph from ..utils import basetypes @@ -18,6 +22,14 @@ DEFAULT_AFFINITY = np.finfo(np.float32).tiny DEFAULT_AREA = np.finfo(np.float32).tiny +ADJACENCY_DTYPE = np.dtype( + [ + ("node", basetypes.NODE_ID), + ("aff", basetypes.EDGE_AFFINITY), + ("area", basetypes.EDGE_AREA), + ] +) +ZSTD_EDGE_COMPRESSION = 17 class Edges: @@ -28,17 +40,17 @@ def __init__( *, affinities: Optional[np.ndarray] = None, areas: Optional[np.ndarray] = None, - fake_edges=False, ): self.node_ids1 = np.array(node_ids1, dtype=basetypes.NODE_ID, copy=False) self.node_ids2 = np.array(node_ids2, dtype=basetypes.NODE_ID, copy=False) assert self.node_ids1.size == self.node_ids2.size self._as_pairs = None - self._fake_edges = fake_edges if affinities is not None and len(affinities) > 0: - self._affinities = np.array(affinities, dtype=basetypes.EDGE_AFFINITY, copy=False) + self._affinities = np.array( + affinities, dtype=basetypes.EDGE_AFFINITY, copy=False + ) assert self.node_ids1.size == self._affinities.size else: self._affinities = np.full(len(self.node_ids1), DEFAULT_AFFINITY) @@ -103,3 +115,77 @@ def get_pairs(self) -> np.ndarray: return self._as_pairs self._as_pairs = np.column_stack((self.node_ids1, self.node_ids2)) return self._as_pairs + + +def 
put_edges(destination: str, nodes: np.ndarray, edges: Edges) -> None: + graph_ids, _edges = np.unique(edges.get_pairs(), return_inverse=True) + graph_ids_reverse = {n: i for i, n in enumerate(graph_ids)} + _edges = _edges.reshape(-1, 2) + + graph = Graph(directed=False) + graph.add_edge_list(_edges) + e_aff = graph.new_edge_property("double", vals=edges.affinities) + e_area = graph.new_edge_property("int", vals=edges.areas) + cctx = zstd.ZstdCompressor(level=ZSTD_EDGE_COMPRESSION) + ocdbt_host = environ["OCDBT_COORDINATOR_HOST"] + ocdbt_port = environ["OCDBT_COORDINATOR_PORT"] + + spec = { + "driver": "ocdbt", + "base": destination, + "coordinator": {"address": f"{ocdbt_host}:{ocdbt_port}"}, + } + dataset = ts.KvStore.open(spec).result() + with ts.Transaction() as txn: + for _node in nodes: + node = graph_ids_reverse[_node] + neighbors = graph.get_all_neighbors(node) + adjacency_list = np.zeros(neighbors.size, dtype=ADJACENCY_DTYPE) + adjacency_list["node"] = graph_ids[neighbors] + adjacency_list["aff"] = [e_aff[(node, neighbor)] for neighbor in neighbors] + adjacency_list["area"] = [ + e_area[(node, neighbor)] for neighbor in neighbors + ] + dataset.with_transaction(txn)[str(graph_ids[node])] = cctx.compress( + adjacency_list.tobytes() + ) + + +def get_edges(source: str, nodes: np.ndarray) -> Edges: + spec = {"driver": "ocdbt", "base": source} + dataset = ts.KvStore.open(spec).result() + zdc = zstd.ZstdDecompressor() + + read_futures = [dataset.read(str(n)) for n in nodes] + read_results = [rf.result() for rf in read_futures] + compressed = [rr.value for rr in read_results] + + try: + n_threads = int(environ.get("ZSTD_THREADS", 1)) + except ValueError: + n_threads = 1 + + decompressed = [] + try: + decompressed = zdc.multi_decompress_to_buffer(compressed, threads=n_threads) + except ValueError: + for content in compressed: + decompressed.append(zdc.decompressobj().decompress(content)) + + node_ids1 = [np.empty(0, dtype=basetypes.NODE_ID)] + node_ids2 = [np.empty(0, dtype=basetypes.NODE_ID)] + affinities = [np.empty(0, dtype=basetypes.EDGE_AFFINITY)] + areas = [np.empty(0, dtype=basetypes.EDGE_AREA)] + for n, content in zip(nodes, compressed): + adjacency_list = np.frombuffer(content, dtype=ADJACENCY_DTYPE) + node_ids1.append([n] * adjacency_list.size) + node_ids2.append(adjacency_list["node"]) + affinities.append(adjacency_list["aff"]) + areas.append(adjacency_list["area"]) + + return Edges( + np.concatenate(node_ids1), + np.concatenate(node_ids2), + affinities=np.concatenate(affinities), + areas=np.concatenate(areas), + ) diff --git a/pychunkedgraph/graph/edges/utils.py b/pychunkedgraph/graph/edges/utils.py index 034ca6ebc..76f8ea1d8 100644 --- a/pychunkedgraph/graph/edges/utils.py +++ b/pychunkedgraph/graph/edges/utils.py @@ -8,16 +8,18 @@ from typing import Tuple from typing import Iterable from typing import Optional +from collections import defaultdict +from functools import reduce import fastremap import numpy as np from . import Edges from . 
import EDGE_TYPES -from ..types import empty_2d from ..utils import basetypes from ..chunks import utils as chunk_utils from ..meta import ChunkedGraphMeta +from ...utils.general import in2d def concatenate_chunk_edges(chunk_edge_dicts: Iterable) -> Dict: @@ -45,18 +47,20 @@ def concatenate_chunk_edges(chunk_edge_dicts: Iterable) -> Dict: return edges_dict -def concatenate_cross_edge_dicts(edges_ds: Iterable[Dict]) -> Dict: +def concatenate_cross_edge_dicts( + edges_ds: Iterable[Dict], unique: bool = False +) -> Dict: """Combines cross chunk edge dicts of form {layer id : edge list}.""" - from collections import defaultdict - result_d = defaultdict(list) - for edges_d in edges_ds: for layer, edges in edges_d.items(): result_d[layer].append(edges) for layer, edge_lists in result_d.items(): - result_d[layer] = np.concatenate(edge_lists) + edges = np.concatenate(edge_lists) + if unique: + edges = np.unique(edges, axis=0) + result_d[layer] = edges return result_d @@ -152,40 +156,7 @@ def get_cross_chunk_edges_layer(meta: ChunkedGraphMeta, cross_edges: Iterable): return cross_chunk_edge_layers -def filter_min_layer_cross_edges( - meta: ChunkedGraphMeta, cross_edges_d: Dict, node_layer: int = 2 -) -> Tuple[int, Iterable]: - """ - Given a dict of cross chunk edges {layer: edges} - Return the first layer with cross edges. - """ - for layer in range(node_layer, meta.layer_count): - edges_ = cross_edges_d.get(layer, empty_2d) - if edges_.size: - return (layer, edges_) - return (meta.layer_count, edges_) - - -def filter_min_layer_cross_edges_multiple( - meta: ChunkedGraphMeta, l2id_atomic_cross_edges_ds: Iterable, node_layer: int = 2 -) -> Tuple[int, Iterable]: - """ - Given a list of dicts of cross chunk edges [{layer: edges}] - Return the first layer with cross edges. 
- """ - min_layer = meta.layer_count - for edges_d in l2id_atomic_cross_edges_ds: - layer_, _ = filter_min_layer_cross_edges(meta, edges_d, node_layer=node_layer) - min_layer = min(min_layer, layer_) - edges = [empty_2d] - for edges_d in l2id_atomic_cross_edges_ds: - edges.append(edges_d.get(min_layer, empty_2d)) - return min_layer, np.concatenate(edges) - - def get_edges_status(cg, edges: Iterable, time_stamp: Optional[float] = None): - from ...utils.general import in2d - coords0 = chunk_utils.get_chunk_coordinates_multiple(cg.meta, edges[:, 0]) coords1 = chunk_utils.get_chunk_coordinates_multiple(cg.meta, edges[:, 1]) @@ -214,3 +185,20 @@ def get_edges_status(cg, edges: Iterable, time_stamp: Optional[float] = None): active_status.extend(mask) active_status = np.array(active_status, dtype=bool) return existence_status, active_status + + +def filter_inactive_cross_edges( + cg, all_chunk_edges: Edges, time_stamp: Optional[float] = None +): + result = [] + layers = cg.get_cross_chunk_edges_layer(all_chunk_edges.get_pairs()) + for layer in np.unique(layers): + layer_mask = layers == layer + parent_layer = layer + 1 + layer_edges = all_chunk_edges[layer_mask] + n1, n2 = layer_edges.node_ids1, layer_edges.node_ids2 + parents1 = cg.get_roots(n1, stop_layer=parent_layer, time_stamp=time_stamp) + parents2 = cg.get_roots(n2, stop_layer=parent_layer, time_stamp=time_stamp) + mask = parents1 == parents2 + result.append(layer_edges[mask]) + return reduce(lambda x, y: x + y, result, Edges([], [])) diff --git a/pychunkedgraph/graph/edits.py b/pychunkedgraph/graph/edits.py index be2eee1c6..afe1b3abf 100644 --- a/pychunkedgraph/graph/edits.py +++ b/pychunkedgraph/graph/edits.py @@ -5,8 +5,10 @@ from typing import List from typing import Tuple from typing import Iterable +from typing import Set from collections import defaultdict +import fastremap import numpy as np import fastremap @@ -20,27 +22,26 @@ from .utils.serializers import serialize_uint64 from ..logging.log_db import TimeIt from ..utils.general import in2d +from ..debug.utils import sanity_check, sanity_check_single def _init_old_hierarchy(cg, l2ids: np.ndarray, parent_ts: datetime.datetime = None): - new_old_id_d = defaultdict(set) - old_new_id_d = defaultdict(set) old_hierarchy_d = {id_: {2: id_} for id_ in l2ids} for id_ in l2ids: layer_parent_d = cg.get_all_parents_dict(id_, time_stamp=parent_ts) old_hierarchy_d[id_].update(layer_parent_d) for parent in layer_parent_d.values(): old_hierarchy_d[parent] = old_hierarchy_d[id_] - return new_old_id_d, old_new_id_d, old_hierarchy_d + return old_hierarchy_d def _analyze_affected_edges( cg, atomic_edges: Iterable[np.ndarray], parent_ts: datetime.datetime = None ) -> Tuple[Iterable, Dict]: """ - Determine if atomic edges are within the chunk. - If not, they are cross edges between two L2 IDs in adjacent chunks. - Returns edges between L2 IDs and atomic cross edges. + Returns l2 edges within chunk and self edges for nodes in cross chunk edges. + + Also returns new cross edges dicts for nodes crossing chunk boundary. 
""" supervoxels = np.unique(atomic_edges) parents = cg.get_parents(supervoxels, time_stamp=parent_ts) @@ -51,19 +52,18 @@ def _analyze_affected_edges( for edge_ in atomic_edges[edge_layers == 1] ] - # cross chunk edges - atomic_cross_edges_d = defaultdict(lambda: defaultdict(list)) + cross_edges_d = defaultdict(lambda: defaultdict(list)) for layer in range(2, cg.meta.layer_count): layer_edges = atomic_edges[edge_layers == layer] if not layer_edges.size: continue for edge in layer_edges: - parent_1 = sv_parent_d[edge[0]] - parent_2 = sv_parent_d[edge[1]] - atomic_cross_edges_d[parent_1][layer].append(edge) - atomic_cross_edges_d[parent_2][layer].append(edge[::-1]) - parent_edges.extend([[parent_1, parent_1], [parent_2, parent_2]]) - return (parent_edges, atomic_cross_edges_d) + parent0 = sv_parent_d[edge[0]] + parent1 = sv_parent_d[edge[1]] + cross_edges_d[parent0][layer].append([parent0, parent1]) + cross_edges_d[parent1][layer].append([parent1, parent0]) + parent_edges.extend([[parent0, parent0], [parent1, parent1]]) + return parent_edges, cross_edges_d def _get_relevant_components(edges: np.ndarray, supervoxels: np.ndarray) -> Tuple: @@ -89,9 +89,7 @@ def merge_preprocess( parent_ts: datetime.datetime = None, ) -> np.ndarray: """ - Determine if a fake edge needs to be added. - Get subgraph within the bounding box - Add fake edge if there are no inactive edges between two components. + Check and return inactive edges in the subgraph. """ edge_layers = cg.get_cross_chunk_edges_layer(subgraph_edges) active_edges = [types.empty_2d] @@ -142,11 +140,11 @@ def check_fake_edges( ) ) assert len(roots) == 2, "edges must be from 2 roots" - print("found inactive", len(inactive_edges)) return inactive_edges, [] rows = [] supervoxels = atomic_edges.ravel() + # fake edges are stored with l2 chunks chunk_ids = cg.get_chunk_ids_from_node_ids( cg.get_parents(supervoxels, time_stamp=parent_ts) ) @@ -177,7 +175,6 @@ def check_fake_edges( time_stamp=time_stamp, ) ) - print("no inactive", len(atomic_edges)) return atomic_edges, rows @@ -190,21 +187,20 @@ def add_edges( parent_ts: datetime.datetime = None, allow_same_segment_merge=False, ): - edges, l2_atomic_cross_edges_d = _analyze_affected_edges( + edges, l2_cross_edges_d = _analyze_affected_edges( cg, atomic_edges, parent_ts=parent_ts ) l2ids = np.unique(edges) if not allow_same_segment_merge: - assert ( - np.unique(cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts)).size - == 2 - ), "L2 IDs must belong to different roots." - new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( - cg, l2ids, parent_ts=parent_ts - ) + roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) + assert np.unique(roots).size == 2, "L2 IDs must belong to different roots." 
+ + new_old_id_d = defaultdict(set) + old_new_id_d = defaultdict(set) + old_hierarchy_d = _init_old_hierarchy(cg, l2ids, parent_ts=parent_ts) atomic_children_d = cg.get_children(l2ids) - atomic_cross_edges_d = merge_cross_edge_dicts( - cg.get_atomic_cross_edges(l2ids), l2_atomic_cross_edges_d + cross_edges_d = merge_cross_edge_dicts( + cg.get_cross_chunk_edges(l2ids, time_stamp=parent_ts), l2_cross_edges_d ) graph, _, _, graph_ids = flatgraph.build_gt_graph(edges, make_directed=True) @@ -213,20 +209,31 @@ def add_edges( for cc_indices in components: l2ids_ = graph_ids[cc_indices] new_id = cg.id_client.create_node_id(cg.get_chunk_id(l2ids_[0])) - cg.cache.children_cache[new_id] = np.concatenate( - [atomic_children_d[l2id] for l2id in l2ids_] - ) - cg.cache.atomic_cx_edges_cache[new_id] = concatenate_cross_edge_dicts( - [atomic_cross_edges_d[l2id] for l2id in l2ids_] - ) - cache_utils.update( - cg.cache.parents_cache, cg.cache.children_cache[new_id], new_id - ) new_l2_ids.append(new_id) new_old_id_d[new_id].update(l2ids_) for id_ in l2ids_: old_new_id_d[id_].add(new_id) + # update cache + # map parent to new merged children and vice versa + merged_children = np.concatenate([atomic_children_d[l2id] for l2id in l2ids_]) + cg.cache.children_cache[new_id] = merged_children + cache_utils.update(cg.cache.parents_cache, merged_children, new_id) + + # update cross chunk edges by replacing old_ids with new + # this can be done only after all new IDs have been created + for new_id, cc_indices in zip(new_l2_ids, components): + l2ids_ = graph_ids[cc_indices] + new_cx_edges_d = {} + cx_edges = [cross_edges_d[l2id] for l2id in l2ids_] + cx_edges_d = concatenate_cross_edge_dicts(cx_edges, unique=True) + temp_map = {k: next(iter(v)) for k, v in old_new_id_d.items()} + for layer, edges in cx_edges_d.items(): + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + new_cx_edges_d[layer] = edges + assert np.all(edges[:, 0] == new_id) + cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d + create_parents = CreateParentNodes( cg, new_l2_ids=new_l2_ids, @@ -239,24 +246,42 @@ def add_edges( ) new_roots = create_parents.run() - new_entries = create_parents.create_new_entries() - return new_roots, new_l2_ids, new_entries + sanity_check(cg, new_roots, operation_id) + create_parents.create_new_entries() + return new_roots, new_l2_ids, create_parents.new_entries -def _process_l2_agglomeration( +def _split_l2_agglomeration( + cg, + operation_id: int, agg: types.Agglomeration, removed_edges: np.ndarray, - atomic_cross_edges_d: Dict[int, np.ndarray], + parent_ts: datetime.datetime = None, ): """ - For a given L2 id, remove given edges - and calculate new connected components. + For a given L2 id, remove given edges; calculate new connected components. """ chunk_edges = agg.in_edges.get_pairs() - cross_edges = np.concatenate([types.empty_2d, *atomic_cross_edges_d.values()]) chunk_edges = chunk_edges[~in2d(chunk_edges, removed_edges)] - cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] + cross_edges = agg.cross_edges.get_pairs() + # we must avoid the cache to read roots to get segment state before edit began + parents = cg.get_parents(cross_edges[:, 0], time_stamp=parent_ts, raw_only=True) + + # if there are cross edges, there must be a single parent. + # if there aren't any, there must be no parents. XOR these 2 conditions. 
+ err = f"got cross edges from more than one l2 node; op {operation_id}" + assert (np.unique(parents).size == 1) != (cross_edges.size == 0), err + + if cross_edges.size: + # inactive edges must be filtered out + root = cg.get_root(parents[0], time_stamp=parent_ts, raw_only=True) + neighbor_roots = cg.get_roots( + cross_edges[:, 1], raw_only=True, time_stamp=parent_ts + ) + active_mask = neighbor_roots == root + cross_edges = cross_edges[active_mask] + cross_edges = cross_edges[~in2d(cross_edges, removed_edges)] isolated_ids = agg.supervoxels[~np.in1d(agg.supervoxels, chunk_edges)] isolated_edges = np.column_stack((isolated_ids, isolated_ids)) graph, _, _, graph_ids = flatgraph.build_gt_graph( @@ -266,13 +291,13 @@ def _process_l2_agglomeration( def _filter_component_cross_edges( - cc_ids: np.ndarray, cross_edges: np.ndarray, cross_edge_layers: np.ndarray + component_ids: np.ndarray, cross_edges: np.ndarray, cross_edge_layers: np.ndarray ) -> Dict[int, np.ndarray]: """ Filters cross edges for a connected component `cc_ids` from `cross_edges` of the complete chunk. """ - mask = np.in1d(cross_edges[:, 0], cc_ids) + mask = np.in1d(cross_edges[:, 0], component_ids) cross_edges_ = cross_edges[mask] cross_edge_layers_ = cross_edge_layers[mask] edges_d = {} @@ -288,45 +313,57 @@ def remove_edges( cg, *, atomic_edges: Iterable[np.ndarray], - l2id_agglomeration_d: Dict, - operation_id: basetypes.OPERATION_ID = None, + operation_id: basetypes.OPERATION_ID = None, # type: ignore time_stamp: datetime.datetime = None, parent_ts: datetime.datetime = None, ): edges, _ = _analyze_affected_edges(cg, atomic_edges, parent_ts=parent_ts) l2ids = np.unique(edges) - assert ( - np.unique(cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts)).size - == 1 - ), "L2 IDs must belong to same root." - new_old_id_d, old_new_id_d, old_hierarchy_d = _init_old_hierarchy( - cg, l2ids, parent_ts=parent_ts + roots = cg.get_roots(l2ids, assert_roots=True, time_stamp=parent_ts) + assert np.unique(roots).size == 1, "L2 IDs must belong to same root." 
+ + l2id_agglomeration_d, _ = cg.get_l2_agglomerations( + l2ids, active=True, time_stamp=parent_ts ) - l2id_chunk_id_d = dict(zip(l2ids.tolist(), cg.get_chunk_ids_from_node_ids(l2ids))) - atomic_cross_edges_d = cg.get_atomic_cross_edges(l2ids) + new_old_id_d = defaultdict(set) + old_new_id_d = defaultdict(set) + old_hierarchy_d = _init_old_hierarchy(cg, l2ids, parent_ts=parent_ts) + chunk_id_map = dict(zip(l2ids.tolist(), cg.get_chunk_ids_from_node_ids(l2ids))) removed_edges = np.concatenate([atomic_edges, atomic_edges[:, ::-1]], axis=0) new_l2_ids = [] for id_ in l2ids: - l2_agg = l2id_agglomeration_d[id_] - ccs, graph_ids, cross_edges = _process_l2_agglomeration( - l2_agg, removed_edges, atomic_cross_edges_d[id_] + agg = l2id_agglomeration_d[id_] + ccs, graph_ids, cross_edges = _split_l2_agglomeration( + cg, operation_id, agg, removed_edges, parent_ts ) - # calculated here to avoid repeat computation in loop + new_parents = cg.id_client.create_node_ids(chunk_id_map[agg.node_id], len(ccs)) + cross_edge_layers = cg.get_cross_chunk_edges_layer(cross_edges) - new_parent_ids = cg.id_client.create_node_ids( - l2id_chunk_id_d[l2_agg.node_id], len(ccs) - ) for i_cc, cc in enumerate(ccs): - new_id = new_parent_ids[i_cc] - cg.cache.children_cache[new_id] = graph_ids[cc] - cg.cache.atomic_cx_edges_cache[new_id] = _filter_component_cross_edges( - graph_ids[cc], cross_edges, cross_edge_layers - ) - cache_utils.update(cg.cache.parents_cache, graph_ids[cc], new_id) + new_id = new_parents[i_cc] new_l2_ids.append(new_id) new_old_id_d[new_id].add(id_) old_new_id_d[id_].add(new_id) + cg.cache.children_cache[new_id] = graph_ids[cc] + cache_utils.update(cg.cache.parents_cache, graph_ids[cc], new_id) + cg.cache.cross_chunk_edges_cache[new_id] = _filter_component_cross_edges( + graph_ids[cc], cross_edges, cross_edge_layers + ) + + cx_edges_d = cg.get_cross_chunk_edges(new_l2_ids, time_stamp=parent_ts) + for new_id in new_l2_ids: + new_cx_edges_d = cx_edges_d.get(new_id, {}) + for layer, edges in new_cx_edges_d.items(): + svs = np.unique(edges) + parents = cg.get_parents(svs, time_stamp=parent_ts) + temp_map = dict(zip(svs, parents)) + + edges = fastremap.remap(edges, temp_map, preserve_missing_labels=True) + edges = np.unique(edges, axis=0) + new_cx_edges_d[layer] = edges + assert np.all(edges[:, 0] == new_id) + cg.cache.cross_chunk_edges_cache[new_id] = new_cx_edges_d create_parents = CreateParentNodes( cg, @@ -339,8 +376,144 @@ def remove_edges( parent_ts=parent_ts, ) new_roots = create_parents.run() - new_entries = create_parents.create_new_entries() - return new_roots, new_l2_ids, new_entries + sanity_check(cg, new_roots, operation_id) + create_parents.create_new_entries() + return new_roots, new_l2_ids, create_parents.new_entries + + +def _get_flipped_ids(id_map, node_ids): + """ + returns old or new ids according to the map + """ + ids = [ + np.array(list(id_map[id_]), dtype=basetypes.NODE_ID, copy=False) + for id_ in node_ids + ] + ids.append(types.empty_1d) # concatenate needs at least one array + return np.concatenate(ids) + + +def _get_descendants(cg, new_id): + """get all descendants at layers >= 2""" + result = [] + children = cg.get_children(new_id) + while True: + mask = cg.get_chunk_layers(children) >= 2 + children = children[mask] + result.extend(children) + + mask = cg.get_chunk_layers(children) > 2 + children = children[mask] + if children.size == 0: + break + + children = cg.get_children(children, flatten=True) + return result + + +def _update_neighbor_cross_edges_single( + cg, new_id: int, 
cx_edges_d: dict, node_map: dict, *, parent_ts +) -> dict: + """ + For each new_id, get counterparts and update its cross chunk edges. + Some of them maybe updated multiple times so we need to collect them first + and then write to storage to consolidate the mutations. + Returns updated counterparts. + """ + node_layer = cg.get_chunk_layer(new_id) + counterparts = [] + counterpart_layers = {} + for layer in range(node_layer, cg.meta.layer_count): + layer_edges = cx_edges_d.get(layer, types.empty_2d) + counterparts.extend(layer_edges[:, 1]) + layers_d = dict(zip(layer_edges[:, 1], [layer] * len(layer_edges[:, 1]))) + counterpart_layers.update(layers_d) + + cp_cx_edges_d = cg.get_cross_chunk_edges(counterparts, time_stamp=parent_ts) + updated_counterparts = {} + for counterpart, edges_d in cp_cx_edges_d.items(): + val_dict = {} + counterpart_layer = counterpart_layers[counterpart] + for layer in range(node_layer, cg.meta.layer_count): + edges = edges_d.get(layer, types.empty_2d) + if edges.size == 0: + continue + assert np.all(edges[:, 0] == counterpart) + edges = fastremap.remap(edges, node_map, preserve_missing_labels=True) + if layer == counterpart_layer: + reverse_edge = np.array([counterpart, new_id], dtype=basetypes.NODE_ID) + edges = np.concatenate([edges, [reverse_edge]]) + descendants = _get_descendants(cg, new_id) + mask = np.isin(edges[:, 1], descendants) + if np.any(mask): + masked_edges = edges[mask] + masked_edges[:, 1] = new_id + edges[mask] = masked_edges + edges = np.unique(edges, axis=0) + edges_d[layer] = edges + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + if not val_dict: + continue + cg.cache.cross_chunk_edges_cache[counterpart] = edges_d + updated_counterparts[counterpart] = val_dict + return updated_counterparts + + +def _update_neighbor_cross_edges( + cg, + new_ids: List[int], + new_old_id: dict, + old_new_id, + *, + time_stamp, + parent_ts, +) -> List: + """ + For each new_id, get counterparts and update its cross chunk edges. + Some of them maybe updated multiple times so we need to collect them first + and then write to storage to consolidate the mutations. + Returns mutations to updated counterparts/partner nodes. 
+ """ + updated_counterparts = {} + newid_cx_edges_d = cg.get_cross_chunk_edges(new_ids, time_stamp=parent_ts) + node_map = {} + for k, v in old_new_id.items(): + if len(v) == 1: + node_map[k] = next(iter(v)) + + for new_id in new_ids: + cx_edges_d = newid_cx_edges_d[new_id] + m = {old_id: new_id for old_id in _get_flipped_ids(new_old_id, [new_id])} + node_map.update(m) + result = _update_neighbor_cross_edges_single( + cg, new_id, cx_edges_d, node_map, parent_ts=parent_ts + ) + updated_counterparts.update(result) + updated_entries = [] + for node, val_dict in updated_counterparts.items(): + rowkey = serialize_uint64(node) + row = cg.client.mutate_row(rowkey, val_dict, time_stamp=time_stamp) + updated_entries.append(row) + return updated_entries + + +def get_supervoxels(cg, node_ids): + """Returns the first supervoxel found for each node_id.""" + result = {} + node_ids_copy = np.copy(node_ids) + children = np.copy(node_ids) + children_d = cg.get_children(node_ids) + while True: + children = [children_d[k][0] for k in children] + children = np.array(children, dtype=basetypes.NODE_ID) + mask = cg.get_chunk_layers(children) == 1 + result.update([(node, sv) for node, sv in zip(node_ids[mask], children[mask])]) + node_ids = node_ids[~mask] + children = children[~mask] + if children.size == 0: + break + children_d = cg.get_children(children) + return np.array([result[k] for k in node_ids_copy], dtype=basetypes.NODE_ID) class CreateParentNodes: @@ -349,31 +522,32 @@ def __init__( cg, *, new_l2_ids: Iterable, - operation_id: basetypes.OPERATION_ID, + operation_id: basetypes.OPERATION_ID, # type: ignore time_stamp: datetime.datetime, - new_old_id_d: Dict[np.uint64, Iterable[np.uint64]] = None, - old_new_id_d: Dict[np.uint64, Iterable[np.uint64]] = None, + new_old_id_d: Dict[np.uint64, Set[np.uint64]] = None, + old_new_id_d: Dict[np.uint64, Set[np.uint64]] = None, old_hierarchy_d: Dict[np.uint64, Dict[int, np.uint64]] = None, parent_ts: datetime.datetime = None, ): self.cg = cg + self.new_entries = [] self._new_l2_ids = new_l2_ids self._old_hierarchy_d = old_hierarchy_d self._new_old_id_d = new_old_id_d self._old_new_id_d = old_new_id_d - self._new_ids_d = defaultdict(list) # new IDs in each layer - self._cross_edges_d = {} + self._new_ids_d = defaultdict(list) self._operation_id = operation_id self._time_stamp = time_stamp self._last_successful_ts = parent_ts def _update_id_lineage( self, - parent: basetypes.NODE_ID, + parent: basetypes.NODE_ID, # type: ignore children: np.ndarray, layer: int, parent_layer: int, ): + # update newly created children; mask others mask = np.in1d(children, self._new_ids_d[layer]) for child_id in children[mask]: child_old_ids = self._new_old_id_d[child_id] @@ -382,90 +556,73 @@ def _update_id_lineage( self._new_old_id_d[parent].add(old_id) self._old_new_id_d[old_id].add(parent) - def _get_old_ids(self, new_ids): - old_ids = [ - np.array(list(self._new_old_id_d[id_]), dtype=basetypes.NODE_ID) - for id_ in new_ids - ] - return np.concatenate(old_ids) - - def _map_sv_to_parent(self, node_ids, layer, node_map=None): - sv_parent_d = {} - sv_cross_edges = [types.empty_2d] - if node_map is None: - node_map = {} - for id_ in node_ids: - id_eff = node_map.get(id_, id_) - edges_ = self._cross_edges_d[id_].get(layer, types.empty_2d) - sv_parent_d.update(dict(zip(edges_[:, 0], [id_eff] * len(edges_)))) - sv_cross_edges.append(edges_) - return sv_parent_d, np.concatenate(sv_cross_edges) - - def _get_connected_components( - self, node_ids: np.ndarray, layer: int, lower_layer_ids: 
np.ndarray - ): - _node_ids = np.concatenate([node_ids, lower_layer_ids]) - cached = np.fromiter(self._cross_edges_d.keys(), dtype=basetypes.NODE_ID) - not_cached = _node_ids[~np.in1d(_node_ids, cached)] - + def _get_connected_components(self, node_ids: np.ndarray, layer: int): with TimeIt( f"get_cross_chunk_edges.{layer}", self.cg.graph_id, self._operation_id, ): - self._cross_edges_d.update( - self.cg.get_cross_chunk_edges(not_cached, all_layers=True) + cross_edges_d = self.cg.get_cross_chunk_edges( + node_ids, time_stamp=self._last_successful_ts ) - sv_parent_d, sv_cross_edges = self._map_sv_to_parent(node_ids, layer) - get_sv_parents = np.vectorize(sv_parent_d.get, otypes=[np.uint64]) - try: - cross_edges = get_sv_parents(sv_cross_edges) - except TypeError: # NoneType error - # if there is a missing parent, try including lower layer ids - # this can happen due to skip connections - - # we want to map all these lower IDs to the current layer - lower_layer_to_layer = self.cg.get_roots( - lower_layer_ids, stop_layer=layer, ceil=False - ) - node_map = {k: v for k, v in zip(lower_layer_ids, lower_layer_to_layer)} - sv_parent_d, sv_cross_edges = self._map_sv_to_parent( - _node_ids, layer, node_map=node_map - ) - get_sv_parents = np.vectorize(sv_parent_d.get, otypes=[np.uint64]) - cross_edges = get_sv_parents(sv_cross_edges) - - cross_edges = np.concatenate([cross_edges, np.vstack([node_ids, node_ids]).T]) - graph, _, _, graph_ids = flatgraph.build_gt_graph( - cross_edges, make_directed=True - ) + cx_edges = [types.empty_2d] + for id_ in node_ids: + edges_ = cross_edges_d[id_].get(layer, types.empty_2d) + cx_edges.append(edges_) + cx_edges = np.concatenate([*cx_edges, np.vstack([node_ids, node_ids]).T]) + graph, _, _, graph_ids = flatgraph.build_gt_graph(cx_edges, make_directed=True) return flatgraph.connected_components(graph), graph_ids def _get_layer_node_ids( self, new_ids: np.ndarray, layer: int ) -> Tuple[np.ndarray, np.ndarray]: # get old identities of new IDs - old_ids = self._get_old_ids(new_ids) + old_ids = _get_flipped_ids(self._new_old_id_d, new_ids) # get their parents, then children of those parents - node_ids = self.cg.get_children( - np.unique( - self.cg.get_parents(old_ids, time_stamp=self._last_successful_ts) - ), - flatten=True, - ) + old_parents = self.cg.get_parents(old_ids, time_stamp=self._last_successful_ts) + siblings = self.cg.get_children(np.unique(old_parents), flatten=True) # replace old identities with new IDs - mask = np.in1d(node_ids, old_ids) + mask = np.in1d(siblings, old_ids) node_ids = np.concatenate( - [ - np.array(list(self._old_new_id_d[id_]), dtype=basetypes.NODE_ID) - for id_ in node_ids[mask] - ] - + [node_ids[~mask], new_ids] + [_get_flipped_ids(self._old_new_id_d, old_ids), siblings[~mask], new_ids] ) node_ids = np.unique(node_ids) layer_mask = self.cg.get_chunk_layers(node_ids) == layer - return node_ids[layer_mask], node_ids[~layer_mask] + return node_ids[layer_mask] + # return node_ids + + def _update_cross_edge_cache(self, parent, children): + """ + updates cross chunk edges in cache; + this can only be done after all new components at a layer have IDs + """ + parent_layer = self.cg.get_chunk_layer(parent) + if parent_layer == 2: + # l2 cross edges have already been updated by this point + return + cx_edges_d = self.cg.get_cross_chunk_edges( + children, time_stamp=self._last_successful_ts + ) + cx_edges_d = concatenate_cross_edge_dicts(cx_edges_d.values()) + edge_nodes = np.unique(np.concatenate([*cx_edges_d.values(), types.empty_2d])) + 
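# Illustrative only: the endpoint-rewrite pattern used throughout these edits.
# `fastremap.remap` relabels an edge array in one pass; `preserve_missing_labels=True`
# leaves IDs that are not in the map untouched. IDs below are made up.
import numpy as np
import fastremap

edges = np.array([[11, 33], [11, 44]], dtype=np.uint64)
node_map = {33: 333}                        # e.g. old neighbor ID -> its new parent
edges = fastremap.remap(edges, node_map, preserve_missing_labels=True)
assert edges.tolist() == [[11, 333], [11, 44]]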
edge_supervoxels = get_supervoxels(self.cg, edge_nodes) + edge_parents = self.cg.get_roots( + edge_supervoxels, + stop_layer=parent_layer, + ceil=False, + time_stamp=self._last_successful_ts, + ) + edge_parents_d = dict(zip(edge_nodes, edge_parents)) + new_cx_edges_d = {} + for layer in range(parent_layer, self.cg.meta.layer_count): + edges = cx_edges_d.get(layer, types.empty_2d) + if len(edges) == 0: + continue + edges = fastremap.remap(edges, edge_parents_d, preserve_missing_labels=True) + new_cx_edges_d[layer] = np.unique(edges, axis=0) + assert np.all(edges[:, 0] == parent), f"{parent}, {np.unique(edges[:, 0])}" + self.cg.cache.cross_chunk_edges_cache[parent] = new_cx_edges_d def _create_new_parents(self, layer: int): """ @@ -478,33 +635,45 @@ def _create_new_parents(self, layer: int): update parent old IDs """ new_ids = self._new_ids_d[layer] - layer_node_ids, lower_layer_ids = self._get_layer_node_ids(new_ids, layer) - components, graph_ids = self._get_connected_components( - layer_node_ids, layer, lower_layer_ids - ) + layer_node_ids = self._get_layer_node_ids(new_ids, layer) + components, graph_ids = self._get_connected_components(layer_node_ids, layer) for cc_indices in components: - parent_layer = layer + 1 + parent_layer = layer + 1 # must be reset for each connected component cc_ids = graph_ids[cc_indices] if len(cc_ids) == 1: # skip connection parent_layer = self.cg.meta.layer_count for l in range(layer + 1, self.cg.meta.layer_count): - if len(self._cross_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: + cx_edges_d = self.cg.get_cross_chunk_edges( + [cc_ids[0]], time_stamp=self._last_successful_ts + ) + if len(cx_edges_d[cc_ids[0]].get(l, types.empty_2d)) > 0: parent_layer = l break - - parent_id = self.cg.id_client.create_node_id( + parent = self.cg.id_client.create_node_id( self.cg.get_parent_chunk_id(cc_ids[0], parent_layer), root_chunk=parent_layer == self.cg.meta.layer_count, ) - self._new_ids_d[parent_layer].append(parent_id) - self.cg.cache.children_cache[parent_id] = cc_ids - cache_utils.update( - self.cg.cache.parents_cache, - cc_ids, - parent_id, - ) - self._update_id_lineage(parent_id, cc_ids, layer, parent_layer) + self._new_ids_d[parent_layer].append(parent) + self._update_id_lineage(parent, cc_ids, layer, parent_layer) + self.cg.cache.children_cache[parent] = cc_ids + cache_utils.update(self.cg.cache.parents_cache, cc_ids, parent) + + try: + sanity_check_single(self.cg, parent, self._operation_id) + except AssertionError: + from pychunkedgraph.debug.utils import get_l2children + + pairs = [ + (a, b) for idx, a in enumerate(cc_ids) for b in cc_ids[idx + 1 :] + ] + for c1, c2 in pairs: + l2c1 = get_l2children(self.cg, c1) + l2c2 = get_l2children(self.cg, c2) + if np.intersect1d(l2c1, l2c2).size: + c = np.intersect1d(l2c1, l2c2) + msg = f"{self._operation_id}: {layer} {c1} {c2} have common children {c}" + raise ValueError(msg) def run(self) -> Iterable: """ @@ -515,28 +684,38 @@ def run(self) -> Iterable: for layer in range(2, self.cg.meta.layer_count): if len(self._new_ids_d[layer]) == 0: continue - with TimeIt( - f"create_new_parents_layer.{layer}", - self.cg.graph_id, - self._operation_id, - ): + # all new IDs in this layer have been created + # update their cross chunk edges and their neighbors' + m = f"create_new_parents_layer.{layer}" + with TimeIt(m, self.cg.graph_id, self._operation_id): + for new_id in self._new_ids_d[layer]: + children = self.cg.get_children(new_id) + self._update_cross_edge_cache(new_id, children) + entries = _update_neighbor_cross_edges( 
+ self.cg, + self._new_ids_d[layer], + self._new_old_id_d, + self._old_new_id_d, + time_stamp=self._time_stamp, + parent_ts=self._last_successful_ts, + ) + self.new_entries.extend(entries) self._create_new_parents(layer) return self._new_ids_d[self.cg.meta.layer_count] def _update_root_id_lineage(self): - new_root_ids = self._new_ids_d[self.cg.meta.layer_count] - former_root_ids = self._get_old_ids(new_root_ids) - former_root_ids = np.unique(former_root_ids) - assert ( - len(former_root_ids) < 2 or len(new_root_ids) < 2 - ), "Something went wrong." - rows = [] - for new_root_id in new_root_ids: + new_roots = self._new_ids_d[self.cg.meta.layer_count] + former_roots = _get_flipped_ids(self._new_old_id_d, new_roots) + former_roots = np.unique(former_roots) + + err = f"new roots are inconsistent; op {self._operation_id}" + assert len(former_roots) < 2 or len(new_roots) < 2, err + for new_root_id in new_roots: val_dict = { - attributes.Hierarchy.FormerParent: np.array(former_root_ids), + attributes.Hierarchy.FormerParent: former_roots, attributes.OperationLogs.OperationID: self._operation_id, } - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(new_root_id), val_dict, @@ -544,44 +723,48 @@ def _update_root_id_lineage(self): ) ) - for former_root_id in former_root_ids: + for former_root_id in former_roots: val_dict = { - attributes.Hierarchy.NewParent: np.array(new_root_ids), + attributes.Hierarchy.NewParent: np.array( + new_roots, dtype=basetypes.NODE_ID + ), attributes.OperationLogs.OperationID: self._operation_id, } - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(former_root_id), val_dict, time_stamp=self._time_stamp, ) ) - return rows - def _get_atomic_cross_edges_val_dict(self): - new_ids = np.array(self._new_ids_d[2], dtype=basetypes.NODE_ID) + def _get_cross_edges_val_dicts(self): val_dicts = {} - atomic_cross_edges_d = self.cg.get_atomic_cross_edges(new_ids) - for id_ in new_ids: - val_dict = {} - for layer, edges in atomic_cross_edges_d[id_].items(): - val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges - val_dicts[id_] = val_dict + for layer in range(2, self.cg.meta.layer_count): + new_ids = np.array(self._new_ids_d[layer], dtype=basetypes.NODE_ID) + cross_edges_d = self.cg.get_cross_chunk_edges( + new_ids, time_stamp=self._last_successful_ts + ) + for id_ in new_ids: + val_dict = {} + for layer, edges in cross_edges_d[id_].items(): + val_dict[attributes.Connectivity.CrossChunkEdge[layer]] = edges + val_dicts[id_] = val_dict return val_dicts def create_new_entries(self) -> List: - rows = [] - val_dicts = self._get_atomic_cross_edges_val_dict() + val_dicts = self._get_cross_edges_val_dicts() for layer in range(2, self.cg.meta.layer_count + 1): new_ids = self._new_ids_d[layer] for id_ in new_ids: val_dict = val_dicts.get(id_, {}) children = self.cg.get_children(id_) + err = f"parent layer less than children; op {self._operation_id}" assert np.max( self.cg.get_chunk_layers(children) - ) < self.cg.get_chunk_layer(id_), "Parent layer less than children." 
+ ) < self.cg.get_chunk_layer(id_), err val_dict[attributes.Hierarchy.Child] = children - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(id_), val_dict, @@ -589,11 +772,11 @@ def create_new_entries(self) -> List: ) ) for child_id in children: - rows.append( + self.new_entries.append( self.cg.client.mutate_row( serialize_uint64(child_id), {attributes.Hierarchy.Parent: id_}, time_stamp=self._time_stamp, ) ) - return rows + self._update_root_id_lineage() + self._update_root_id_lineage() diff --git a/pychunkedgraph/graph/misc.py b/pychunkedgraph/graph/misc.py index b33e8a6fd..0f53c71c3 100644 --- a/pychunkedgraph/graph/misc.py +++ b/pychunkedgraph/graph/misc.py @@ -8,7 +8,6 @@ import fastremap import numpy as np -from multiwrapper import multiprocessing_utils as mu from . import ChunkedGraph from . import attributes @@ -51,22 +50,6 @@ def _read_delta_root_rows( return new_root_ids, expired_root_ids -def _read_root_rows_thread(args) -> list: - start_seg_id, end_seg_id, serialized_cg_info, time_stamp = args - cg = ChunkedGraph(**serialized_cg_info) - start_id = cg.get_node_id(segment_id=start_seg_id, chunk_id=cg.root_chunk_id) - end_id = cg.get_node_id(segment_id=end_seg_id, chunk_id=cg.root_chunk_id) - rows = cg.client.read_nodes( - start_id=start_id, - end_id=end_id, - end_id_inclusive=False, - end_time=time_stamp, - end_time_inclusive=True, - ) - root_ids = [k for (k, v) in rows.items() if attributes.Hierarchy.NewParent not in v] - return root_ids - - def get_proofread_root_ids( cg: ChunkedGraph, start_time: Optional[datetime.datetime] = None, @@ -94,43 +77,12 @@ def get_proofread_root_ids( def get_latest_roots( - cg, time_stamp: Optional[datetime.datetime] = None, n_threads: int = 1 + cg: ChunkedGraph, time_stamp: Optional[datetime.datetime] = None, n_threads: int = 1 ) -> Sequence[np.uint64]: - # Create filters: time and id range - max_seg_id = cg.get_max_seg_id(cg.root_chunk_id) + 1 - n_blocks = 1 if n_threads == 1 else int(np.min([n_threads * 3 + 1, max_seg_id])) - seg_id_blocks = np.linspace(1, max_seg_id, n_blocks + 1, dtype=np.uint64) - cg_serialized_info = cg.get_serialized_info() - if n_threads > 1: - del cg_serialized_info["credentials"] - - multi_args = [] - for i_id_block in range(0, len(seg_id_blocks) - 1): - multi_args.append( - [ - seg_id_blocks[i_id_block], - seg_id_blocks[i_id_block + 1], - cg_serialized_info, - time_stamp, - ] - ) - - if n_threads == 1: - results = mu.multiprocess_func( - _read_root_rows_thread, - multi_args, - n_threads=n_threads, - verbose=False, - debug=n_threads == 1, - ) - else: - results = mu.multisubprocess_func( - _read_root_rows_thread, multi_args, n_threads=n_threads - ) - root_ids = [] - for result in results: - root_ids.extend(result) - return np.array(root_ids, dtype=np.uint64) + root_chunk = cg.get_chunk_id(layer=cg.meta.layer_count, x=0, y=0, z=0) + rr = cg.range_read_chunk(root_chunk, time_stamp=time_stamp) + roots = [k for k, v in rr.items() if attributes.Hierarchy.NewParent not in v] + return np.array(roots, dtype=np.uint64) def get_delta_roots( @@ -202,7 +154,6 @@ def get_contact_sites( # Load edges of these cs_svs edges_cs_svs_rows = cg.client.read_nodes( node_ids=u_cs_svs, - # columns=[attributes.Connectivity.Partner, attributes.Connectivity.Connected], ) pre_cs_edges = [] for ri in edges_cs_svs_rows.items(): diff --git a/pychunkedgraph/graph/operation.py b/pychunkedgraph/graph/operation.py index d0d0e172a..8c5d4484e 100644 --- a/pychunkedgraph/graph/operation.py +++ 
b/pychunkedgraph/graph/operation.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name, missing-docstring, too-many-lines, protected-access +# pylint: disable=invalid-name, missing-docstring, too-many-lines, protected-access, broad-exception-raised from abc import ABC, abstractmethod from collections import namedtuple @@ -457,6 +457,9 @@ def execute( except PostconditionError as err: self.cg.cache = None raise PostconditionError(err) from err + except (AssertionError, RuntimeError) as err: + self.cg.cache = None + raise RuntimeError(err) from err except Exception as err: # unknown exception, update log record with error self.cg.cache = None @@ -469,7 +472,7 @@ def execute( exception=repr(err), ) self.cg.client.write([log_record_error]) - raise Exception(err) + raise Exception(err) from err with TimeIt(f"{op_type}.write", self.cg.graph_id, lock.operation_id): result = self._write( @@ -612,13 +615,16 @@ def _apply( edges_only=True, ) - with TimeIt("preprocess", self.cg.graph_id, operation_id): - inactive_edges = edits.merge_preprocess( - self.cg, - subgraph_edges=edges, - supervoxels=self.added_edges.ravel(), - parent_ts=self.parent_ts, - ) + if self.allow_same_segment_merge: + inactive_edges = types.empty_2d + else: + with TimeIt("preprocess", self.cg.graph_id, operation_id): + inactive_edges = edits.merge_preprocess( + self.cg, + subgraph_edges=edges, + supervoxels=self.added_edges.ravel(), + parent_ts=self.parent_ts, + ) atomic_edges, fake_edge_rows = edits.check_fake_edges( self.cg, @@ -634,6 +640,7 @@ def _apply( operation_id=operation_id, time_stamp=timestamp, parent_ts=self.parent_ts, + allow_same_segment_merge=self.allow_same_segment_merge, ) return new_roots, new_l2_ids, fake_edge_rows + new_entries @@ -744,18 +751,11 @@ def _apply( ): raise PreconditionError("Supervoxels must belong to the same object.") - with TimeIt("subgraph", self.cg.graph_id, operation_id): - l2id_agglomeration_d, _ = self.cg.get_l2_agglomerations( - self.cg.get_parents( - self.removed_edges.ravel(), time_stamp=self.parent_ts - ), - ) with TimeIt("remove_edges", self.cg.graph_id, operation_id): return edits.remove_edges( self.cg, operation_id=operation_id, atomic_edges=self.removed_edges, - l2id_agglomeration_d=l2id_agglomeration_d, time_stamp=timestamp, parent_ts=self.parent_ts, ) @@ -892,11 +892,11 @@ def _apply( self.cg.meta.split_bounding_offset, ) with TimeIt("get_subgraph", self.cg.graph_id, operation_id): - l2id_agglomeration_d, edges = self.cg.get_subgraph( + l2id_agglomeration_d, edges_tuple = self.cg.get_subgraph( root_ids.pop(), bbox=bbox, bbox_is_coordinate=True ) - edges = reduce(lambda x, y: x + y, edges, Edges([], [])) + edges = reduce(lambda x, y: x + y, edges_tuple, Edges([], [])) supervoxels = np.concatenate( [agg.supervoxels for agg in l2id_agglomeration_d.values()] ) @@ -922,7 +922,6 @@ def _apply( self.cg, operation_id=operation_id, atomic_edges=self.removed_edges, - l2id_agglomeration_d=l2id_agglomeration_d, time_stamp=timestamp, parent_ts=self.parent_ts, ) diff --git a/pychunkedgraph/graph/subgraph.py b/pychunkedgraph/graph/subgraph.py index ab2593175..1538b3cc2 100644 --- a/pychunkedgraph/graph/subgraph.py +++ b/pychunkedgraph/graph/subgraph.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel + from typing import List from typing import Dict from typing import Tuple @@ -30,9 +32,7 @@ def __init__(self, meta, node_ids, return_layers, serializable): # "Frontier" of nodes that cg.get_children will be called on self.cur_nodes = 
np.array(list(node_ids), dtype=np.uint64) # Mapping of current frontier to self.node_ids - self.cur_nodes_to_original_nodes = dict( - zip(self.cur_nodes, self.cur_nodes) - ) + self.cur_nodes_to_original_nodes = dict(zip(self.cur_nodes, self.cur_nodes)) self.stop_layer = max(1, min(return_layers)) self.create_initial_node_to_subgraph() @@ -107,13 +107,11 @@ def flatten_subgraph(self): for node_id in self.node_ids: for return_layer in self.return_layers: node_key = self.get_dict_key(node_id) - children_at_layer = self.node_to_subgraph[node_key][ - return_layer - ] + children_at_layer = self.node_to_subgraph[node_key][return_layer] if len(children_at_layer) > 0: - self.node_to_subgraph[node_key][ - return_layer - ] = np.concatenate(children_at_layer) + self.node_to_subgraph[node_key][return_layer] = np.concatenate( + children_at_layer + ) else: self.node_to_subgraph[node_key][return_layer] = empty_1d @@ -123,10 +121,12 @@ def get_subgraph_nodes( node_id_or_ids: Union[np.uint64, Iterable], bbox: Optional[Sequence[Sequence[int]]] = None, bbox_is_coordinate: bool = False, - return_layers: List = [2], + return_layers: List = None, serializable: bool = False, - return_flattened: bool = False + return_flattened: bool = False, ) -> Tuple[Dict, Dict, Edges]: + if return_layers is None: + return_layers = [2] single = False node_ids = node_id_or_ids bbox = normalize_bounding_box(cg.meta, bbox, bbox_is_coordinate) @@ -139,7 +139,7 @@ def get_subgraph_nodes( bounding_box=bbox, return_layers=return_layers, serializable=serializable, - return_flattened=return_flattened + return_flattened=return_flattened, ) if single: if serializable: @@ -155,7 +155,7 @@ def get_subgraph_edges_and_leaves( bbox_is_coordinate: bool = False, edges_only: bool = False, leaves_only: bool = False, -) -> Tuple[Dict, Dict, Edges]: +) -> Tuple[Dict, Tuple[Edges]]: """Get the edges and/or leaves of the specified node_ids within the specified bounding box.""" from .types import empty_1d @@ -183,7 +183,7 @@ def _get_subgraph_multiple_nodes( bounding_box: Optional[Sequence[Sequence[int]]], return_layers: Sequence[int], serializable: bool = False, - return_flattened: bool = False + return_flattened: bool = False, ): from collections import ChainMap from multiwrapper.multiprocessing_utils import n_cpus @@ -223,9 +223,7 @@ def _get_subgraph_multiple_nodes_threaded( subgraph = SubgraphProgress(cg.meta, node_ids, return_layers, serializable) while not subgraph.done_processing(): - this_n_threads = min( - [int(len(subgraph.cur_nodes) // 50000) + 1, n_cpus] - ) + this_n_threads = min([int(len(subgraph.cur_nodes) // 50000) + 1, n_cpus]) cur_nodes_child_maps = multithread_func( _get_subgraph_multiple_nodes_threaded, np.array_split(subgraph.cur_nodes, this_n_threads), @@ -239,8 +237,6 @@ def _get_subgraph_multiple_nodes_threaded( for node_id in node_ids: subgraph.node_to_subgraph[ _get_dict_key(node_id) - ] = subgraph.node_to_subgraph[_get_dict_key(node_id)][ - return_layers[0] - ] + ] = subgraph.node_to_subgraph[_get_dict_key(node_id)][return_layers[0]] - return subgraph.node_to_subgraph \ No newline at end of file + return subgraph.node_to_subgraph diff --git a/pychunkedgraph/graph/types.py b/pychunkedgraph/graph/types.py index 9a551f35c..1f35e5f6b 100644 --- a/pychunkedgraph/graph/types.py +++ b/pychunkedgraph/graph/types.py @@ -1,5 +1,4 @@ -from typing import Dict -from typing import Iterable +# pylint: disable=invalid-name, missing-docstring from collections import namedtuple import numpy as np diff --git 
a/pychunkedgraph/graph/utils/basetypes.py b/pychunkedgraph/graph/utils/basetypes.py index e55324e6a..c6b0b1974 100644 --- a/pychunkedgraph/graph/utils/basetypes.py +++ b/pychunkedgraph/graph/utils/basetypes.py @@ -1,16 +1,16 @@ import numpy as np -CHUNK_ID = SEGMENT_ID = NODE_ID = OPERATION_ID = np.dtype('uint64').newbyteorder('L') -EDGE_AFFINITY = np.dtype('float32').newbyteorder('L') -EDGE_AREA = np.dtype('uint64').newbyteorder('L') +CHUNK_ID = SEGMENT_ID = NODE_ID = OPERATION_ID = np.dtype("uint64").newbyteorder("L") +EDGE_AFFINITY = np.dtype("float32").newbyteorder("L") +EDGE_AREA = np.dtype("uint64").newbyteorder("L") -COUNTER = np.dtype('int64').newbyteorder('B') +COUNTER = np.dtype("int64").newbyteorder("B") -COORDINATES = np.dtype('int64').newbyteorder('L') -CHUNKSIZE = np.dtype('uint64').newbyteorder('L') -FANOUT = np.dtype('uint64').newbyteorder('L') -LAYERCOUNT = np.dtype('uint64').newbyteorder('L') -SPATIALBITS = np.dtype('uint64').newbyteorder('L') -ROOTCOUNTERBITS = np.dtype('uint64').newbyteorder('L') -SKIPCONNECTIONS = np.dtype('uint64').newbyteorder('L') \ No newline at end of file +COORDINATES = np.dtype("int64").newbyteorder("L") +CHUNKSIZE = np.dtype("uint64").newbyteorder("L") +FANOUT = np.dtype("uint64").newbyteorder("L") +LAYERCOUNT = np.dtype("uint64").newbyteorder("L") +SPATIALBITS = np.dtype("uint64").newbyteorder("L") +ROOTCOUNTERBITS = np.dtype("uint64").newbyteorder("L") +SKIPCONNECTIONS = np.dtype("uint64").newbyteorder("L") diff --git a/pychunkedgraph/graph/utils/flatgraph.py b/pychunkedgraph/graph/utils/flatgraph.py index df469d728..03cb6e2d2 100644 --- a/pychunkedgraph/graph/utils/flatgraph.py +++ b/pychunkedgraph/graph/utils/flatgraph.py @@ -1,8 +1,11 @@ +# pylint: disable=invalid-name, missing-docstring, c-extension-no-member + +from itertools import combinations, chain + import fastremap import numpy as np -from itertools import combinations, chain from graph_tool import Graph, GraphView -from graph_tool import topology, search +from graph_tool import topology def build_gt_graph( @@ -88,7 +91,10 @@ def team_paths_all_to_all(graph, capacity, team_vertex_ids): def neighboring_edges(graph, vertex_id): - """Returns vertex and edge lists of a seed vertex, in the same format as team_paths_all_to_all.""" + """ + Returns vertex and edge lists of a seed vertex, + in the same format as team_paths_all_to_all. 
+ """ add_v = [] add_e = [] v0 = graph.vertex(vertex_id) @@ -124,7 +130,8 @@ def compute_filtered_paths( gfilt, capacity, team_vertex_ids ) - # graph-tool will invalidate the vertex and edge properties if I don't rebase them on the main graph + # graph-tool will invalidate the vertex and + # edge properties if I don't rebase them on the main graph # before tearing down the GraphView new_paths_e = [] for pth in paths_e: diff --git a/pychunkedgraph/ingest/__init__.py b/pychunkedgraph/ingest/__init__.py index b3d832d5e..55c10ca5f 100644 --- a/pychunkedgraph/ingest/__init__.py +++ b/pychunkedgraph/ingest/__init__.py @@ -1,32 +1,16 @@ +import logging from collections import namedtuple - -_cluster_ingest_config_fields = ( - "ATOMIC_Q_NAME", - "ATOMIC_Q_LIMIT", - "ATOMIC_Q_INTERVAL", -) -_cluster_ingest_defaults = ( - "l2", - 100000, - 120, -) -ClusterIngestConfig = namedtuple( - "ClusterIngestConfig", - _cluster_ingest_config_fields, - defaults=_cluster_ingest_defaults, -) - +logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO) _ingestconfig_fields = ( - "CLUSTER", # cluster config "AGGLOMERATION", "WATERSHED", "USE_RAW_EDGES", "USE_RAW_COMPONENTS", "TEST_RUN", ) -_ingestconfig_defaults = (None, None, None, False, False, False) +_ingestconfig_defaults = (None, None, False, False, False) IngestConfig = namedtuple( "IngestConfig", _ingestconfig_fields, defaults=_ingestconfig_defaults ) diff --git a/pychunkedgraph/ingest/cli.py b/pychunkedgraph/ingest/cli.py index 7668e8f24..c50525ec6 100644 --- a/pychunkedgraph/ingest/cli.py +++ b/pychunkedgraph/ingest/cli.py @@ -1,24 +1,32 @@ +# pylint: disable=invalid-name, missing-function-docstring, unspecified-encoding + """ cli for running ingest """ -from os import environ -from time import sleep +import logging import click import yaml from flask.cli import AppGroup -from rq import Queue +from .cluster import create_atomic_chunk, create_parent_chunk, enqueue_l2_tasks from .manager import IngestionManager -from .utils import bootstrap -from .cluster import randomize_grid_points +from .utils import ( + bootstrap, + chunk_id_str, + print_completion_rate, + print_status, + queue_layer_helper, + job_type_guard, +) +from .simple_tests import run_all +from .create.parent_layer import add_parent_chunk from ..graph.chunkedgraph import ChunkedGraph -from ..utils.redis import get_redis_connection -from ..utils.redis import keys as r_keys -from ..utils.general import chunked +from ..utils.redis import get_redis_connection, keys as r_keys -ingest_cli = AppGroup("ingest") +group_name = "ingest" +ingest_cli = AppGroup(group_name) def init_ingest_cmds(app): @@ -26,6 +34,8 @@ def init_ingest_cmds(app): @ingest_cli.command("flush_redis") +@click.confirmation_option(prompt="Are you sure you want to flush redis?") +@job_type_guard(group_name) def flush_redis(): """FLush redis db.""" redis = get_redis_connection() @@ -35,9 +45,10 @@ def flush_redis(): @ingest_cli.command("graph") @click.argument("graph_id", type=str) @click.argument("dataset", type=click.Path(exists=True)) -@click.option("--raw", is_flag=True) -@click.option("--test", is_flag=True) -@click.option("--retry", is_flag=True) +@click.option("--raw", is_flag=True, help="Read edges from agglomeration output.") +@click.option("--test", is_flag=True, help="Test 8 chunks at the center of dataset.") +@click.option("--retry", is_flag=True, help="Rerun without creating a new table.") +@job_type_guard(group_name) def ingest_graph( graph_id: str, dataset: click.Path, raw: bool, test: bool, retry: bool 
): @@ -45,27 +56,28 @@ def ingest_graph( Main ingest command. Takes ingest config from a yaml file and queues atomic tasks. """ - from .cluster import enqueue_atomic_tasks - + redis = get_redis_connection() + redis.set(r_keys.JOB_TYPE, group_name) with open(dataset, "r") as stream: config = yaml.safe_load(stream) - meta, ingest_config, client_info = bootstrap( - graph_id, - config=config, - raw=raw, - test_run=test, - ) + if test: + logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.DEBUG) + + meta, ingest_config, client_info = bootstrap(graph_id, config, raw, test) cg = ChunkedGraph(meta=meta, client_info=client_info) if not retry: cg.create() - enqueue_atomic_tasks(IngestionManager(ingest_config, meta)) + + imanager = IngestionManager(ingest_config, meta) + enqueue_l2_tasks(imanager, create_atomic_chunk) @ingest_cli.command("imanager") @click.argument("graph_id", type=str) @click.argument("dataset", type=click.Path(exists=True)) @click.option("--raw", is_flag=True) +@job_type_guard(group_name) def pickle_imanager(graph_id: str, dataset: click.Path, raw: bool): """ Load ingest config into redis server. @@ -79,96 +91,51 @@ def pickle_imanager(graph_id: str, dataset: click.Path, raw: bool): meta, ingest_config, _ = bootstrap(graph_id, config=config, raw=raw) imanager = IngestionManager(ingest_config, meta) - imanager.redis + imanager.redis.set(r_keys.JOB_TYPE, group_name) @ingest_cli.command("layer") @click.argument("parent_layer", type=int) +@job_type_guard(group_name) def queue_layer(parent_layer): """ Queue all chunk tasks at a given layer. Must be used when all the chunks at `parent_layer - 1` have completed. """ - from itertools import product - import numpy as np - from .cluster import create_parent_chunk - from .utils import chunk_id_str - assert parent_layer > 2, "This command is for layers 3 and above." 
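# Illustrative only: a rough sketch of what a guard like `job_type_guard` could do.
# The real decorator lives in pychunkedgraph/ingest/utils.py and may differ; the idea
# implied by this diff is that `ingest graph` / `upgrade graph` record their job type in
# redis, and guarded commands refuse to run if the redis db belongs to the other job type.
# `get_redis_connection` and `r_keys` are the imports already used in this module.
from functools import wraps

def job_type_guard_sketch(job_type):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            current = get_redis_connection().get(r_keys.JOB_TYPE)
            if current is not None and current.decode() != job_type:
                raise SystemExit(f"Redis db belongs to a `{current.decode()}` job; aborting.")
            return func(*args, **kwargs)
        return wrapper
    return decorator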
redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - - if parent_layer == imanager.cg_meta.layer_count: - chunk_coords = [(0, 0, 0)] - else: - bounds = imanager.cg_meta.layer_chunk_bounds[parent_layer] - chunk_coords = randomize_grid_points(*bounds) - - def get_chunks_not_done(coords: list) -> list: - """check for set membership in redis in batches""" - coords_strs = ["_".join(map(str, coord)) for coord in coords] - try: - completed = imanager.redis.smismember(f"{parent_layer}c", coords_strs) - except Exception: - return coords - return [coord for coord, c in zip(coords, completed) if not c] - - batch_size = int(environ.get("JOB_BATCH_SIZE", 10000)) - batches = chunked(chunk_coords, batch_size) - q = imanager.get_task_queue(f"l{parent_layer}") - - for batch in batches: - _coords = get_chunks_not_done(batch) - # buffer for optimal use of redis memory - if len(q) > int(environ.get("QUEUE_SIZE", 100000)): - interval = int(environ.get("QUEUE_INTERVAL", 300)) - sleep(interval) - - job_datas = [] - for chunk_coord in _coords: - job_datas.append( - Queue.prepare_data( - create_parent_chunk, - args=(parent_layer, chunk_coord), - result_ttl=0, - job_id=chunk_id_str(parent_layer, chunk_coord), - timeout=f"{int(parent_layer * parent_layer)}m", - ) - ) - q.enqueue_many(job_datas) + queue_layer_helper(parent_layer, imanager, create_parent_chunk) @ingest_cli.command("status") +@job_type_guard(group_name) def ingest_status(): """Print ingest status to console by layer.""" redis = get_redis_connection() - imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - layers = range(2, imanager.cg_meta.layer_count + 1) - for layer, layer_count in zip(layers, imanager.cg_meta.layer_chunk_counts): - completed = redis.scard(f"{layer}c") - print(f"{layer}\t: {completed} / {layer_count}") + try: + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_status(imanager, redis) + except TypeError as err: + print(f"\nNo current `{group_name}` job found in redis: {err}") @ingest_cli.command("chunk") @click.argument("queue", type=str) @click.argument("chunk_info", nargs=4, type=int) +@job_type_guard(group_name) def ingest_chunk(queue: str, chunk_info): """Manually queue chunk when a job is stuck for whatever reason.""" - from .cluster import _create_atomic_chunk - from .cluster import create_parent_chunk - from .utils import chunk_id_str - redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - layer = chunk_info[0] - coords = chunk_info[1:] - queue = imanager.get_task_queue(queue) + layer, coords = chunk_info[0], chunk_info[1:] + + func = create_parent_chunk + args = (layer, coords) if layer == 2: - func = _create_atomic_chunk + func = create_atomic_chunk args = (coords,) - else: - func = create_parent_chunk - args = (layer, coords) + queue = imanager.get_task_queue(queue) queue.enqueue( func, job_id=chunk_id_str(layer, coords), @@ -182,13 +149,31 @@ def ingest_chunk(queue: str, chunk_info): @click.argument("graph_id", type=str) @click.argument("chunk_info", nargs=4, type=int) @click.option("--n_threads", type=int, default=1) +@job_type_guard(group_name) def ingest_chunk_local(graph_id: str, chunk_info, n_threads: int): """Manually ingest a chunk on a local machine.""" - from .create.abstract_layers import add_layer - from .cluster import _create_atomic_chunk - - if chunk_info[0] == 2: - _create_atomic_chunk(chunk_info[1:]) + layer, coords = chunk_info[0], 
chunk_info[1:] + if layer == 2: + create_atomic_chunk(coords) else: cg = ChunkedGraph(graph_id=graph_id) - add_layer(cg, chunk_info[0], chunk_info[1:], n_threads=n_threads) + add_parent_chunk(cg, layer, coords, n_threads=n_threads) + cg = ChunkedGraph(graph_id=graph_id) + add_parent_chunk(cg, layer, coords, n_threads=n_threads) + + +@ingest_cli.command("rate") +@click.argument("layer", type=int) +@click.option("--span", default=10, help="Time span to calculate rate.") +@job_type_guard(group_name) +def rate(layer: int, span: int): + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_completion_rate(imanager, layer, span=span) + + +@ingest_cli.command("run_tests") +@click.argument("graph_id", type=str) +@job_type_guard(group_name) +def run_tests(graph_id): + run_all(ChunkedGraph(graph_id=graph_id)) diff --git a/pychunkedgraph/ingest/cli_upgrade.py b/pychunkedgraph/ingest/cli_upgrade.py new file mode 100644 index 000000000..84939544b --- /dev/null +++ b/pychunkedgraph/ingest/cli_upgrade.py @@ -0,0 +1,156 @@ +# pylint: disable=invalid-name, missing-function-docstring, unspecified-encoding + +""" +cli for running upgrade +""" + +import logging +from time import sleep + +import click +import tensorstore as ts +from flask.cli import AppGroup +from pychunkedgraph import __version__ +from pychunkedgraph.graph.meta import GraphConfig + +from . import IngestConfig +from .cluster import ( + convert_to_ocdbt, + enqueue_l2_tasks, + upgrade_atomic_chunk, + upgrade_parent_chunk, +) +from .manager import IngestionManager +from .utils import ( + chunk_id_str, + print_completion_rate, + print_status, + queue_layer_helper, + start_ocdbt_server, + job_type_guard, +) +from ..graph.chunkedgraph import ChunkedGraph, ChunkedGraphMeta +from ..utils.redis import get_redis_connection +from ..utils.redis import keys as r_keys + +group_name = "upgrade" +upgrade_cli = AppGroup(group_name) + + +def init_upgrade_cmds(app): + app.cli.add_command(upgrade_cli) + + +@upgrade_cli.command("flush_redis") +@click.confirmation_option(prompt="Are you sure you want to flush redis?") +@job_type_guard(group_name) +def flush_redis(): + """FLush redis db.""" + redis = get_redis_connection() + redis.flushdb() + + +@upgrade_cli.command("graph") +@click.argument("graph_id", type=str) +@click.option("--test", is_flag=True, help="Test 8 chunks at the center of dataset.") +@click.option("--ocdbt", is_flag=True, help="Store edges using ts ocdbt kv store.") +@job_type_guard(group_name) +def upgrade_graph(graph_id: str, test: bool, ocdbt: bool): + """ + Main upgrade command. Queues atomic tasks. + """ + redis = get_redis_connection() + redis.set(r_keys.JOB_TYPE, group_name) + ingest_config = IngestConfig(TEST_RUN=test) + cg = ChunkedGraph(graph_id=graph_id) + cg.client.add_graph_version(__version__, overwrite=True) + + if graph_id != cg.graph_id: + gc = cg.meta.graph_config._asdict() + gc["ID"] = graph_id + new_meta = ChunkedGraphMeta( + GraphConfig(**gc), cg.meta.data_source, cg.meta.custom_data + ) + cg.update_meta(new_meta, overwrite=True) + cg = ChunkedGraph(graph_id=graph_id) + + try: + # create new column family for cross chunk edges + f = cg.client._table.column_family("4") + f.create() + except Exception: + ... 
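# Illustrative only: `start_ocdbt_server` (imported from .utils and called a few lines
# below) is assumed here to publish the coordinator address in redis, since the
# `convert_to_ocdbt` workers read OCDBT_COORDINATOR_HOST/PORT back from redis before
# writing edges. A rough sketch under that assumption; `server.port` is also an assumption.
from socket import gethostname

def start_ocdbt_server_sketch(imanager, server):
    imanager.redis.set("OCDBT_COORDINATOR_HOST", gethostname())
    imanager.redis.set("OCDBT_COORDINATOR_PORT", str(server.port))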
+ + imanager = IngestionManager(ingest_config, cg.meta) + server = ts.ocdbt.DistributedCoordinatorServer() + if ocdbt: + start_ocdbt_server(imanager, server) + + fn = convert_to_ocdbt if ocdbt else upgrade_atomic_chunk + enqueue_l2_tasks(imanager, fn) + + if ocdbt: + logging.info("All tasks queued. Keep this alive for ocdbt coordinator server.") + while True: + sleep(60) + + +@upgrade_cli.command("layer") +@click.argument("parent_layer", type=int) +@job_type_guard(group_name) +def queue_layer(parent_layer): + """ + Queue all chunk tasks at a given layer. + Must be used when all the chunks at `parent_layer - 1` have completed. + """ + assert parent_layer > 2, "This command is for layers 3 and above." + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + queue_layer_helper(parent_layer, imanager, upgrade_parent_chunk) + + +@upgrade_cli.command("status") +@job_type_guard(group_name) +def upgrade_status(): + """Print upgrade status to console.""" + redis = get_redis_connection() + try: + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_status(imanager, redis, upgrade=True) + except TypeError as err: + print(f"\nNo current `{group_name}` job found in redis: {err}") + + +@upgrade_cli.command("chunk") +@click.argument("queue", type=str) +@click.argument("chunk_info", nargs=4, type=int) +@job_type_guard(group_name) +def upgrade_chunk(queue: str, chunk_info): + """Manually queue chunk when a job is stuck for whatever reason.""" + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + layer, coords = chunk_info[0], chunk_info[1:] + + func = upgrade_parent_chunk + args = (layer, coords) + if layer == 2: + func = upgrade_atomic_chunk + args = (coords,) + queue = imanager.get_task_queue(queue) + queue.enqueue( + func, + job_id=chunk_id_str(layer, coords), + job_timeout=f"{int(layer * layer)}m", + result_ttl=0, + args=args, + ) + + +@upgrade_cli.command("rate") +@click.argument("layer", type=int) +@click.option("--span", default=10, help="Time span to calculate rate.") +@job_type_guard(group_name) +def rate(layer: int, span: int): + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + print_completion_rate(imanager, layer, span=span) diff --git a/pychunkedgraph/ingest/cluster.py b/pychunkedgraph/ingest/cluster.py index cf9417024..485251568 100644 --- a/pychunkedgraph/ingest/cluster.py +++ b/pychunkedgraph/ingest/cluster.py @@ -1,105 +1,48 @@ +# pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel + """ -Ingest / create chunkedgraph with workers. +Ingest / create chunkedgraph with workers on a cluster. 
""" -from typing import Sequence, Tuple +import logging +from os import environ +from time import sleep +from typing import Callable, Dict, Iterable, Tuple, Sequence import numpy as np +from rq import Queue as RQueue + -from .utils import chunk_id_str +from .utils import chunk_id_str, get_chunks_not_done, randomize_grid_points from .manager import IngestionManager -from .common import get_atomic_chunk_data -from .ran_agglomeration import get_active_edges -from .create.atomic_layer import add_atomic_edges -from .create.abstract_layers import add_layer -from ..graph.meta import ChunkedGraphMeta +from .ran_agglomeration import ( + get_active_edges, + read_raw_edge_data, + read_raw_agglomeration_data, +) +from .create.atomic_layer import add_atomic_chunk +from .create.parent_layer import add_parent_chunk +from .upgrade.atomic_layer import update_chunk as update_atomic_chunk +from .upgrade.parent_layer import update_chunk as update_parent_chunk +from ..graph.edges import EDGE_TYPES, Edges, put_edges +from ..graph import ChunkedGraph, ChunkedGraphMeta from ..graph.chunks.hierarchy import get_children_chunk_coords -from ..utils.redis import keys as r_keys -from ..utils.redis import get_redis_connection - +from ..graph.utils.basetypes import NODE_ID +from ..io.edges import get_chunk_edges +from ..io.components import get_chunk_components +from ..utils.redis import keys as r_keys, get_redis_connection +from ..utils.general import chunked -def _post_task_completion(imanager: IngestionManager, layer: int, coords: np.ndarray): - from os import environ +def _post_task_completion( + imanager: IngestionManager, + layer: int, + coords: np.ndarray, +): chunk_str = "_".join(map(str, coords)) # mark chunk as completed - "c" imanager.redis.sadd(f"{layer}c", chunk_str) - if environ.get("DO_NOT_AUTOQUEUE_PARENT_CHUNKS", None) is not None: - return - - parent_layer = layer + 1 - if parent_layer > imanager.cg_meta.layer_count: - return - - parent_coords = np.array(coords, int) // imanager.cg_meta.graph_config.FANOUT - parent_id_str = chunk_id_str(parent_layer, parent_coords) - imanager.redis.sadd(parent_id_str, chunk_str) - - parent_chunk_str = "_".join(map(str, parent_coords)) - if not imanager.redis.hget(parent_layer, parent_chunk_str): - # cache children chunk count - # checked by tracker worker to enqueue parent chunk - children_count = len( - get_children_chunk_coords(imanager.cg_meta, parent_layer, parent_coords) - ) - imanager.redis.hset(parent_layer, parent_chunk_str, children_count) - - tracker_queue = imanager.get_task_queue(f"t{layer}") - tracker_queue.enqueue( - enqueue_parent_task, - job_id=f"t{layer}_{chunk_str}", - job_timeout=f"30s", - result_ttl=0, - args=( - parent_layer, - parent_coords, - ), - ) - - -def enqueue_parent_task( - parent_layer: int, - parent_coords: Sequence[int], -): - redis = get_redis_connection() - imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - parent_id_str = chunk_id_str(parent_layer, parent_coords) - parent_chunk_str = "_".join(map(str, parent_coords)) - - children_done = redis.scard(parent_id_str) - # if zero then this key was deleted and parent already queued. 
- if children_done == 0: - print("parent already queued.") - return - - # if the previous layer is complete - # no need to check children progress for each parent chunk - child_layer = parent_layer - 1 - child_layer_done = redis.scard(f"{child_layer}c") - child_layer_count = imanager.cg_meta.layer_chunk_counts[child_layer - 2] - child_layer_finished = child_layer_done == child_layer_count - - if not child_layer_finished: - children_count = int(redis.hget(parent_layer, parent_chunk_str).decode("utf-8")) - if children_done != children_count: - print("children not done.") - return - - queue = imanager.get_task_queue(f"l{parent_layer}") - queue.enqueue( - create_parent_chunk, - job_id=parent_id_str, - job_timeout=f"{int(parent_layer * parent_layer)}m", - result_ttl=0, - args=( - parent_layer, - parent_coords, - ), - ) - redis.hdel(parent_layer, parent_chunk_str) - redis.delete(parent_id_str) - def create_parent_chunk( parent_layer: int, @@ -107,7 +50,7 @@ def create_parent_chunk( ) -> None: redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) - add_layer( + add_parent_chunk( imanager.cg, parent_layer, parent_coords, @@ -120,76 +63,173 @@ def create_parent_chunk( _post_task_completion(imanager, parent_layer, parent_coords) -def randomize_grid_points(X: int, Y: int, Z: int) -> Tuple[int, int, int]: - indices = np.arange(X * Y * Z) - np.random.shuffle(indices) - for index in indices: - yield np.unravel_index(index, (X, Y, Z)) - +def upgrade_parent_chunk( + parent_layer: int, + parent_coords: Sequence[int], +) -> None: + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + update_parent_chunk(imanager.cg, parent_coords, layer=parent_layer) + _post_task_completion(imanager, parent_layer, parent_coords) -def enqueue_atomic_tasks(imanager: IngestionManager): - from os import environ - from time import sleep - from rq import Queue as RQueue - chunk_coords = _get_test_chunks(imanager.cg.meta) - chunk_count = len(chunk_coords) - if not imanager.config.TEST_RUN: - atomic_chunk_bounds = imanager.cg_meta.layer_chunk_bounds[2] - chunk_coords = randomize_grid_points(*atomic_chunk_bounds) - chunk_count = imanager.cg_meta.layer_chunk_counts[0] +def _get_atomic_chunk_data( + imanager: IngestionManager, coord: Sequence[int] +) -> Tuple[Dict, Dict]: + """ + Helper to read either raw data or processed data + If reading from raw data, save it as processed data + """ + chunk_edges = ( + read_raw_edge_data(imanager, coord) + if imanager.config.USE_RAW_EDGES + else get_chunk_edges(imanager.cg_meta.data_source.EDGES, [coord]) + ) - print(f"total chunk count: {chunk_count}, queuing...") - batch_size = int(environ.get("L2JOB_BATCH_SIZE", 1000)) + _check_edges_direction(chunk_edges, imanager.cg, coord) - job_datas = [] - for chunk_coord in chunk_coords: - q = imanager.get_task_queue(imanager.config.CLUSTER.ATOMIC_Q_NAME) - # buffer for optimal use of redis memory - if len(q) > imanager.config.CLUSTER.ATOMIC_Q_LIMIT: - print(f"Sleeping {imanager.config.CLUSTER.ATOMIC_Q_INTERVAL}s...") - sleep(imanager.config.CLUSTER.ATOMIC_Q_INTERVAL) - - x, y, z = chunk_coord - chunk_str = f"{x}_{y}_{z}" - if imanager.redis.sismember("2c", chunk_str): - # already done, skip - continue - job_datas.append( - RQueue.prepare_data( - _create_atomic_chunk, - args=(chunk_coord,), - timeout=environ.get("L2JOB_TIMEOUT", "3m"), - result_ttl=0, - job_id=chunk_id_str(2, chunk_coord), - ) - ) - if len(job_datas) % batch_size == 0: - 
q.enqueue_many(job_datas) - job_datas = [] - q.enqueue_many(job_datas) + mapping = ( + read_raw_agglomeration_data(imanager, coord) + if imanager.config.USE_RAW_COMPONENTS + else get_chunk_components(imanager.cg_meta.data_source.COMPONENTS, coord) + ) + return chunk_edges, mapping -def _create_atomic_chunk(coords: Sequence[int]): +def _check_edges_direction( + chunk_edges: dict, cg: ChunkedGraph, coord: Sequence[int] +) -> None: + """ + For between and cross chunk edges: + Checks and flips edges such that nodes1 are always within a chunk and nodes2 outside the chunk. + Where nodes1 = edges[:,0] and nodes2 = edges[:,1]. + """ + x, y, z = coord + chunk_id = cg.get_chunk_id(layer=1, x=x, y=y, z=z) + for edge_type in [EDGE_TYPES.between_chunk, EDGE_TYPES.cross_chunk]: + edges = chunk_edges[edge_type] + e1 = edges.node_ids1 + e2 = edges.node_ids2 + + e2_chunk_ids = cg.get_chunk_ids_from_node_ids(e2) + mask = e2_chunk_ids == chunk_id + e1[mask], e2[mask] = e2[mask], e1[mask] + + e1_chunk_ids = cg.get_chunk_ids_from_node_ids(e1) + mask = e1_chunk_ids == chunk_id + assert np.all(mask), "all IDs must belong to same chunk" + + +def create_atomic_chunk(coords: Sequence[int]): """Creates single atomic chunk""" redis = get_redis_connection() imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) coords = np.array(list(coords), dtype=int) - chunk_edges_all, mapping = get_atomic_chunk_data(imanager, coords) + + chunk_edges_all, mapping = _get_atomic_chunk_data(imanager, coords) chunk_edges_active, isolated_ids = get_active_edges(chunk_edges_all, mapping) - add_atomic_edges(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) - if imanager.config.TEST_RUN: - # print for debugging - for k, v in chunk_edges_all.items(): - print(k, len(v)) - for k, v in chunk_edges_active.items(): - print(f"active_{k}", len(v)) + add_atomic_chunk(imanager.cg, coords, chunk_edges_active, isolated=isolated_ids) + + for k, v in chunk_edges_all.items(): + logging.debug(f"{k}: {len(v)}") + for k, v in chunk_edges_active.items(): + logging.debug(f"active_{k}: {len(v)}") + _post_task_completion(imanager, 2, coords) + + +def upgrade_atomic_chunk(coords: Sequence[int]): + """Upgrades single atomic chunk""" + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + coords = np.array(list(coords), dtype=int) + update_atomic_chunk(imanager.cg, coords, layer=2) + _post_task_completion(imanager, 2, coords) + + +def convert_to_ocdbt(coords: Sequence[int]): + """ + Convert edges stored per chunk to ajacency list in the tensorstore ocdbt kv store. 
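Workers pick the coordinator address back up from redis and export `OCDBT_COORDINATOR_HOST`/`OCDBT_COORDINATOR_PORT`, as the body below shows. For orientation, a minimal sketch of the tensorstore spec such an OCDBT kv store is opened with (mirroring `start_ocdbt_server` in `ingest/utils.py`); the bucket path and coordinator address are placeholders:

```python
# Sketch only: mirrors the spec used in start_ocdbt_server; path and address are placeholders.
import tensorstore as ts

spec = {
    "driver": "ocdbt",
    "base": "gs://my-bucket/graph/edges/ocdbt",    # assumed: <data_source.EDGES>/ocdbt
    "coordinator": {"address": "localhost:8837"},  # assumed coordinator host:port
}
ts.KvStore.open(spec).result()
```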
+ """ + redis = get_redis_connection() + imanager = IngestionManager.from_pickle(redis.get(r_keys.INGESTION_MANAGER)) + coords = np.array(list(coords), dtype=int) + chunk_edges_all, mapping = _get_atomic_chunk_data(imanager, coords) + + node_ids1 = [] + node_ids2 = [] + affinities = [] + areas = [] + for edges in chunk_edges_all.values(): + node_ids1.extend(edges.node_ids1) + node_ids2.extend(edges.node_ids2) + affinities.extend(edges.affinities) + areas.extend(edges.areas) + + edges = Edges(node_ids1, node_ids2, affinities=affinities, areas=areas) + nodes = np.concatenate( + [edges.node_ids1, edges.node_ids2, np.fromiter(mapping.keys(), dtype=NODE_ID)] + ) + nodes = np.unique(nodes) + + chunk_id = imanager.cg.get_chunk_id(layer=1, x=coords[0], y=coords[1], z=coords[2]) + chunk_ids = imanager.cg.get_chunk_ids_from_node_ids(nodes) + + host = imanager.redis.get("OCDBT_COORDINATOR_HOST").decode() + port = imanager.redis.get("OCDBT_COORDINATOR_PORT").decode() + environ["OCDBT_COORDINATOR_HOST"] = host + environ["OCDBT_COORDINATOR_PORT"] = port + logging.info(f"OCDBT Coordinator address {host}:{port}") + + put_edges( + f"{imanager.cg.meta.data_source.EDGES}/ocdbt", + nodes[chunk_ids == chunk_id], + edges, + ) _post_task_completion(imanager, 2, coords) def _get_test_chunks(meta: ChunkedGraphMeta): - """Chunks at center of the dataset most likely not to be empty""" + """Chunks at the center most likely not to be empty""" parent_coords = np.array(meta.layer_chunk_bounds[3]) // 2 return get_children_chunk_coords(meta, 3, parent_coords) - # f = lambda r1, r2, r3: np.array(np.meshgrid(r1, r2, r3), dtype=int).T.reshape(-1, 3) - # return f((x, x + 1), (y, y + 1), (z, z + 1)) + + +def _queue_tasks(imanager: IngestionManager, chunk_fn: Callable, coords: Iterable): + queue_name = "l2" + q = imanager.get_task_queue(queue_name) + batch_size = int(environ.get("JOB_BATCH_SIZE", 100000)) + batches = chunked(coords, batch_size) + for batch in batches: + _coords = get_chunks_not_done(imanager, 2, batch) + # buffer for optimal use of redis memory + if len(q) > int(environ.get("QUEUE_SIZE", 100000)): + interval = int(environ.get("QUEUE_INTERVAL", 300)) + logging.info(f"Queue full; sleeping {interval}s...") + sleep(interval) + + job_datas = [] + for chunk_coord in _coords: + job_datas.append( + RQueue.prepare_data( + chunk_fn, + args=(chunk_coord,), + timeout=environ.get("L2JOB_TIMEOUT", "3m"), + result_ttl=0, + job_id=chunk_id_str(2, chunk_coord), + ) + ) + q.enqueue_many(job_datas) + + +def enqueue_l2_tasks(imanager: IngestionManager, chunk_fn: Callable): + """ + `chunk_fn`: function to process a given layer 2 chunk. 
+ """ + chunk_coords = _get_test_chunks(imanager.cg.meta) + chunk_count = len(chunk_coords) + if not imanager.config.TEST_RUN: + atomic_chunk_bounds = imanager.cg_meta.layer_chunk_bounds[2] + chunk_coords = randomize_grid_points(*atomic_chunk_bounds) + chunk_count = imanager.cg_meta.layer_chunk_counts[0] + logging.info(f"Chunk count: {chunk_count}, queuing...") + _queue_tasks(imanager, chunk_fn, chunk_coords) diff --git a/pychunkedgraph/ingest/common.py b/pychunkedgraph/ingest/common.py deleted file mode 100644 index dccf58602..000000000 --- a/pychunkedgraph/ingest/common.py +++ /dev/null @@ -1,61 +0,0 @@ -from typing import Dict -from typing import Tuple -from typing import Sequence - -from .manager import IngestionManager -from .ran_agglomeration import read_raw_edge_data -from .ran_agglomeration import read_raw_agglomeration_data -from ..graph import ChunkedGraph -from ..io.edges import get_chunk_edges -from ..io.components import get_chunk_components - - -def get_atomic_chunk_data( - imanager: IngestionManager, coord: Sequence[int] -) -> Tuple[Dict, Dict]: - """ - Helper to read either raw data or processed data - If reading from raw data, save it as processed data - """ - chunk_edges = ( - read_raw_edge_data(imanager, coord) - if imanager.config.USE_RAW_EDGES - else get_chunk_edges(imanager.cg_meta.data_source.EDGES, [coord]) - ) - - _check_edges_direction(chunk_edges, imanager.cg, coord) - - mapping = ( - read_raw_agglomeration_data(imanager, coord) - if imanager.config.USE_RAW_COMPONENTS - else get_chunk_components(imanager.cg_meta.data_source.COMPONENTS, coord) - ) - return chunk_edges, mapping - - -def _check_edges_direction( - chunk_edges: dict, cg: ChunkedGraph, coord: Sequence[int] -) -> None: - """ - For between and cross chunk edges: - Checks and flips edges such that nodes1 are always within a chunk and nodes2 outside the chunk. - Where nodes1 = edges[:,0] and nodes2 = edges[:,1]. 
- """ - import numpy as np - from ..graph.edges import Edges - from ..graph.edges import EDGE_TYPES - - x, y, z = coord - chunk_id = cg.get_chunk_id(layer=1, x=x, y=y, z=z) - for edge_type in [EDGE_TYPES.between_chunk, EDGE_TYPES.cross_chunk]: - edges = chunk_edges[edge_type] - e1 = edges.node_ids1 - e2 = edges.node_ids2 - - e2_chunk_ids = cg.get_chunk_ids_from_node_ids(e2) - mask = e2_chunk_ids == chunk_id - e1[mask], e2[mask] = e2[mask], e1[mask] - - e1_chunk_ids = cg.get_chunk_ids_from_node_ids(e1) - mask = e1_chunk_ids == chunk_id - assert np.all(mask), "all IDs must belong to same chunk" diff --git a/pychunkedgraph/ingest/create/abstract_layers.py b/pychunkedgraph/ingest/create/abstract_layers.py deleted file mode 100644 index 529a6846f..000000000 --- a/pychunkedgraph/ingest/create/abstract_layers.py +++ /dev/null @@ -1,247 +0,0 @@ -""" -Functions for creating parents in level 3 and above -""" - -import time -import math -import datetime -import multiprocessing as mp -from collections import defaultdict -from typing import Optional -from typing import Sequence -from typing import List - -import numpy as np -from multiwrapper import multiprocessing_utils as mu - -from ...graph import types -from ...graph import attributes -from ...utils.general import chunked -from ...graph.utils import flatgraph -from ...graph.utils import basetypes -from ...graph.utils import serializers -from ...graph.chunkedgraph import ChunkedGraph -from ...graph.utils.generic import get_valid_timestamp -from ...graph.utils.generic import filter_failed_node_ids -from ...graph.chunks.hierarchy import get_children_chunk_coords -from ...graph.connectivity.cross_edges import get_children_chunk_cross_edges -from ...graph.connectivity.cross_edges import get_chunk_nodes_cross_edge_layer - - -def add_layer( - cg: ChunkedGraph, - layer_id: int, - parent_coords: Sequence[int], - children_coords: Sequence[Sequence[int]] = np.array([]), - *, - time_stamp: Optional[datetime.datetime] = None, - n_threads: int = 4, -) -> None: - if not children_coords.size: - children_coords = get_children_chunk_coords(cg.meta, layer_id, parent_coords) - children_ids = _read_children_chunks(cg, layer_id, children_coords, n_threads > 1) - edge_ids = get_children_chunk_cross_edges( - cg, layer_id, parent_coords, use_threads=n_threads > 1 - ) - - print("children_coords", children_coords.size, layer_id, parent_coords) - print( - "n e", len(children_ids), len(edge_ids), layer_id, parent_coords, - ) - - node_layers = cg.get_chunk_layers(children_ids) - edge_layers = cg.get_chunk_layers(np.unique(edge_ids)) - assert np.all(node_layers < layer_id), "invalid node layers" - assert np.all(edge_layers < layer_id), "invalid edge layers" - # Extract connected components - # isolated_node_mask = ~np.in1d(children_ids, np.unique(edge_ids)) - # add_node_ids = children_ids[isolated_node_mask].squeeze() - add_edge_ids = np.vstack([children_ids, children_ids]).T - - edge_ids = list(edge_ids) - edge_ids.extend(add_edge_ids) - graph, _, _, graph_ids = flatgraph.build_gt_graph(edge_ids, make_directed=True) - ccs = flatgraph.connected_components(graph) - print("ccs", len(ccs)) - _write_connected_components( - cg, - layer_id, - parent_coords, - ccs, - graph_ids, - get_valid_timestamp(time_stamp), - n_threads > 1, - ) - return f"{layer_id}_{'_'.join(map(str, parent_coords))}" - - -def _read_children_chunks( - cg: ChunkedGraph, layer_id, children_coords, use_threads=True -): - if not use_threads: - children_ids = [types.empty_1d] - for child_coord in children_coords: - 
children_ids.append(_read_chunk([], cg, layer_id - 1, child_coord)) - return np.concatenate(children_ids) - - print("_read_children_chunks") - with mp.Manager() as manager: - children_ids_shared = manager.list() - multi_args = [] - for child_coord in children_coords: - multi_args.append( - ( - children_ids_shared, - cg.get_serialized_info(), - layer_id - 1, - child_coord, - ) - ) - mu.multiprocess_func( - _read_chunk_helper, - multi_args, - n_threads=min(len(multi_args), mp.cpu_count()), - ) - print("_read_children_chunks done") - return np.concatenate(children_ids_shared) - - -def _read_chunk_helper(args): - children_ids_shared, cg_info, layer_id, chunk_coord = args - cg = ChunkedGraph(**cg_info) - _read_chunk(children_ids_shared, cg, layer_id, chunk_coord) - - -def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coord): - print(f"_read_chunk {layer_id}, {chunk_coord}") - x, y, z = chunk_coord - range_read = cg.range_read_chunk( - cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z), - properties=attributes.Hierarchy.Child, - ) - row_ids = [] - max_children_ids = [] - for row_id, row_data in range_read.items(): - row_ids.append(row_id) - max_children_ids.append(np.max(row_data[0].value)) - row_ids = np.array(row_ids, dtype=basetypes.NODE_ID) - segment_ids = np.array([cg.get_segment_id(r_id) for r_id in row_ids]) - - row_ids = filter_failed_node_ids(row_ids, segment_ids, max_children_ids) - children_ids_shared.append(row_ids) - print(f"_read_chunk {layer_id}, {chunk_coord} done {len(row_ids)}") - return row_ids - - -def _write_connected_components( - cg: ChunkedGraph, - layer_id: int, - parent_coords, - ccs, - graph_ids, - time_stamp, - use_threads=True, -) -> None: - if not ccs: - return - - node_layer_d_shared = {} - if layer_id < cg.meta.layer_count: - print("getting node_layer_d_shared") - node_layer_d_shared = get_chunk_nodes_cross_edge_layer( - cg, layer_id, parent_coords, use_threads=use_threads - ) - - print("node_layer_d_shared", len(node_layer_d_shared)) - - ccs_with_node_ids = [] - for cc in ccs: - ccs_with_node_ids.append(graph_ids[cc]) - - if not use_threads: - _write( - cg, - layer_id, - parent_coords, - ccs_with_node_ids, - node_layer_d_shared, - time_stamp, - use_threads=use_threads, - ) - return - - task_size = int(math.ceil(len(ccs_with_node_ids) / mp.cpu_count() / 10)) - chunked_ccs = chunked(ccs_with_node_ids, task_size) - cg_info = cg.get_serialized_info() - multi_args = [] - for ccs in chunked_ccs: - multi_args.append( - (cg_info, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp) - ) - mu.multiprocess_func( - _write_components_helper, - multi_args, - n_threads=min(len(multi_args), mp.cpu_count()), - ) - - -def _write_components_helper(args): - print("running _write_components_helper") - cg_info, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp = args - cg = ChunkedGraph(**cg_info) - _write(cg, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp) - - -def _write( - cg, layer_id, parent_coords, ccs, node_layer_d_shared, time_stamp, use_threads=True -): - parent_layer_ids = range(layer_id, cg.meta.layer_count + 1) - cc_connections = {l: [] for l in parent_layer_ids} - for node_ids in ccs: - layer = layer_id - if len(node_ids) == 1: - layer = node_layer_d_shared.get(node_ids[0], cg.meta.layer_count) - cc_connections[layer].append(node_ids) - - rows = [] - x, y, z = parent_coords - parent_chunk_id = cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z) - parent_chunk_id_dict = cg.get_parent_chunk_id_dict(parent_chunk_id) - - # 
Iterate through layers - for parent_layer_id in parent_layer_ids: - if len(cc_connections[parent_layer_id]) == 0: - continue - - parent_chunk_id = parent_chunk_id_dict[parent_layer_id] - reserved_parent_ids = cg.id_client.create_node_ids( - parent_chunk_id, - size=len(cc_connections[parent_layer_id]), - root_chunk=parent_layer_id == cg.meta.layer_count and use_threads, - ) - - for i_cc, node_ids in enumerate(cc_connections[parent_layer_id]): - parent_id = reserved_parent_ids[i_cc] - for node_id in node_ids: - rows.append( - cg.client.mutate_row( - serializers.serialize_uint64(node_id), - {attributes.Hierarchy.Parent: parent_id}, - time_stamp=time_stamp, - ) - ) - - rows.append( - cg.client.mutate_row( - serializers.serialize_uint64(parent_id), - {attributes.Hierarchy.Child: node_ids}, - time_stamp=time_stamp, - ) - ) - - if len(rows) > 100000: - cg.client.write(rows) - print("wrote rows", len(rows), layer_id, parent_coords) - rows = [] - cg.client.write(rows) - print("wrote rows", len(rows), layer_id, parent_coords) diff --git a/pychunkedgraph/ingest/create/atomic_layer.py b/pychunkedgraph/ingest/create/atomic_layer.py index 4fa1f1688..0a7aae728 100644 --- a/pychunkedgraph/ingest/create/atomic_layer.py +++ b/pychunkedgraph/ingest/create/atomic_layer.py @@ -1,14 +1,14 @@ +# pylint: disable=invalid-name, missing-function-docstring, import-outside-toplevel + """ Functions for creating atomic nodes and their level 2 abstract parents """ import datetime from typing import Dict -from typing import List from typing import Optional from typing import Sequence -import pytz import numpy as np from ...graph import attributes @@ -23,9 +23,9 @@ from ...graph.utils.flatgraph import connected_components -def add_atomic_edges( +def add_atomic_chunk( cg: ChunkedGraph, - chunk_coord: np.ndarray, + coords: Sequence[int], chunk_edges_d: Dict[str, Edges], isolated: Sequence[int], time_stamp: Optional[datetime.datetime] = None, @@ -40,9 +40,7 @@ def add_atomic_edges( graph, _, _, unique_ids = build_gt_graph(chunk_edge_ids, make_directed=True) ccs = connected_components(graph) - parent_chunk_id = cg.get_chunk_id( - layer=2, x=chunk_coord[0], y=chunk_coord[1], z=chunk_coord[2] - ) + parent_chunk_id = cg.get_chunk_id(layer=2, x=coords[0], y=coords[1], z=coords[2]) parent_ids = cg.id_client.create_node_ids(parent_chunk_id, size=len(ccs)) sparse_indices, remapping = _get_remapping(chunk_edges_d) @@ -101,7 +99,13 @@ def _get_remapping(chunk_edges_d: dict): def _process_component( - cg, chunk_edges_d, parent_id, node_ids, sparse_indices, remapping, time_stamp, + cg, + chunk_edges_d, + parent_id, + node_ids, + sparse_indices, + remapping, + time_stamp, ): nodes = [] chunk_out_edges = [] # out = between + cross @@ -120,7 +124,7 @@ def _process_component( for cc_layer in u_cce_layers: layer_out_edges = chunk_out_edges[cce_layers == cc_layer] if layer_out_edges.size: - col = attributes.Connectivity.CrossChunkEdge[cc_layer] + col = attributes.Connectivity.AtomicCrossChunkEdge[cc_layer] val_dict[col] = layer_out_edges r_key = serializers.serialize_uint64(parent_id) diff --git a/pychunkedgraph/graph/connectivity/cross_edges.py b/pychunkedgraph/ingest/create/cross_edges.py similarity index 61% rename from pychunkedgraph/graph/connectivity/cross_edges.py rename to pychunkedgraph/ingest/create/cross_edges.py index 8aa52a9f1..9581838af 100644 --- a/pychunkedgraph/graph/connectivity/cross_edges.py +++ b/pychunkedgraph/ingest/create/cross_edges.py @@ -1,43 +1,38 @@ -import time +# pylint: disable=invalid-name, missing-docstring + 
import math import multiprocessing as mp from collections import defaultdict -from typing import Optional from typing import Sequence -from typing import List from typing import Dict import numpy as np from multiwrapper.multiprocessing_utils import multiprocess_func -from .. import attributes -from ..types import empty_2d -from ..utils import basetypes -from ..utils import serializers -from ..chunkedgraph import ChunkedGraph -from ..utils.generic import get_valid_timestamp -from ..utils.generic import filter_failed_node_ids -from ..chunks.atomic import get_touching_atomic_chunks -from ..chunks.atomic import get_bounding_atomic_chunks +from ...graph import attributes +from ...graph.types import empty_2d +from ...graph.utils import basetypes +from ...graph.chunkedgraph import ChunkedGraph +from ...graph.utils.generic import filter_failed_node_ids +from ...graph.chunks.atomic import get_touching_atomic_chunks +from ...graph.chunks.atomic import get_bounding_atomic_chunks from ...utils.general import chunked def get_children_chunk_cross_edges( - cg, layer, chunk_coord, *, use_threads=True + cg: ChunkedGraph, layer, chunk_coord, *, use_threads=True ) -> np.ndarray: """ Cross edges that connect children chunks. - The edges are between node IDs in the given layer (not atomic). + The edges are between node IDs in the given layer. """ atomic_chunks = get_touching_atomic_chunks(cg.meta, layer, chunk_coord) - if not len(atomic_chunks): + if len(atomic_chunks) == 0: return [] - print(f"touching atomic chunk count {len(atomic_chunks)}") if not use_threads: return _get_children_chunk_cross_edges(cg, atomic_chunks, layer - 1) - print("get_children_chunk_cross_edges, atomic chunks", len(atomic_chunks)) with mp.Manager() as manager: edge_ids_shared = manager.list() edge_ids_shared.append(empty_2d) @@ -68,10 +63,12 @@ def _get_children_chunk_cross_edges_helper(args) -> None: edge_ids_shared.append(_get_children_chunk_cross_edges(cg, atomic_chunks, layer)) -def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: - print( - f"_get_children_chunk_cross_edges {layer} atomic_chunks count {len(atomic_chunks)}" - ) +def _get_children_chunk_cross_edges(cg: ChunkedGraph, atomic_chunks, layer) -> np.ndarray: + """ + Non parallelized version + Cross edges that connect children chunks. + The edges are between node IDs in the given layer (not atomic). + """ cross_edges = [empty_2d] for layer2_chunk in atomic_chunks: edges = _read_atomic_chunk_cross_edges(cg, layer2_chunk, layer) @@ -80,18 +77,21 @@ def _get_children_chunk_cross_edges(cg, atomic_chunks, layer) -> None: cross_edges = np.concatenate(cross_edges) if not cross_edges.size: return empty_2d - print(f"getting roots at stop_layer {layer} {cross_edges.shape}") + cross_edges[:, 0] = cg.get_roots(cross_edges[:, 0], stop_layer=layer, ceil=False) cross_edges[:, 1] = cg.get_roots(cross_edges[:, 1], stop_layer=layer, ceil=False) result = np.unique(cross_edges, axis=0) if cross_edges.size else empty_2d - print(f"_get_children_chunk_cross_edges done {result.shape}") return result def _read_atomic_chunk_cross_edges( - cg, chunk_coord: Sequence[int], cross_edge_layer: int + cg: ChunkedGraph, chunk_coord: Sequence[int], cross_edge_layer: int ) -> np.ndarray: - cross_edge_col = attributes.Connectivity.CrossChunkEdge[cross_edge_layer] + """ + Returns cross edges between l2 nodes in current chunk and + l1 supervoxels from neighbor chunks. 
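The return value pairs each L2 ID with the neighboring supervoxels recorded in its `AtomicCrossChunkEdge` columns; a small illustration of how those pairs are assembled (all IDs are made up):

```python
# Illustration only; ids are made up.
import numpy as np

l2id = np.uint64(1001)
nebor_svs = np.array([2001, 2002, 2003], dtype=np.uint64)  # supervoxels in neighboring chunks
pairs = np.vstack([np.full(len(nebor_svs), l2id, dtype=np.uint64), nebor_svs]).T
# pairs -> [[1001, 2001], [1001, 2002], [1001, 2003]], i.e. shape (N, 2)
```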
+ """ + cross_edge_col = attributes.Connectivity.AtomicCrossChunkEdge[cross_edge_layer] range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, [cross_edge_layer]) parent_neighboring_chunk_supervoxels_d = defaultdict(list) @@ -102,8 +102,7 @@ def _read_atomic_chunk_cross_edges( parent_neighboring_chunk_supervoxels_d[l2id] = edges[:, 1] cross_edges = [empty_2d] - for l2id in parent_neighboring_chunk_supervoxels_d: - nebor_svs = parent_neighboring_chunk_supervoxels_d[l2id] + for l2id, nebor_svs in parent_neighboring_chunk_supervoxels_d.items(): chunk_parent_ids = np.array([l2id] * len(nebor_svs), dtype=basetypes.NODE_ID) cross_edges.append(np.vstack([chunk_parent_ids, nebor_svs]).T) cross_edges = np.concatenate(cross_edges) @@ -111,35 +110,31 @@ def _read_atomic_chunk_cross_edges( def get_chunk_nodes_cross_edge_layer( - cg, layer: int, chunk_coord: Sequence[int], use_threads=True + cg: ChunkedGraph, layer: int, chunk_coord: Sequence[int], use_threads=True ) -> Dict: """ gets nodes in a chunk that are part of cross chunk edges return_type dict {node_id: layer} the lowest layer (>= current layer) at which a node_id is part of a cross edge """ - print("get_bounding_atomic_chunks") atomic_chunks = get_bounding_atomic_chunks(cg.meta, layer, chunk_coord) - print("get_bounding_atomic_chunks complete") - if not len(atomic_chunks): + if len(atomic_chunks) == 0: return {} if not use_threads: return _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer) - print("divide tasks") cg_info = cg.get_serialized_info() manager = mp.Manager() - ids_l_shared = manager.list() - layers_l_shared = manager.list() + node_ids_shared = manager.list() + node_layers_shared = manager.list() task_size = int(math.ceil(len(atomic_chunks) / mp.cpu_count() / 10)) chunked_l2chunk_list = chunked(atomic_chunks, task_size) multi_args = [] for atomic_chunks in chunked_l2chunk_list: multi_args.append( - (ids_l_shared, layers_l_shared, cg_info, atomic_chunks, layer) + (node_ids_shared, node_layers_shared, cg_info, atomic_chunks, layer) ) - print("divide tasks complete") multiprocess_func( _get_chunk_nodes_cross_edge_layer_helper, @@ -148,24 +143,29 @@ def get_chunk_nodes_cross_edge_layer( ) node_layer_d_shared = manager.dict() - _find_min_layer(node_layer_d_shared, ids_l_shared, layers_l_shared) - print("_find_min_layer complete") + _find_min_layer(node_layer_d_shared, node_ids_shared, node_layers_shared) return node_layer_d_shared def _get_chunk_nodes_cross_edge_layer_helper(args): - ids_l_shared, layers_l_shared, cg_info, atomic_chunks, layer = args + node_ids_shared, node_layers_shared, cg_info, atomic_chunks, layer = args cg = ChunkedGraph(**cg_info) node_layer_d = _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer) - ids_l_shared.append(np.fromiter(node_layer_d.keys(), dtype=basetypes.NODE_ID)) - layers_l_shared.append(np.fromiter(node_layer_d.values(), dtype=np.uint8)) + node_ids_shared.append(np.fromiter(node_layer_d.keys(), dtype=basetypes.NODE_ID)) + node_layers_shared.append(np.fromiter(node_layer_d.values(), dtype=np.uint8)) -def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): +def _get_chunk_nodes_cross_edge_layer(cg: ChunkedGraph, atomic_chunks, layer): + """ + Non parallelized version + gets nodes in a chunk that are part of cross chunk edges + return_type dict {node_id: layer} + the lowest layer (>= current layer) at which a node_id is part of a cross edge + """ atomic_node_layer_d = {} for atomic_chunk in atomic_chunks: chunk_node_layer_d = _read_atomic_chunk_cross_edge_nodes( - cg, 
atomic_chunk, range(layer, cg.meta.layer_count + 1) + cg, atomic_chunk, layer ) atomic_node_layer_d.update(chunk_node_layer_d) @@ -179,32 +179,57 @@ def _get_chunk_nodes_cross_edge_layer(cg, atomic_chunks, layer): return node_layer_d -def _read_atomic_chunk_cross_edge_nodes(cg, chunk_coord, cross_edge_layers): +def _read_atomic_chunk_cross_edge_nodes(cg: ChunkedGraph, chunk_coord, layer): + """ + the lowest layer at which an l2 node is part of a cross edge + """ node_layer_d = {} - range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, cross_edge_layers) + relevant_layers = range(layer, cg.meta.layer_count) + range_read, l2ids = _read_atomic_chunk(cg, chunk_coord, relevant_layers) for l2id in l2ids: - for layer in cross_edge_layers: - if attributes.Connectivity.CrossChunkEdge[layer] in range_read[l2id]: + for layer in relevant_layers: + if attributes.Connectivity.AtomicCrossChunkEdge[layer] in range_read[l2id]: node_layer_d[l2id] = layer break return node_layer_d -def _find_min_layer(node_layer_d_shared, ids_l_shared, layers_l_shared): - node_ids = np.concatenate(ids_l_shared) - layers = np.concatenate(layers_l_shared) +def _find_min_layer(node_layer_d_shared, node_ids_shared, node_layers_shared): + """ + `node_layer_d_shared`: DictProxy + + `node_ids_shared`: ListProxy + + `node_layers_shared`: ListProxy + + Due to parallelization, there will be multiple values for min_layer of a node. + We need to find the global min_layer after all multiprocesses return. + For eg: + At some indices p and q, there will be a node_id x + i.e. `node_ids_shared[p] == node_ids_shared[q]` + + and node_layers_shared[p] != node_layers_shared[q] + so we need: + `node_layer_d_shared[x] = min(node_layers_shared[p], node_layers_shared[q])` + """ + node_ids = np.concatenate(node_ids_shared) + layers = np.concatenate(node_layers_shared) for i, node_id in enumerate(node_ids): layer = node_layer_d_shared.get(node_id, layers[i]) node_layer_d_shared[node_id] = min(layer, layers[i]) -def _read_atomic_chunk(cg, chunk_coord, layers): +def _read_atomic_chunk(cg: ChunkedGraph, chunk_coord, layers): + """ + read entire atomic chunk; all nodes and their relevant cross edges + filter out invalid nodes generated by failed tasks + """ x, y, z = chunk_coord child_col = attributes.Hierarchy.Child range_read = cg.range_read_chunk( cg.get_chunk_id(layer=2, x=x, y=y, z=z), properties=[child_col] - + [attributes.Connectivity.CrossChunkEdge[l] for l in layers], + + [attributes.Connectivity.AtomicCrossChunkEdge[l] for l in layers], ) row_ids = [] diff --git a/pychunkedgraph/ingest/create/parent_layer.py b/pychunkedgraph/ingest/create/parent_layer.py new file mode 100644 index 000000000..90b24d26a --- /dev/null +++ b/pychunkedgraph/ingest/create/parent_layer.py @@ -0,0 +1,247 @@ +# pylint: disable=invalid-name, missing-docstring, import-outside-toplevel, c-extension-no-member + +""" +Functions for creating parents in level 3 and above +""" + +import math +import datetime +import multiprocessing as mp +from typing import Optional +from typing import Sequence + +import fastremap +import numpy as np +from multiwrapper import multiprocessing_utils as mu + +from ...graph import types +from ...graph import attributes +from ...utils.general import chunked +from ...graph.utils import flatgraph +from ...graph.utils import basetypes +from ...graph.utils import serializers +from ...graph.chunkedgraph import ChunkedGraph +from ...graph.edges.utils import concatenate_cross_edge_dicts +from ...graph.utils.generic import get_valid_timestamp +from 
...graph.utils.generic import filter_failed_node_ids +from ...graph.chunks.hierarchy import get_children_chunk_coords +from .cross_edges import get_children_chunk_cross_edges +from .cross_edges import get_chunk_nodes_cross_edge_layer + + +def add_parent_chunk( + cg: ChunkedGraph, + layer_id: int, + coords: Sequence[int], + children_coords: Sequence[Sequence[int]] = np.array([]), + *, + time_stamp: Optional[datetime.datetime] = None, + n_threads: int = 4, +) -> None: + if not children_coords.size: + children_coords = get_children_chunk_coords(cg.meta, layer_id, coords) + children_ids = _read_children_chunks(cg, layer_id, children_coords, n_threads > 1) + cx_edges = get_children_chunk_cross_edges( + cg, layer_id, coords, use_threads=n_threads > 1 + ) + + node_layers = cg.get_chunk_layers(children_ids) + edge_layers = cg.get_chunk_layers(np.unique(cx_edges)) + assert np.all(node_layers < layer_id), "invalid node layers" + assert np.all(edge_layers < layer_id), "invalid edge layers" + + cx_edges = list(cx_edges) + cx_edges.extend(np.vstack([children_ids, children_ids]).T) # add self-edges + graph, _, _, graph_ids = flatgraph.build_gt_graph(cx_edges, make_directed=True) + raw_ccs = flatgraph.connected_components(graph) # connected components with indices + connected_components = [graph_ids[cc] for cc in raw_ccs] + + _write_connected_components( + cg, + layer_id, + coords, + connected_components, + get_valid_timestamp(time_stamp), + n_threads > 1, + ) + + +def _read_children_chunks( + cg: ChunkedGraph, layer_id, children_coords, use_threads=True +): + if not use_threads: + children_ids = [types.empty_1d] + for child_coord in children_coords: + children_ids.append(_read_chunk([], cg, layer_id - 1, child_coord)) + return np.concatenate(children_ids) + + with mp.Manager() as manager: + children_ids_shared = manager.list() + multi_args = [] + for child_coord in children_coords: + multi_args.append( + ( + children_ids_shared, + cg.get_serialized_info(), + layer_id - 1, + child_coord, + ) + ) + mu.multiprocess_func( + _read_chunk_helper, + multi_args, + n_threads=min(len(multi_args), mp.cpu_count()), + ) + return np.concatenate(children_ids_shared) + + +def _read_chunk_helper(args): + children_ids_shared, cg_info, layer_id, chunk_coord = args + cg = ChunkedGraph(**cg_info) + _read_chunk(children_ids_shared, cg, layer_id, chunk_coord) + + +def _read_chunk(children_ids_shared, cg: ChunkedGraph, layer_id: int, chunk_coord): + x, y, z = chunk_coord + range_read = cg.range_read_chunk( + cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z), + properties=attributes.Hierarchy.Child, + ) + row_ids = [] + max_children_ids = [] + for row_id, row_data in range_read.items(): + row_ids.append(row_id) + max_children_ids.append(np.max(row_data[0].value)) + row_ids = np.array(row_ids, dtype=basetypes.NODE_ID) + segment_ids = np.array([cg.get_segment_id(r_id) for r_id in row_ids]) + + row_ids = filter_failed_node_ids(row_ids, segment_ids, max_children_ids) + children_ids_shared.append(row_ids) + return row_ids + + +def _write_connected_components( + cg, layer, pcoords, components, time_stamp, use_threads=True +): + if len(components) == 0: + return + + node_layer_d = {} + if layer < cg.meta.layer_count: + node_layer_d = get_chunk_nodes_cross_edge_layer(cg, layer, pcoords, use_threads) + + if not use_threads: + _write(cg, layer, pcoords, components, node_layer_d, time_stamp, use_threads) + return + + task_size = int(math.ceil(len(components) / mp.cpu_count() / 10)) + chunked_ccs = chunked(components, task_size) + cg_info = 
cg.get_serialized_info() + multi_args = [] + for ccs in chunked_ccs: + args = (cg_info, layer, pcoords, ccs, node_layer_d, time_stamp) + multi_args.append(args) + mu.multiprocess_func( + _write_components_helper, + multi_args, + n_threads=min(len(multi_args), mp.cpu_count()), + ) + + +def _write_components_helper(args): + cg_info, layer, pcoords, ccs, node_layer_d, time_stamp = args + cg = ChunkedGraph(**cg_info) + _write(cg, layer, pcoords, ccs, node_layer_d, time_stamp) + + +def _children_rows( + cg: ChunkedGraph, parent_id, children: Sequence, cx_edges_d: dict, time_stamp +): + """ + Update children rows to point to the parent_id, collect cached children + cross chunk edges to lift and update parent cross chunk edges. + Returns list of mutations to children and list of children cross edges. + """ + rows = [] + children_cx_edges = [] + children_layers = cg.get_chunk_layers(children) + for child, node_layer in zip(children, children_layers): + node_layer = cg.get_chunk_layer(child) + row_id = serializers.serialize_uint64(child) + val_dict = {attributes.Hierarchy.Parent: parent_id} + node_cx_edges_d = cx_edges_d.get(child, {}) + if not node_cx_edges_d: + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + continue + for layer in range(node_layer, cg.meta.layer_count): + if not layer in node_cx_edges_d: + continue + layer_edges = node_cx_edges_d[layer] + nodes = np.unique(layer_edges) + parents = cg.get_roots(nodes, stop_layer=node_layer, ceil=False) + edge_parents_d = dict(zip(nodes, parents)) + layer_edges = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + layer_edges = np.unique(layer_edges, axis=0) + col = attributes.Connectivity.CrossChunkEdge[layer] + val_dict[col] = layer_edges + node_cx_edges_d[layer] = layer_edges + children_cx_edges.append(node_cx_edges_d) + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp)) + return rows, children_cx_edges + + +def _write( + cg: ChunkedGraph, + layer_id, + parent_coords, + components, + node_layer_d, + ts, + use_threads=True, +): + parent_layers = range(layer_id, cg.meta.layer_count + 1) + cc_connections = {l: [] for l in parent_layers} + for node_ids in components: + layer = layer_id + if len(node_ids) == 1: + layer = node_layer_d.get(node_ids[0], cg.meta.layer_count) + cc_connections[layer].append(node_ids) + + rows = [] + x, y, z = parent_coords + parent_chunk_id = cg.get_chunk_id(layer=layer_id, x=x, y=y, z=z) + parent_chunk_id_dict = cg.get_parent_chunk_id_dict(parent_chunk_id) + for parent_layer in parent_layers: + if len(cc_connections[parent_layer]) == 0: + continue + parent_chunk_id = parent_chunk_id_dict[parent_layer] + reserved_parent_ids = cg.id_client.create_node_ids( + parent_chunk_id, + size=len(cc_connections[parent_layer]), + root_chunk=parent_layer == cg.meta.layer_count and use_threads, + ) + for i_cc, children in enumerate(cc_connections[parent_layer]): + parent = reserved_parent_ids[i_cc] + if layer_id == 3: + # when layer 3 is being processed, children chunks are at layer 2 + # layer 2 chunks at this time will only have atomic cross edges + cx_edges_d = cg.get_atomic_cross_edges(children) + else: + cx_edges_d = cg.get_cross_chunk_edges(children, raw_only=True) + _rows, cx_edges = _children_rows(cg, parent, children, cx_edges_d, ts) + rows.extend(_rows) + row_id = serializers.serialize_uint64(parent) + val_dict = {attributes.Hierarchy.Child: children} + parent_cx_edges_d = concatenate_cross_edge_dicts(cx_edges, unique=True) + for layer in range(parent_layer, 
cg.meta.layer_count): + if not layer in parent_cx_edges_d: + continue + col = attributes.Connectivity.CrossChunkEdge[layer] + val_dict[col] = parent_cx_edges_d[layer] + rows.append(cg.client.mutate_row(row_id, val_dict, ts)) + if len(rows) > 100000: + cg.client.write(rows) + rows = [] + cg.client.write(rows) diff --git a/pychunkedgraph/ingest/manager.py b/pychunkedgraph/ingest/manager.py index f5f870810..55e7d253f 100644 --- a/pychunkedgraph/ingest/manager.py +++ b/pychunkedgraph/ingest/manager.py @@ -1,3 +1,5 @@ +# pylint: disable=invalid-name, missing-docstring + import pickle from . import IngestConfig @@ -15,7 +17,9 @@ def __init__(self, config: IngestConfig, chunkedgraph_meta: ChunkedGraphMeta): self._cg = None self._redis = None self._task_queues = {} - self.redis # initiate and cache info + + # initiate redis and cache info + self.redis # pylint: disable=pointless-statement @property def config(self): diff --git a/pychunkedgraph/ingest/ran_agglomeration.py b/pychunkedgraph/ingest/ran_agglomeration.py index 7c4af51f7..a0ca42d54 100644 --- a/pychunkedgraph/ingest/ran_agglomeration.py +++ b/pychunkedgraph/ingest/ran_agglomeration.py @@ -5,10 +5,7 @@ from collections import defaultdict from itertools import product -from typing import Dict -from typing import Iterable -from typing import Tuple -from typing import Union +from typing import Dict, Iterable, Tuple, Union from binascii import crc32 @@ -23,8 +20,7 @@ from ..io.edges import put_chunk_edges from ..io.components import put_chunk_components from ..graph.utils import basetypes -from ..graph.edges import Edges -from ..graph.edges import EDGE_TYPES +from ..graph.edges import EDGE_TYPES, Edges from ..graph.types import empty_2d from ..graph.chunks.utils import get_chunk_id diff --git a/pychunkedgraph/ingest/rq_cli.py b/pychunkedgraph/ingest/rq_cli.py index 27b9c865d..6a1a4882d 100644 --- a/pychunkedgraph/ingest/rq_cli.py +++ b/pychunkedgraph/ingest/rq_cli.py @@ -1,20 +1,18 @@ +# pylint: disable=invalid-name, missing-function-docstring + """ cli for redis jobs """ -import os import sys import click from redis import Redis from rq import Queue -from rq import Worker -from rq.worker import WorkerStatus from rq.job import Job from rq.exceptions import InvalidJobOperationError from rq.exceptions import NoSuchJobError from rq.registry import StartedJobRegistry from rq.registry import FailedJobRegistry -from flask import current_app from flask.cli import AppGroup from ..utils.redis import REDIS_HOST @@ -27,23 +25,6 @@ connection = Redis(host=REDIS_HOST, port=REDIS_PORT, db=0, password=REDIS_PASSWORD) -@rq_cli.command("status") -@click.argument("queues", nargs=-1, type=str) -@click.option("--show-busy", is_flag=True) -def get_status(queues, show_busy): - print("NOTE: Use --show-busy to display count of non idle workers\n") - for queue in queues: - q = Queue(queue, connection=connection) - print(f"Queue name \t: {queue}") - print(f"Jobs queued \t: {len(q)}") - print(f"Workers total \t: {Worker.count(queue=q)}") - if show_busy: - workers = Worker.all(queue=q) - count = sum([worker.get_state() == WorkerStatus.BUSY for worker in workers]) - print(f"Workers busy \t: {count}") - print(f"Jobs failed \t: {q.failed_job_registry.count}\n") - - @rq_cli.command("failed") @click.argument("queue", type=str) @click.argument("job_ids", nargs=-1) @@ -129,9 +110,14 @@ def clean_start_registry(queue): def clear_failed_registry(queue): failed_job_registry = FailedJobRegistry(queue, connection=connection) job_ids = failed_job_registry.get_job_ids() + count = 0 
for job_id in job_ids: - failed_job_registry.remove(job_id, delete_job=True) - print(f"Deleted {len(job_ids)} jobs from the failed job registry.") + try: + failed_job_registry.remove(job_id, delete_job=True) + count += 1 + except Exception: + ... + print(f"Deleted {count} jobs from the failed job registry.") def init_rq_cmds(app): diff --git a/pychunkedgraph/ingest/simple_tests.py b/pychunkedgraph/ingest/simple_tests.py new file mode 100644 index 000000000..07a60f5f3 --- /dev/null +++ b/pychunkedgraph/ingest/simple_tests.py @@ -0,0 +1,177 @@ +# pylint: disable=invalid-name, missing-function-docstring, broad-exception-caught + +""" +Some sanity tests to ensure chunkedgraph was created properly. +""" + +from datetime import datetime +import numpy as np + +from pychunkedgraph.graph import attributes, ChunkedGraph + + +def family(cg: ChunkedGraph): + np.random.seed(42) + n_chunks = 100 + n_segments_per_chunk = 200 + timestamp = datetime.utcnow() + + node_ids = [] + for layer in range(2, cg.meta.layer_count - 1): + for _ in range(n_chunks): + c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) + c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) + c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) + chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) + max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) + if max_segment_id < 10: + continue + + segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) + for segment_id in segment_ids: + node_ids.append( + cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id)) + ) + + rows = cg.client.read_nodes( + node_ids=node_ids, end_time=timestamp, properties=attributes.Hierarchy.Parent + ) + valid_node_ids = [] + non_valid_node_ids = [] + for k in rows.keys(): + if len(rows[k]) > 0: + valid_node_ids.append(k) + else: + non_valid_node_ids.append(k) + + parents = cg.get_parents(valid_node_ids, time_stamp=timestamp) + children_dict = cg.get_children(parents) + for child, parent in zip(valid_node_ids, parents): + assert child in children_dict[parent] + print("success") + + +def existence(cg: ChunkedGraph): + np.random.seed(42) + layer = 2 + n_chunks = 100 + n_segments_per_chunk = 200 + timestamp = datetime.utcnow() + node_ids = [] + for _ in range(n_chunks): + c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) + c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) + c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) + chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) + max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) + if max_segment_id < 10: + continue + + segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) + for segment_id in segment_ids: + node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) + + rows = cg.client.read_nodes( + node_ids=node_ids, end_time=timestamp, properties=attributes.Hierarchy.Parent + ) + valid_node_ids = [] + non_valid_node_ids = [] + for k in rows.keys(): + if len(rows[k]) > 0: + valid_node_ids.append(k) + else: + non_valid_node_ids.append(k) + + roots = [] + try: + roots = cg.get_roots(valid_node_ids) + assert len(roots) == len(valid_node_ids) + print("success") + except Exception as e: + print(f"Something went wrong: {e}") + print("At least one node failed. 
Checking nodes one by one:") + + if len(roots) != len(valid_node_ids): + log_dict = {} + success_dict = {} + for node_id in valid_node_ids: + try: + _ = cg.get_root(node_id, time_stamp=timestamp) + print(f"Success: {node_id} from chunk {cg.get_chunk_id(node_id)}") + success_dict[node_id] = True + except Exception as e: + print(f"{node_id} - chunk {cg.get_chunk_id(node_id)} failed: {e}") + success_dict[node_id] = False + t_id = node_id + while t_id is not None: + last_working_chunk = cg.get_chunk_id(t_id) + t_id = cg.get_parent(t_id) + + layer = cg.get_chunk_layer(last_working_chunk) + print(f"Failed on layer {layer} in chunk {last_working_chunk}") + log_dict[node_id] = last_working_chunk + + +def cross_edges(cg: ChunkedGraph): + np.random.seed(42) + layer = 2 + n_chunks = 10 + n_segments_per_chunk = 200 + timestamp = datetime.utcnow() + node_ids = [] + for _ in range(n_chunks): + c_x = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][0]) + c_y = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][1]) + c_z = np.random.randint(0, cg.meta.layer_chunk_bounds[layer][2]) + chunk_id = cg.get_chunk_id(layer=layer, x=c_x, y=c_y, z=c_z) + max_segment_id = cg.get_segment_id(cg.id_client.get_max_node_id(chunk_id)) + if max_segment_id < 10: + continue + + segment_ids = np.random.randint(1, max_segment_id, n_segments_per_chunk) + for segment_id in segment_ids: + node_ids.append(cg.get_node_id(np.uint64(segment_id), np.uint64(chunk_id))) + + rows = cg.client.read_nodes( + node_ids=node_ids, end_time=timestamp, properties=attributes.Hierarchy.Parent + ) + valid_node_ids = [] + non_valid_node_ids = [] + for k in rows.keys(): + if len(rows[k]) > 0: + valid_node_ids.append(k) + else: + non_valid_node_ids.append(k) + + cc_edges = cg.get_atomic_cross_edges(valid_node_ids) + cc_ids = np.unique( + np.concatenate( + [ + np.concatenate(list(v.values())) + for v in list(cc_edges.values()) + if len(v.values()) + ] + ) + ) + + roots = cg.get_roots(cc_ids) + root_dict = dict(zip(cc_ids, roots)) + root_dict_vec = np.vectorize(root_dict.get) + + for k in cc_edges: + if len(cc_edges[k]) == 0: + continue + local_ids = np.unique(np.concatenate(list(cc_edges[k].values()))) + assert len(np.unique(root_dict_vec(local_ids))) + print("success") + + +def run_all(cg: ChunkedGraph): + print("Running family tests:") + family(cg) + + print("\nRunning existence tests:") + existence(cg) + + print("\nRunning cross_edges tests:") + cross_edges(cg) diff --git a/pychunkedgraph/ingest/upgrade/__init__.py b/pychunkedgraph/ingest/upgrade/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pychunkedgraph/ingest/upgrade/atomic_layer.py b/pychunkedgraph/ingest/upgrade/atomic_layer.py new file mode 100644 index 000000000..c9c8bdb11 --- /dev/null +++ b/pychunkedgraph/ingest/upgrade/atomic_layer.py @@ -0,0 +1,85 @@ +# pylint: disable=invalid-name, missing-docstring, c-extension-no-member + +from datetime import timedelta + +import fastremap +import numpy as np +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph.attributes import Connectivity +from pychunkedgraph.graph.utils import serializers + +from .utils import exists_as_parent, get_parent_timestamps + + +def update_cross_edges( + cg: ChunkedGraph, node, cx_edges_d: dict, node_ts, timestamps: set, earliest_ts +) -> list: + """ + Helper function to update a single L2 ID. + Returns a list of mutations with given timestamps. 
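The central step below remaps the partner supervoxels of each stored cross edge to their parents at a given timestamp using `fastremap`; a tiny illustration with made-up IDs:

```python
# Illustration of the remapping step; ids are made up.
import numpy as np
import fastremap

layer_edges = np.array([[11, 101], [11, 102]], dtype=np.uint64)  # [l2_node, partner_supervoxel]
edge_parents_d = {np.uint64(101): np.uint64(501), np.uint64(102): np.uint64(502)}  # sv -> parent at ts
layer_edges = fastremap.remap(layer_edges, edge_parents_d, preserve_missing_labels=True)
# layer_edges -> [[11, 501], [11, 502]]; column 0 is then overwritten with the node itself
```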
+ """ + rows = [] + edges = np.concatenate(list(cx_edges_d.values())) + uparents = np.unique(cg.get_parents(edges[:, 0], time_stamp=node_ts)) + assert uparents.size <= 1, f"{node}, {node_ts}, {uparents}" + if uparents.size == 0 or node != uparents[0]: + # if node is not the parent at this ts, it must be invalid + assert not exists_as_parent(cg, node, edges[:, 0]) + return rows + + partner_parent_ts_d = get_parent_timestamps(cg, edges[:, 1]) + for v in partner_parent_ts_d.values(): + timestamps.update(v) + + for ts in sorted(timestamps): + if ts < earliest_ts: + ts = earliest_ts + val_dict = {} + svs = edges[:, 1] + parents = cg.get_parents(svs, time_stamp=ts) + edge_parents_d = dict(zip(svs, parents)) + for layer, layer_edges in cx_edges_d.items(): + layer_edges = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + layer_edges[:, 0] = node + layer_edges = np.unique(layer_edges, axis=0) + col = Connectivity.CrossChunkEdge[layer] + val_dict[col] = layer_edges + row_id = serializers.serialize_uint64(node) + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=ts)) + return rows + + +def update_nodes(cg: ChunkedGraph, nodes) -> list: + nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) + earliest_ts = cg.get_earliest_timestamp() + timestamps_d = get_parent_timestamps(cg, nodes) + cx_edges_d = cg.get_atomic_cross_edges(nodes) + rows = [] + for node, node_ts in zip(nodes, nodes_ts): + if cg.get_parent(node) is None: + # invalid id caused by failed ingest task + continue + _cx_edges_d = cx_edges_d.get(node, {}) + if not _cx_edges_d: + continue + _rows = update_cross_edges( + cg, node, _cx_edges_d, node_ts, timestamps_d[node], earliest_ts + ) + rows.extend(_rows) + return rows + + +def update_chunk(cg: ChunkedGraph, chunk_coords: list[int], layer: int = 2): + """ + Iterate over all L2 IDs in a chunk and update their cross chunk edges, + within the periods they were valid/active. 
+ """ + x, y, z = chunk_coords + chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + cg.copy_fake_edges(chunk_id) + rr = cg.range_read_chunk(chunk_id) + nodes = list(rr.keys()) + rows = update_nodes(cg, nodes) + cg.client.write(rows) diff --git a/pychunkedgraph/ingest/upgrade/parent_layer.py b/pychunkedgraph/ingest/upgrade/parent_layer.py new file mode 100644 index 000000000..6f0b08711 --- /dev/null +++ b/pychunkedgraph/ingest/upgrade/parent_layer.py @@ -0,0 +1,179 @@ +# pylint: disable=invalid-name, missing-docstring, c-extension-no-member + +import math, random, time +import multiprocessing as mp +from collections import defaultdict + +import fastremap +import numpy as np +from tqdm import tqdm + +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph.attributes import Connectivity, Hierarchy +from pychunkedgraph.graph.edits import get_supervoxels +from pychunkedgraph.graph.utils import serializers +from pychunkedgraph.graph.types import empty_2d +from pychunkedgraph.utils.general import chunked + +from .utils import exists_as_parent, get_parent_timestamps + + +CHILDREN = {} +CX_EDGES = {} + + +def _populate_nodes_and_children( + cg: ChunkedGraph, chunk_id: np.uint64, nodes: list = None +) -> dict: + global CHILDREN + if nodes: + CHILDREN = cg.get_children(nodes) + return + response = cg.range_read_chunk(chunk_id, properties=Hierarchy.Child) + for k, v in response.items(): + CHILDREN[k] = v[0].value + + +def _get_cx_edges_at_timestamp(node, response, ts): + result = defaultdict(list) + for child in CHILDREN[node]: + if child not in response: + continue + for key, cells in response[child].items(): + for cell in cells: + # cells are sorted in descending order of timestamps + if ts >= cell.timestamp: + result[key.index].append(cell.value) + break + for layer, edges in result.items(): + result[layer] = np.concatenate(edges) + return result + + +def _populate_cx_edges_with_timestamps( + cg: ChunkedGraph, layer: int, nodes: list, nodes_ts: list, earliest_ts +): + """ + Collect timestamps of edits from children, since we use the same timestamp + for all IDs involved in an edit, we can use the timestamps of + when cross edges of children were updated. + """ + global CX_EDGES + attrs = [Connectivity.CrossChunkEdge[l] for l in range(layer, cg.meta.layer_count)] + all_children = np.concatenate(list(CHILDREN.values())) + response = cg.client.read_nodes(node_ids=all_children, properties=attrs) + timestamps_d = get_parent_timestamps(cg, nodes) + for node, node_ts in zip(nodes, nodes_ts): + CX_EDGES[node] = {} + timestamps = timestamps_d[node] + cx_edges_d_node_ts = _get_cx_edges_at_timestamp(node, response, node_ts) + + edges = np.concatenate([empty_2d] + list(cx_edges_d_node_ts.values())) + partner_parent_ts_d = get_parent_timestamps(cg, edges[:, 1]) + for v in partner_parent_ts_d.values(): + timestamps.update(v) + CX_EDGES[node][node_ts] = cx_edges_d_node_ts + + for ts in sorted(timestamps): + if ts < earliest_ts: + ts = earliest_ts + CX_EDGES[node][ts] = _get_cx_edges_at_timestamp(node, response, ts) + + +def update_cross_edges(cg: ChunkedGraph, layer, node, node_ts, earliest_ts) -> list: + """ + Helper function to update a single ID. + Returns a list of mutations with timestamps. 
+ """ + rows = [] + if node_ts > earliest_ts: + try: + cx_edges_d = CX_EDGES[node][node_ts] + except KeyError: + raise KeyError(f"{node}:{node_ts}") + edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) + if edges.size: + parents = cg.get_roots( + edges[:, 0], time_stamp=node_ts, stop_layer=layer, ceil=False + ) + uparents = np.unique(parents) + layers = cg.get_chunk_layers(uparents) + uparents = uparents[layers == layer] + assert uparents.size <= 1, f"{node}, {node_ts}, {uparents}" + if uparents.size == 0 or node != uparents[0]: + # if node is not the parent at this ts, it must be invalid + assert not exists_as_parent(cg, node, edges[:, 0]), f"{node}, {node_ts}" + return rows + + for ts, cx_edges_d in CX_EDGES[node].items(): + edges = np.concatenate([empty_2d] + list(cx_edges_d.values())) + if edges.size == 0: + continue + nodes = np.unique(edges[:, 1]) + svs = get_supervoxels(cg, nodes) + parents = cg.get_roots(svs, time_stamp=ts, stop_layer=layer, ceil=False) + edge_parents_d = dict(zip(nodes, parents)) + val_dict = {} + for _layer, layer_edges in cx_edges_d.items(): + layer_edges = fastremap.remap( + layer_edges, edge_parents_d, preserve_missing_labels=True + ) + layer_edges[:, 0] = node + layer_edges = np.unique(layer_edges, axis=0) + col = Connectivity.CrossChunkEdge[_layer] + val_dict[col] = layer_edges + row_id = serializers.serialize_uint64(node) + rows.append(cg.client.mutate_row(row_id, val_dict, time_stamp=ts)) + return rows + + +def _update_cross_edges_helper(args): + cg_info, layer, nodes, nodes_ts, earliest_ts = args + rows = [] + cg = ChunkedGraph(**cg_info) + parents = cg.get_parents(nodes, fail_to_zero=True) + for node, parent, node_ts in zip(nodes, parents, nodes_ts): + if parent == 0: + # invalid id caused by failed ingest task + continue + _rows = update_cross_edges(cg, layer, node, node_ts, earliest_ts) + rows.extend(_rows) + cg.client.write(rows) + + +def update_chunk( + cg: ChunkedGraph, chunk_coords: list[int], layer: int, nodes: list = None +): + """ + Iterate over all layer IDs in a chunk and update their cross chunk edges. 
+ """ + start = time.time() + x, y, z = chunk_coords + chunk_id = cg.get_chunk_id(layer=layer, x=x, y=y, z=z) + earliest_ts = cg.get_earliest_timestamp() + _populate_nodes_and_children(cg, chunk_id, nodes=nodes) + if not CHILDREN: + return + nodes = list(CHILDREN.keys()) + random.shuffle(nodes) + nodes_ts = cg.get_node_timestamps(nodes, return_numpy=False, normalize=True) + _populate_cx_edges_with_timestamps(cg, layer, nodes, nodes_ts, earliest_ts) + + task_size = int(math.ceil(len(nodes) / mp.cpu_count() / 2)) + chunked_nodes = chunked(nodes, task_size) + chunked_nodes_ts = chunked(nodes_ts, task_size) + cg_info = cg.get_serialized_info() + + tasks = [] + for chunk, ts_chunk in zip(chunked_nodes, chunked_nodes_ts): + args = (cg_info, layer, chunk, ts_chunk, earliest_ts) + tasks.append(args) + + with mp.Pool(min(mp.cpu_count(), len(tasks))) as pool: + _ = list( + tqdm( + pool.imap_unordered(_update_cross_edges_helper, tasks), + total=len(tasks), + ) + ) + print(f"total elaspsed time: {time.time() - start}") diff --git a/pychunkedgraph/ingest/upgrade/utils.py b/pychunkedgraph/ingest/upgrade/utils.py new file mode 100644 index 000000000..cc43b561a --- /dev/null +++ b/pychunkedgraph/ingest/upgrade/utils.py @@ -0,0 +1,63 @@ +# pylint: disable=invalid-name, missing-docstring + +from collections import defaultdict +from datetime import timedelta + +import numpy as np +from pychunkedgraph.graph import ChunkedGraph +from pychunkedgraph.graph.attributes import Hierarchy + + +def exists_as_parent(cg: ChunkedGraph, parent, nodes) -> bool: + """ + Check if a given l2 parent is in the history of given nodes. + """ + response = cg.client.read_nodes(node_ids=nodes, properties=Hierarchy.Parent) + parents = set() + for cells in response.values(): + parents.update([cell.value for cell in cells]) + return parent in parents + + +def get_edit_timestamps(cg: ChunkedGraph, edges_d, start_ts, end_ts) -> list: + """ + Timestamps of when post-side nodes were involved in an edit. + Post-side - nodes in the neighbor chunk. + This is required because we need to update edges from both sides. + """ + cx_edges = np.concatenate(list(edges_d.values())) + timestamps = get_parent_timestamps( + cg, cx_edges[:, 1], start_time=start_ts, end_time=end_ts + ) + timestamps.add(start_ts) + return sorted(timestamps) + + +def get_end_ts(cg: ChunkedGraph, children, start_ts): + # get end_ts when node becomes invalid (bigtable resolution is in ms) + start = start_ts + timedelta(milliseconds=1) + _timestamps = get_parent_timestamps(cg, children, start_time=start) + try: + end_ts = sorted(_timestamps)[0] + except IndexError: + # start_ts == end_ts means there has been no edit involving this node + # meaning only one timestamp to update cross edges, start_ts + end_ts = start_ts + return end_ts + + +def get_parent_timestamps(cg: ChunkedGraph, nodes) -> dict[int, set]: + """ + Timestamps of when the given nodes were edited. 
+ """ + response = cg.client.read_nodes( + node_ids=nodes, + properties=[Hierarchy.Parent], + end_time_inclusive=False, + ) + + result = defaultdict(set) + for k, v in response.items(): + for cell in v[Hierarchy.Parent]: + result[k].add(cell.timestamp) + return result diff --git a/pychunkedgraph/ingest/utils.py b/pychunkedgraph/ingest/utils.py index fa7ef7a3c..45b6e728f 100644 --- a/pychunkedgraph/ingest/utils.py +++ b/pychunkedgraph/ingest/utils.py @@ -1,14 +1,25 @@ -from typing import Tuple +# pylint: disable=invalid-name, missing-docstring +import logging +import functools +import math +from os import environ +from time import sleep +from typing import Any, Generator, Tuple -from . import ClusterIngestConfig -from . import IngestConfig -from ..graph.meta import ChunkedGraphMeta -from ..graph.meta import DataSource -from ..graph.meta import GraphConfig +import numpy as np +import tensorstore as ts +from rq import Queue, Worker +from rq.worker import WorkerStatus +from . import IngestConfig +from .manager import IngestionManager +from ..graph.meta import ChunkedGraphMeta, DataSource, GraphConfig from ..graph.client import BackendClientInfo from ..graph.client.bigtable import BigTableConfig +from ..utils.general import chunked +from ..utils.redis import get_redis_connection +from ..utils.redis import keys as r_keys chunk_id_str = lambda layer, coords: f"{layer}_{'_'.join(map(str, coords))}" @@ -16,14 +27,12 @@ def bootstrap( graph_id: str, config: dict, - overwrite: bool = False, raw: bool = False, test_run: bool = False, ) -> Tuple[ChunkedGraphMeta, IngestConfig, BackendClientInfo]: """Parse config loaded from a yaml file.""" ingest_config = IngestConfig( **config.get("ingest_config", {}), - CLUSTER=ClusterIngestConfig(), USE_RAW_EDGES=raw, USE_RAW_COMPONENTS=raw, TEST_RUN=test_run, @@ -33,7 +42,7 @@ def bootstrap( graph_config = GraphConfig( ID=f"{graph_id}", - OVERWRITE=overwrite, + OVERWRITE=False, **config["graph_config"], ) data_source = DataSource(**config["data_source"]) @@ -72,4 +81,146 @@ def postprocess_edge_data(im, edge_dict): return new_edge_dict else: - raise Exception(f"Unknown data_version: {data_version}") + raise ValueError(f"Unknown data_version: {data_version}") + + +def start_ocdbt_server(imanager: IngestionManager, server: Any): + spec = {"driver": "ocdbt", "base": f"{imanager.cg.meta.data_source.EDGES}/ocdbt"} + spec["coordinator"] = {"address": f"localhost:{server.port}"} + ts.KvStore.open(spec).result() + imanager.redis.set("OCDBT_COORDINATOR_PORT", str(server.port)) + ocdbt_host = environ.get("MY_POD_IP", "localhost") + imanager.redis.set("OCDBT_COORDINATOR_HOST", ocdbt_host) + logging.info(f"OCDBT Coordinator address {ocdbt_host}:{server.port}") + + +def randomize_grid_points(X: int, Y: int, Z: int) -> Generator[int, int, int]: + indices = np.arange(X * Y * Z) + np.random.shuffle(indices) + for index in indices: + yield np.unravel_index(index, (X, Y, Z)) + + +def get_chunks_not_done(imanager: IngestionManager, layer: int, coords: list) -> list: + """check for set membership in redis in batches""" + coords_strs = ["_".join(map(str, coord)) for coord in coords] + try: + completed = imanager.redis.smismember(f"{layer}c", coords_strs) + except Exception: + return coords + return [coord for coord, c in zip(coords, completed) if not c] + + +def print_completion_rate(imanager: IngestionManager, layer: int, span: int = 10): + counts = [] + for _ in range(span + 1): + counts.append(imanager.redis.scard(f"{layer}c")) + sleep(1) + rate = np.diff(counts).sum() / span + 
print(f"{rate} chunks per second.") + + +def print_status(imanager: IngestionManager, redis, upgrade: bool = False): + """ + Helper to print status to console. + If `upgrade=True`, status does not include the root layer, + since there is no need to update cross edges for root ids. + """ + layers = range(2, imanager.cg_meta.layer_count + 1) + if upgrade: + layers = range(2, imanager.cg_meta.layer_count) + layer_counts = imanager.cg_meta.layer_chunk_counts + + pipeline = redis.pipeline() + pipeline.get(r_keys.JOB_TYPE) + worker_busy = [] + for layer in layers: + pipeline.scard(f"{layer}c") + queue = Queue(f"l{layer}", connection=redis) + pipeline.llen(queue.key) + pipeline.zcard(queue.failed_job_registry.key) + workers = Worker.all(queue=queue) + worker_busy.append(sum([w.get_state() == WorkerStatus.BUSY for w in workers])) + + results = pipeline.execute() + job_type = "not_available" + if results[0] is not None: + job_type = results[0].decode() + completed = [] + queued = [] + failed = [] + for i in range(1, len(results), 3): + result = results[i : i + 3] + completed.append(result[0]) + queued.append(result[1]) + failed.append(result[2]) + + header = ( + f"\njob_type: \t{job_type}" + f"\nversion: \t{imanager.cg.version}" + f"\ngraph_id: \t{imanager.cg.graph_id}" + f"\nchunk_size: \t{imanager.cg.meta.graph_config.CHUNK_SIZE}" + "\n\nlayer status:" + ) + print(header) + for layer, done, count in zip(layers, completed, layer_counts): + print(f"{layer}\t| {done:9} / {count} \t| {math.floor((done/count)*100):6}%") + + print("\n\nqueue status:") + for layer, q, f, wb in zip(layers, queued, failed, worker_busy): + print(f"l{layer}\t| queued: {q:<10} failed: {f:<10} busy: {wb}") + + +def queue_layer_helper(parent_layer: int, imanager: IngestionManager, fn): + if parent_layer == imanager.cg_meta.layer_count: + chunk_coords = [(0, 0, 0)] + else: + bounds = imanager.cg_meta.layer_chunk_bounds[parent_layer] + chunk_coords = randomize_grid_points(*bounds) + + q = imanager.get_task_queue(f"l{parent_layer}") + batch_size = int(environ.get("JOB_BATCH_SIZE", 10000)) + timeout_scale = int(environ.get("TIMEOUT_SCALE_FACTOR", 1)) + batches = chunked(chunk_coords, batch_size) + for batch in batches: + _coords = get_chunks_not_done(imanager, parent_layer, batch) + # buffer for optimal use of redis memory + if len(q) > int(environ.get("QUEUE_SIZE", 100000)): + interval = int(environ.get("QUEUE_INTERVAL", 300)) + logging.info(f"Queue full; sleeping {interval}s...") + sleep(interval) + + job_datas = [] + for chunk_coord in _coords: + job_datas.append( + Queue.prepare_data( + fn, + args=(parent_layer, chunk_coord), + result_ttl=0, + job_id=chunk_id_str(parent_layer, chunk_coord), + timeout=f"{timeout_scale * int(parent_layer * parent_layer)}m", + ) + ) + q.enqueue_many(job_datas) + + +def job_type_guard(job_type: str): + def decorator_job_type_guard(func): + @functools.wraps(func) + def wrapper_job_type_guard(*args, **kwargs): + redis = get_redis_connection() + current_type = redis.get(r_keys.JOB_TYPE) + if current_type is not None: + current_type = current_type.decode() + msg = ( + f"Currently running `{current_type}`. You're attempting to run `{job_type}`." + f"\nRun `[flask] {current_type} flush_redis` to clear the current job and restart." 
+ ) + if current_type != job_type: + print(f"\n*WARNING*\n{msg}") + exit(1) + return func(*args, **kwargs) + + return wrapper_job_type_guard + + return decorator_job_type_guard diff --git a/pychunkedgraph/repair/edits.py b/pychunkedgraph/repair/edits.py index cb403a380..849b17e08 100644 --- a/pychunkedgraph/repair/edits.py +++ b/pychunkedgraph/repair/edits.py @@ -56,8 +56,6 @@ def repair_operation( op_ids_to_retry.append(locked_op) print(f"{node_id} indefinitely locked by op {locked_op}") print(f"total to retry: {len(op_ids_to_retry)}") - - logs = cg.client.read_log_entries(op_ids_to_retry) - for op_id, log in logs.items(): + for op_id in op_ids_to_retry: print(f"repairing {op_id}") - repair_operation(cg, log, op_id) + repair_operation(cg, op_id) diff --git a/pychunkedgraph/tests/helpers.py b/pychunkedgraph/tests/helpers.py index de5314422..551c596bf 100644 --- a/pychunkedgraph/tests/helpers.py +++ b/pychunkedgraph/tests/helpers.py @@ -14,12 +14,12 @@ from google.cloud import bigtable from ..ingest.utils import bootstrap -from ..ingest.create.atomic_layer import add_atomic_edges +from ..ingest.create.atomic_layer import add_atomic_chunk from ..graph.edges import Edges from ..graph.edges import EDGE_TYPES from ..graph.utils import basetypes from ..graph.chunkedgraph import ChunkedGraph -from ..ingest.create.abstract_layers import add_layer +from ..ingest.create.parent_layer import add_parent_chunk class CloudVolumeBounds(object): @@ -120,7 +120,7 @@ def _cgraph(request, n_layers=10, atomic_chunk_bounds: np.ndarray = np.array([]) "FANOUT": 2, "SPATIAL_BITS": 10, "ID_PREFIX": "", - "ROOT_LOCK_EXPIRY": timedelta(seconds=5) + "ROOT_LOCK_EXPIRY": timedelta(seconds=5), }, "backend_client": { "TYPE": "bigtable", @@ -130,15 +130,14 @@ def _cgraph(request, n_layers=10, atomic_chunk_bounds: np.ndarray = np.array([]) "PROJECT": "IGNORE_ENVIRONMENT_PROJECT", "INSTANCE": "emulated_instance", "CREDENTIALS": credentials.AnonymousCredentials(), - "MAX_ROW_KEY_COUNT": 1000 + "MAX_ROW_KEY_COUNT": 1000, }, }, "ingest_config": {}, } meta, _, client_info = bootstrap("test", config=config) - graph = ChunkedGraph(graph_id="test", meta=meta, - client_info=client_info) + graph = ChunkedGraph(graph_id="test", meta=meta, client_info=client_info) graph.mock_edges = Edges([], []) graph.meta._ws_cv = CloudVolumeMock() graph.meta.layer_count = n_layers @@ -176,8 +175,7 @@ def gen_graph_simplequerytest(request, gen_graph): # Chunk B create_chunk( graph, - vertices=[to_label(graph, 1, 1, 0, 0, 0), - to_label(graph, 1, 1, 0, 0, 1)], + vertices=[to_label(graph, 1, 1, 0, 0, 0), to_label(graph, 1, 1, 0, 0, 1)], edges=[ (to_label(graph, 1, 1, 0, 0, 0), to_label(graph, 1, 1, 0, 0, 1), 0.5), (to_label(graph, 1, 1, 0, 0, 0), to_label(graph, 1, 2, 0, 0, 0), inf), @@ -188,13 +186,12 @@ def gen_graph_simplequerytest(request, gen_graph): create_chunk( graph, vertices=[to_label(graph, 1, 2, 0, 0, 0)], - edges=[(to_label(graph, 1, 2, 0, 0, 0), - to_label(graph, 1, 1, 0, 0, 0), inf)], + edges=[(to_label(graph, 1, 2, 0, 0, 0), to_label(graph, 1, 1, 0, 0, 0), inf)], ) - add_layer(graph, 3, [0, 0, 0], n_threads=1) - add_layer(graph, 3, [1, 0, 0], n_threads=1) - add_layer(graph, 4, [0, 0, 0], n_threads=1) + add_parent_chunk(graph, 3, [0, 0, 0], n_threads=1) + add_parent_chunk(graph, 3, [1, 0, 0], n_threads=1) + add_parent_chunk(graph, 4, [0, 0, 0], n_threads=1) return graph @@ -206,8 +203,7 @@ def create_chunk(cg, vertices=None, edges=None, timestamp=None): edges = edges if edges else [] vertices = vertices if vertices else [] vertices = 
np.unique(np.array(vertices, dtype=np.uint64)) - edges = [(np.uint64(v1), np.uint64(v2), np.float32(aff)) - for v1, v2, aff in edges] + edges = [(np.uint64(v1), np.uint64(v2), np.float32(aff)) for v1, v2, aff in edges] isolated_ids = [ x for x in vertices @@ -230,8 +226,7 @@ def create_chunk(cg, vertices=None, edges=None, timestamp=None): chunk_id = None if len(chunk_edges_active[EDGE_TYPES.in_chunk]): - chunk_id = cg.get_chunk_id( - chunk_edges_active[EDGE_TYPES.in_chunk].node_ids1[0]) + chunk_id = cg.get_chunk_id(chunk_edges_active[EDGE_TYPES.in_chunk].node_ids1[0]) elif len(vertices): chunk_id = cg.get_chunk_id(vertices[0]) @@ -257,11 +252,12 @@ def create_chunk(cg, vertices=None, edges=None, timestamp=None): cg.mock_edges += all_edges isolated_ids = np.array(isolated_ids, dtype=np.uint64) - add_atomic_edges( + add_atomic_chunk( cg, cg.get_chunk_coordinates(chunk_id), chunk_edges_active, isolated=isolated_ids, + time_stamp=timestamp, ) @@ -282,21 +278,21 @@ def get_layer_chunk_bounds( return layer_bounds_d -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def sv_data(): - test_data_dir = 'pychunkedgraph/tests/data' - edges_file = f'{test_data_dir}/sv_edges.npy' + test_data_dir = "pychunkedgraph/tests/data" + edges_file = f"{test_data_dir}/sv_edges.npy" sv_edges = np.load(edges_file) - source_file = f'{test_data_dir}/sv_sources.npy' + source_file = f"{test_data_dir}/sv_sources.npy" sv_sources = np.load(source_file) - sinks_file = f'{test_data_dir}/sv_sinks.npy' + sinks_file = f"{test_data_dir}/sv_sinks.npy" sv_sinks = np.load(sinks_file) - affinity_file = f'{test_data_dir}/sv_affinity.npy' + affinity_file = f"{test_data_dir}/sv_affinity.npy" sv_affinity = np.load(affinity_file) - area_file = f'{test_data_dir}/sv_area.npy' + area_file = f"{test_data_dir}/sv_area.npy" sv_area = np.load(area_file) yield (sv_edges, sv_sources, sv_sinks, sv_affinity, sv_area) diff --git a/pychunkedgraph/tests/test_uncategorized.py b/pychunkedgraph/tests/test_uncategorized.py index 93c41158d..5c2de29d4 100644 --- a/pychunkedgraph/tests/test_uncategorized.py +++ b/pychunkedgraph/tests/test_uncategorized.py @@ -1,20 +1,10 @@ -import collections -import os -import subprocess -import sys from time import sleep -from datetime import datetime, timedelta -from functools import partial +from datetime import datetime, timedelta, UTC from math import inf -from signal import SIGTERM -from unittest import mock from warnings import warn import numpy as np import pytest -from google.auth import credentials -from google.cloud import bigtable -from grpc._channel import _Rendezvous from .helpers import ( bigtable_emulator, @@ -24,19 +14,20 @@ to_label, sv_data, ) +from ..graph import ChunkedGraph from ..graph import types from ..graph import attributes from ..graph import exceptions -from ..graph import chunkedgraph from ..graph.edges import Edges from ..graph.utils import basetypes -from ..graph.misc import get_delta_roots +from ..graph.lineage import lineage_graph +from ..graph.misc import get_delta_roots, get_latest_roots from ..graph.cutting import run_multicut from ..graph.lineage import get_root_id_history from ..graph.lineage import get_future_root_ids from ..graph.utils.serializers import serialize_uint64 from ..graph.utils.serializers import deserialize_uint64 -from ..ingest.create.abstract_layers import add_layer +from ..ingest.create.parent_layer import add_parent_chunk class TestGraphNodeConversion: @@ -68,9 +59,9 @@ def test_node_id_adjacency(self, gen_graph): ) == cg.get_node_id(np.uint64(1), 
layer=2, x=3, y=1, z=0) assert cg.get_node_id( - np.uint64(2 ** 53 - 2), layer=10, x=0, y=0, z=0 + np.uint64(2**53 - 2), layer=10, x=0, y=0, z=0 ) + np.uint64(1) == cg.get_node_id( - np.uint64(2 ** 53 - 1), layer=10, x=0, y=0, z=0 + np.uint64(2**53 - 1), layer=10, x=0, y=0, z=0 ) @pytest.mark.timeout(30) @@ -82,9 +73,9 @@ def test_serialize_node_id(self, gen_graph): ) < serialize_uint64(cg.get_node_id(np.uint64(1), layer=2, x=3, y=1, z=0)) assert serialize_uint64( - cg.get_node_id(np.uint64(2 ** 53 - 2), layer=10, x=0, y=0, z=0) + cg.get_node_id(np.uint64(2**53 - 2), layer=10, x=0, y=0, z=0) ) < serialize_uint64( - cg.get_node_id(np.uint64(2 ** 53 - 1), layer=10, x=0, y=0, z=0) + cg.get_node_id(np.uint64(2**53 - 1), layer=10, x=0, y=0, z=0) ) @pytest.mark.timeout(30) @@ -222,7 +213,7 @@ def test_build_single_across_edge(self, gen_graph): edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], ) - add_layer(cg, 3, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 3, [0, 0, 0], n_threads=1) res = cg.client._table.read_rows() res.consume_all() @@ -327,7 +318,7 @@ def test_build_single_edge_and_single_across_edge(self, gen_graph): edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], ) - add_layer(cg, 3, np.array([0, 0, 0]), n_threads=1) + add_parent_chunk(cg, 3, np.array([0, 0, 0]), n_threads=1) res = cg.client._table.read_rows() res.consume_all() @@ -424,10 +415,10 @@ def test_build_big_graph(self, gen_graph): # Preparation: Build Chunk Z create_chunk(cg, vertices=[to_label(cg, 1, 7, 7, 7, 0)], edges=[]) - add_layer(cg, 3, [0, 0, 0], n_threads=1) - add_layer(cg, 3, [3, 3, 3], n_threads=1) - add_layer(cg, 4, [0, 0, 0], n_threads=1) - add_layer(cg, 5, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 3, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 3, [3, 3, 3], n_threads=1) + add_parent_chunk(cg, 4, [0, 0, 0], n_threads=1) + add_parent_chunk(cg, 5, [0, 0, 0], n_threads=1) res = cg.client._table.read_rows() res.consume_all() @@ -452,7 +443,7 @@ def test_double_chunk_creation(self, gen_graph): cg = gen_graph(n_layers=4, atomic_chunk_bounds=atomic_chunk_bounds) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -468,21 +459,21 @@ def test_double_chunk_creation(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], @@ -775,7 +766,7 @@ def test_merge_pair_same_chunk(self, gen_graph): cg = gen_graph(n_layers=2, atomic_chunk_bounds=atomic_chunk_bounds) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -815,7 +806,7 @@ def test_merge_pair_neighboring_chunks(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -831,7 +822,7 @@ def test_merge_pair_neighboring_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -871,7 +862,7 @@ def 
test_merge_pair_disconnected_chunks(self, gen_graph): cg = gen_graph(n_layers=5) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -887,28 +878,28 @@ def test_merge_pair_disconnected_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [3, 3, 3], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 5, [0, 0, 0], @@ -955,7 +946,7 @@ def test_merge_pair_already_connected(self, gen_graph): cg = gen_graph(n_layers=2) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -996,7 +987,7 @@ def test_merge_triple_chain_to_full_circle_same_chunk(self, gen_graph): cg = gen_graph(n_layers=2) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[ @@ -1033,7 +1024,7 @@ def test_merge_triple_chain_to_full_circle_neighboring_chunks(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -1052,7 +1043,7 @@ def test_merge_triple_chain_to_full_circle_neighboring_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1082,7 +1073,7 @@ def test_merge_triple_chain_to_full_circle_disconnected_chunks(self, gen_graph): cg = gen_graph(n_layers=5) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], @@ -1111,35 +1102,35 @@ def test_merge_triple_chain_to_full_circle_disconnected_chunks(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [3, 3, 3], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [1, 1, 1], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 5, [0, 0, 0], @@ -1181,7 +1172,7 @@ def test_merge_same_node(self, gen_graph): cg = gen_graph(n_layers=2) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1223,7 +1214,7 @@ def test_merge_pair_abstract_nodes(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1239,7 +1230,7 @@ def test_merge_pair_abstract_nodes(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, 
[0, 0, 0], @@ -1314,7 +1305,7 @@ def test_diagonal_connections(self, gen_graph): edges=[(to_label(cg, 1, 1, 1, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf)], ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1352,7 +1343,7 @@ def test_cross_edges(self, gen_graph): cg = gen_graph(n_layers=5) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[ @@ -1405,28 +1396,28 @@ def test_cross_edges(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 3, [1, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 4, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, ) - add_layer( + add_parent_chunk( cg, 5, [0, 0, 0], @@ -1466,81 +1457,72 @@ def test_multiple_cuts_and_splits(self, gen_graph_simplequerytest): child_ids = np.concatenate(child_ids) for i in range(10): - - print(f"\n\nITERATION {i}/10") - print("\n\nMERGE 1 & 3\n\n") + print(f"\n\nITERATION {i}/10 - MERGE 1 & 3") new_roots = cg.add_edges( "Jane Doe", [to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], affinities=0.9, ).new_root_ids - assert len(new_roots) == 1 + assert len(new_roots) == 1, new_roots assert len(cg.get_subgraph([new_roots[0]], leaves_only=True)) == 4 - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) - + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) - assert len(u_root_ids) == 1 + assert len(u_root_ids) == 1, u_root_ids # ------------------------------------------------------------------ + print(f"\n\nITERATION {i}/10 - SPLIT 2 & 3") new_roots = cg.remove_edges( "John Doe", source_ids=to_label(cg, 1, 1, 0, 0, 0), sink_ids=to_label(cg, 1, 1, 0, 0, 1), mincut=False, ).new_root_ids + assert len(new_roots) == 2, new_roots - assert len(np.unique(new_roots)) == 2 - - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) - + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) these_child_ids = [] for root_id in u_root_ids: these_child_ids.extend(cg.get_subgraph([root_id], leaves_only=True)) assert len(these_child_ids) == 4 - assert len(u_root_ids) == 2 + assert len(u_root_ids) == 2, u_root_ids # ------------------------------------------------------------------ - + print(f"\n\nITERATION {i}/10 - SPLIT 1 & 3") new_roots = cg.remove_edges( "Jane Doe", source_ids=to_label(cg, 1, 0, 0, 0, 0), sink_ids=to_label(cg, 1, 1, 0, 0, 1), mincut=False, ).new_root_ids - assert len(new_roots) == 2 - - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) + assert len(new_roots) == 2, new_roots + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) - assert len(u_root_ids) == 3 + assert len(u_root_ids) == 3, u_root_ids # ------------------------------------------------------------------ - - print(f"\n\nITERATION {i}/10") - print("\n\nMERGE 2 & 3\n\n") - + print(f"\n\nITERATION {i}/10 - MERGE 2 & 3") new_roots = cg.add_edges( "Jane Doe", [to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], affinities=0.9, ).new_root_ids - assert len(new_roots) == 1 - - root_ids = [] - for child_id in child_ids: - root_ids.append(cg.get_root(child_id)) 
+ assert len(new_roots) == 1, new_roots + root_ids = cg.get_roots(child_ids, assert_roots=True) + print(child_ids) + print(list(root_ids)) u_root_ids = np.unique(root_ids) - assert len(u_root_ids) == 2 + assert len(u_root_ids) == 2, u_root_ids # for root_id in root_ids: # cross_edge_dict_layers = graph_tests.root_cross_edge_test( @@ -1575,7 +1557,7 @@ def test_cut_regular_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1591,7 +1573,7 @@ def test_cut_regular_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1614,7 +1596,7 @@ def test_cut_regular_link(self, gen_graph): disallow_isolating_cut=True, ).new_root_ids - # Check New State + # verify new state assert len(new_root_ids) == 2 assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( to_label(cg, 1, 1, 0, 0, 0) @@ -1646,7 +1628,7 @@ def test_cut_no_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1662,7 +1644,7 @@ def test_cut_no_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1707,7 +1689,7 @@ def test_cut_old_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1723,7 +1705,7 @@ def test_cut_old_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1775,7 +1757,7 @@ def test_cut_indivisible_link(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], @@ -1791,7 +1773,7 @@ def test_cut_indivisible_link(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1837,7 +1819,7 @@ def test_mincut_disrespects_sources_or_sinks(self, gen_graph): """ cg = gen_graph(n_layers=2) - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[ @@ -1877,13 +1859,11 @@ def test_path_augmented_multicut(self, sv_data): edges = Edges( sv_edges[:, 0], sv_edges[:, 1], affinities=sv_affinity, areas=sv_area ) - cut_edges_aug = run_multicut(edges, sv_sources, sv_sinks, path_augment=True) assert cut_edges_aug.shape[0] == 350 with pytest.raises(exceptions.PreconditionError): run_multicut(edges, sv_sources, sv_sinks, path_augment=False) - pass class TestGraphHistory: @@ -1901,20 +1881,14 @@ def test_cut_merge_history(self, gen_graph): (1) Split 1 and 2 (2) Merge 1 and 2 """ - from ..graph.lineage import lineage_graph - - cg = gen_graph(n_layers=3) - - # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 0)], edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 0.5)], 
timestamp=fake_timestamp, ) - - # Preparation: Build Chunk B create_chunk( cg, vertices=[to_label(cg, 1, 1, 0, 0, 0)], @@ -1922,7 +1896,7 @@ def test_cut_merge_history(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -1932,7 +1906,7 @@ def test_cut_merge_history(self, gen_graph): first_root = cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) assert first_root == cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) - timestamp_before_split = datetime.utcnow() + timestamp_before_split = datetime.now(UTC) split_roots = cg.remove_edges( "Jane Doe", source_ids=to_label(cg, 1, 0, 0, 0, 0), @@ -1945,7 +1919,7 @@ def test_cut_merge_history(self, gen_graph): g = lineage_graph(cg, split_roots) assert g.size() == 2 - timestamp_after_split = datetime.utcnow() + timestamp_after_split = datetime.now(UTC) merge_roots = cg.add_edges( "Jane Doe", [to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0)], @@ -1953,7 +1927,7 @@ def test_cut_merge_history(self, gen_graph): ).new_root_ids assert len(merge_roots) == 1 merge_root = merge_roots[0] - timestamp_after_merge = datetime.utcnow() + timestamp_after_merge = datetime.now(UTC) g = lineage_graph(cg, merge_roots) assert g.size() == 4 @@ -2047,7 +2021,7 @@ def test_lock_unlock(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2063,7 +2037,7 @@ def test_lock_unlock(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2113,7 +2087,7 @@ def test_lock_expiration(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2129,7 +2103,7 @@ def test_lock_expiration(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2181,7 +2155,7 @@ def test_lock_renew(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2197,7 +2171,7 @@ def test_lock_renew(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2233,7 +2207,7 @@ def test_lock_merge_lock_old_id(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2249,7 +2223,7 @@ def test_lock_merge_lock_old_id(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2299,7 +2273,7 @@ def test_indefinite_lock(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2315,7 +2289,7 @@ def test_indefinite_lock(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ 
-2372,7 +2346,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): cg = gen_graph(n_layers=3) # Preparation: Build Chunk A - fake_timestamp = datetime.utcnow() - timedelta(days=10) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) create_chunk( cg, vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2388,7 +2362,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): timestamp=fake_timestamp, ) - add_layer( + add_parent_chunk( cg, 3, [0, 0, 0], @@ -2451,7 +2425,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): # cg = gen_graph(n_layers=3) # # Preparation: Build Chunk A - # fake_timestamp = datetime.utcnow() - timedelta(days=10) + # fake_timestamp = datetime.now(UTC) - timedelta(days=10) # create_chunk( # cg, # vertices=[to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2)], @@ -2467,7 +2441,7 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): # timestamp=fake_timestamp, # ) - # add_layer( + # add_parent_chunk( # cg, 3, [0, 0, 0], time_stamp=fake_timestamp, n_threads=1, # ) @@ -2491,1054 +2465,951 @@ def test_indefinite_lock_with_normal_lock_expiration(self, gen_graph): # )[0] -# class MockChunkedGraph: -# """ -# Dummy class to mock partial functionality of the ChunkedGraph for use in unit tests. -# Feel free to add more functions as need be. Can pass in alternative member functions into constructor. -# """ - -# def __init__( -# self, get_chunk_coordinates=None, get_chunk_layer=None, get_chunk_id=None -# ): -# if get_chunk_coordinates is not None: -# self.get_chunk_coordinates = get_chunk_coordinates -# if get_chunk_layer is not None: -# self.get_chunk_layer = get_chunk_layer -# if get_chunk_id is not None: -# self.get_chunk_id = get_chunk_id - -# def get_chunk_coordinates(self, chunk_id): # pylint: disable=method-hidden -# return np.array([0, 0, 0]) - -# def get_chunk_layer(self, chunk_id): # pylint: disable=method-hidden -# return 2 - -# def get_chunk_id(self, *args): # pylint: disable=method-hidden -# return 0 - - -# class TestGraphSplit: -# @pytest.mark.timeout(30) -# def test_split_pair_same_chunk(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (same chunk) -# Expected: Different (new) parents for RG 1 and 2 on Layer two -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1━2 │ => │ 1 2 │ -# │ │ │ │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5)], -# timestamp=fake_timestamp, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 1), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 2 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 0, 0, 0, 1) -# ) -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 1))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 1) in leaves - -# # Check Old State still accessible -# assert cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), 
time_stamp=fake_timestamp -# ) == cg.get_root(to_label(cg, 1, 0, 0, 0, 1), time_stamp=fake_timestamp) -# leaves = np.unique( -# cg.get_subgraph( -# [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], -# leaves_only=True, -# ) -# ) -# assert len(leaves) == 2 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves - -# # assert len(cg.get_latest_roots()) == 2 -# # assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# def test_split_nonexisting_edge(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (same chunk) -# Expected: Different (new) parents for RG 1 and 2 on Layer two -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1━2 │ => │ 1━2 │ -# │ | │ │ | │ -# │ 3 │ │ 3 │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 2), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 2), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 1 - -# @pytest.mark.timeout(30) -# def test_split_pair_neighboring_chunks(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) -# ┌─────┬─────┐ ┌─────┬─────┐ -# │ A¹ │ B¹ │ │ A¹ │ B¹ │ -# │ 1━━┿━━2 │ => │ 1 │ 2 │ -# │ │ │ │ │ │ -# └─────┴─────┘ └─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 1.0)], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 1.0)], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 2 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 1, 0, 0, 0) -# ) -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 1, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 1, 0, 0, 0) in leaves - -# # Check Old State still accessible -# assert cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) == cg.get_root(to_label(cg, 1, 1, 0, 0, 0), time_stamp=fake_timestamp) -# leaves = np.unique( -# cg.get_subgraph( -# [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], -# leaves_only=True, -# ) -# ) -# assert len(leaves) == 2 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 1, 0, 0, 0) in leaves - -# assert len(cg.get_latest_roots()) == 2 -# assert 
len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_verify_cross_chunk_edges(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) -# ┌─────┬─────┬─────┐ ┌─────┬─────┬─────┐ -# | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ -# | │ 1━━┿━━3 │ => | │ 1━━┿━━3 │ -# | │ | │ │ | │ │ │ -# | │ 2 │ │ | │ 2 │ │ -# └─────┴─────┴─────┘ └─────┴─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=4) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 2, 0, 0, 0)], -# edges=[(to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf)], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 1, 0, 0, 1) -# ) -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 2, 0, 0, 0) -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 1, 0, 0, 1), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 2 - -# svs2 = cg.get_subgraph([new_root_ids[0]], leaves_only=True) -# svs1 = cg.get_subgraph([new_root_ids[1]], leaves_only=True) -# len_set = {1, 2} -# assert len(svs1) in len_set -# len_set.remove(len(svs1)) -# assert len(svs2) in len_set - -# # Check New State -# assert len(new_root_ids) == 2 -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 1, 0, 0, 1) -# ) -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 2, 0, 0, 0) -# ) - -# cc_dict = cg.get_atomic_cross_edges( -# cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) -# ) -# assert len(cc_dict[3]) == 1 -# assert cc_dict[3][0][0] == to_label(cg, 1, 1, 0, 0, 0) -# assert cc_dict[3][0][1] == to_label(cg, 1, 2, 0, 0, 0) - -# assert len(cg.get_latest_roots()) == 2 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_verify_loop(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) -# ┌─────┬────────┬─────┐ ┌─────┬────────┬─────┐ -# | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ -# | │ 4━━1━━┿━━5 │ => | │ 4 1━━┿━━5 │ -# | │ / │ | │ | │ │ | │ -# | │ 3 2━━┿━━6 │ | │ 3 2━━┿━━6 │ -# └─────┴────────┴─────┘ └─────┴────────┴─────┘ -# """ - -# cg = gen_graph(n_layers=4) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[ -# to_label(cg, 1, 1, 0, 0, 0), -# to_label(cg, 1, 1, 0, 0, 1), -# to_label(cg, 1, 1, 0, 0, 2), -# to_label(cg, 1, 1, 0, 0, 3), -# ], -# edges=[ -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), -# (to_label(cg, 1, 1, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 1), inf), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 2), 0.5), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 
0, 0, 3), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), -# (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 1), inf), -# (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 0), 0.5), -# ], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 1, 0, 0, 1) -# ) -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 2, 0, 0, 0) -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 1, 0, 0, 2), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 2 - -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 1, 0, 0, 3), -# mincut=False, -# ).new_root_ids - -# assert len(new_root_ids) == 2 - -# cc_dict = cg.get_atomic_cross_edges( -# cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) -# ) -# assert len(cc_dict[3]) == 1 -# cc_dict = cg.get_atomic_cross_edges( -# cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) -# ) -# assert len(cc_dict[3]) == 1 - -# assert len(cg.get_latest_roots()) == 3 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_pair_disconnected_chunks(self, gen_graph): -# """ -# Remove edge between existing RG supervoxels 1 and 2 (disconnected chunks) -# ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ -# │ A¹ │ ... │ Z¹ │ │ A¹ │ ... 
│ Z¹ │ -# │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ -# │ │ │ │ │ │ │ │ -# └─────┘ └─────┘ └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=9) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 7, 7, 7, 0), 1.0,)], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk Z -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 7, 7, 7, 0)], -# edges=[(to_label(cg, 1, 7, 7, 7, 0), to_label(cg, 1, 0, 0, 0, 0), 1.0,)], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 4, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 5, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 5, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 6, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 6, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 7, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 7, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 8, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 8, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# 9, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# # Split -# new_roots = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 7, 7, 7, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_roots) == 2 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( -# to_label(cg, 1, 7, 7, 7, 0) -# ) -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves -# leaves = np.unique( -# cg.get_subgraph([cg.get_root(to_label(cg, 1, 7, 7, 7, 0))], leaves_only=True) -# ) -# assert len(leaves) == 1 and to_label(cg, 1, 7, 7, 7, 0) in leaves - -# # Check Old State still accessible -# assert cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) == cg.get_root(to_label(cg, 1, 7, 7, 7, 0), time_stamp=fake_timestamp) -# leaves = np.unique( -# cg.get_subgraph( -# [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], -# leaves_only=True, -# ) -# ) -# assert len(leaves) == 2 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 7, 7, 7, 0) in leaves - -# @pytest.mark.timeout(30) -# def test_split_pair_already_disconnected(self, gen_graph): -# """ -# Try to remove edge between already disconnected RG supervoxels 1 and 2 (same chunk). 
-# Expected: No change, no error -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1 2 │ => │ 1 2 │ -# │ │ │ │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# res_old = cg.client._table.read_rows() -# res_old.consume_all() - -# # Split -# with pytest.raises(exceptions.PreconditionError): -# cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 1), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ) - -# res_new = cg.client._table.read_rows() -# res_new.consume_all() - -# # Check -# if res_old.rows != res_new.rows: -# warn( -# "Rows were modified when splitting a pair of already disconnected supervoxels. " -# "While probably not an error, it is an unnecessary operation." -# ) - -# @pytest.mark.timeout(30) -# def test_split_full_circle_to_triple_chain_same_chunk(self, gen_graph): -# """ -# Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (same chunk) -# ┌─────┐ ┌─────┐ -# │ A¹ │ │ A¹ │ -# │ 1━2 │ => │ 1 2 │ -# │ ┗3┛ │ │ ┗3┛ │ -# └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[ -# to_label(cg, 1, 0, 0, 0, 0), -# to_label(cg, 1, 0, 0, 0, 1), -# to_label(cg, 1, 0, 0, 0, 2), -# ], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 2), 0.5), -# (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2), 0.5), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.3), -# ], -# timestamp=fake_timestamp, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 1), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 1 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 2)) == new_root_ids[0] -# leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) -# assert len(leaves) == 3 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves -# assert to_label(cg, 1, 0, 0, 0, 2) in leaves - -# # Check Old State still accessible -# old_root_id = cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) -# assert new_root_ids[0] != old_root_id - -# # assert len(cg.get_latest_roots()) == 1 -# # assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_full_circle_to_triple_chain_neighboring_chunks(self, gen_graph): -# """ -# Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (neighboring chunks) -# ┌─────┬─────┐ ┌─────┬─────┐ -# │ A¹ │ B¹ │ │ A¹ │ B¹ │ -# │ 1━━┿━━2 │ => │ 1 │ 2 │ -# │ ┗3━┿━━┛ │ │ ┗3━┿━━┛ │ -# └─────┴─────┘ └─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 0), 0.5), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 
0, 0, 0), 0.3), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[ -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 0.3), -# ], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 1, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert len(new_root_ids) == 1 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == new_root_ids[0] -# leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) -# assert len(leaves) == 3 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves -# assert to_label(cg, 1, 1, 0, 0, 0) in leaves - -# # Check Old State still accessible -# old_root_id = cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) -# assert new_root_ids[0] != old_root_id - -# assert len(cg.get_latest_roots()) == 1 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_full_circle_to_triple_chain_disconnected_chunks(self, gen_graph): -# """ -# Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (disconnected chunks) -# ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ -# │ A¹ │ ... │ Z¹ │ │ A¹ │ ... │ Z¹ │ -# │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ -# │ ┗3━┿━━━━━┿━━┛ │ │ ┗3━┿━━━━━┿━━┛ │ -# └─────┘ └─────┘ └─────┘ └─────┘ -# """ - -# cg = gen_graph(n_layers=9) - -# loc = 2 - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, loc, loc, loc, 0), 0.5,), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, loc, loc, loc, 0), 0.3,), -# ], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk Z -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, loc, loc, loc, 0)], -# edges=[ -# (to_label(cg, 1, loc, loc, loc, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5,), -# (to_label(cg, 1, loc, loc, loc, 0), to_label(cg, 1, 0, 0, 0, 0), 0.3,), -# ], -# timestamp=fake_timestamp, -# ) - -# for i_layer in range(3, 10): -# if loc // 2 ** (i_layer - 3) == 1: -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# elif loc // 2 ** (i_layer - 3) == 0: -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# else: -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) -# add_layer( -# cg, -# i_layer, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# assert ( -# cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) -# == cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) -# == cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) -# ) - -# # Split -# new_root_ids = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, loc, loc, loc, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ).new_root_ids - -# # Check New State -# assert 
len(new_root_ids) == 1 -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] -# assert cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) == new_root_ids[0] -# leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) -# assert len(leaves) == 3 -# assert to_label(cg, 1, 0, 0, 0, 0) in leaves -# assert to_label(cg, 1, 0, 0, 0, 1) in leaves -# assert to_label(cg, 1, loc, loc, loc, 0) in leaves - -# # Check Old State still accessible -# old_root_id = cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp -# ) -# assert new_root_ids[0] != old_root_id - -# assert len(cg.get_latest_roots()) == 1 -# assert len(cg.get_latest_roots(fake_timestamp)) == 1 - -# @pytest.mark.timeout(30) -# def test_split_same_node(self, gen_graph): -# """ -# Try to remove (non-existing) edge between RG supervoxel 1 and itself -# ┌─────┐ -# │ A¹ │ -# │ 1 │ => Reject -# │ │ -# └─────┘ -# """ - -# cg = gen_graph(n_layers=2) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# res_old = cg.client._table.read_rows() -# res_old.consume_all() - -# # Split -# with pytest.raises(exceptions.PreconditionError): -# cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 0), -# mincut=False, -# ) - -# res_new = cg.client._table.read_rows() -# res_new.consume_all() - -# assert res_new.rows == res_old.rows - -# @pytest.mark.timeout(30) -# def test_split_pair_abstract_nodes(self, gen_graph): -# """ -# Try to remove (non-existing) edge between RG supervoxel 1 and abstract node "2" -# ┌─────┐ -# │ B² │ -# │ "2" │ -# │ │ -# └─────┘ -# ┌─────┐ => Reject -# │ A¹ │ -# │ 1 │ -# │ │ -# └─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Preparation: Build Chunk A -# fake_timestamp = datetime.utcnow() - timedelta(days=10) -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# # Preparation: Build Chunk B -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[], -# timestamp=fake_timestamp, -# ) - -# add_layer( -# cg, -# 3, -# [0, 0, 0], -# -# time_stamp=fake_timestamp, -# n_threads=1, -# ) - -# res_old = cg.client._table.read_rows() -# res_old.consume_all() - -# # Split -# with pytest.raises(exceptions.PreconditionError): -# cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 2, 1, 0, 0, 1), -# mincut=False, -# ) - -# res_new = cg.client._table.read_rows() -# res_new.consume_all() - -# assert res_new.rows == res_old.rows - -# @pytest.mark.timeout(30) -# def test_diagonal_connections(self, gen_graph): -# """ -# Create graph with edge between RG supervoxels 1 and 2 (same chunk) -# and edge between RG supervoxels 1 and 3 (neighboring chunks) -# ┌─────┬─────┐ -# │ A¹ │ B¹ │ -# │ 2━1━┿━━3 │ -# │ / │ │ -# ┌─────┬─────┐ -# │ | │ │ -# │ 4━━┿━━5 │ -# │ C¹ │ D¹ │ -# └─────┴─────┘ -# """ - -# cg = gen_graph(n_layers=3) - -# # Chunk A -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], -# edges=[ -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), -# (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf), -# ], -# ) - -# # Chunk B -# create_chunk( -# cg, -# 
vertices=[to_label(cg, 1, 1, 0, 0, 0)], -# edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf)], -# ) - -# # Chunk C -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 0, 1, 0, 0)], -# edges=[ -# (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 1, 1, 0, 0), inf), -# (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf), -# ], -# ) - -# # Chunk D -# create_chunk( -# cg, -# vertices=[to_label(cg, 1, 1, 1, 0, 0)], -# edges=[(to_label(cg, 1, 1, 1, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf)], -# ) - -# add_layer( -# cg, 3, [0, 0, 0], n_threads=1, -# ) - -# rr = cg.range_read_chunk(chunk_id=cg.get_chunk_id(layer=3, x=0, y=0, z=0)) -# root_ids_t0 = list(rr.keys()) - -# assert len(root_ids_t0) == 1 - -# child_ids = [] -# for root_id in root_ids_t0: -# child_ids.extend([cg.get_subgraph([root_id])], leaves_only=True) - -# new_roots = cg.remove_edges( -# "Jane Doe", -# source_ids=to_label(cg, 1, 0, 0, 0, 0), -# sink_ids=to_label(cg, 1, 0, 0, 0, 1), -# mincut=False, -# ).new_root_ids - -# assert len(new_roots) == 2 -# assert cg.get_root(to_label(cg, 1, 1, 1, 0, 0)) == cg.get_root( -# to_label(cg, 1, 0, 1, 0, 0) -# ) -# assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == cg.get_root( -# to_label(cg, 1, 0, 0, 0, 0) -# ) +class TestGraphSplit: + @pytest.mark.timeout(30) + def test_split_pair_same_chunk(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (same chunk) + Expected: Different (new) parents for RG 1 and 2 on Layer two + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1━2 │ => │ 1 2 │ + │ │ │ │ + └─────┘ └─────┘ + """ + + cg: ChunkedGraph = gen_graph(n_layers=2) + + # Preparation: Build Chunk A + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5)], + timestamp=fake_timestamp, + ) + + # Split + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 1), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 2 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( + to_label(cg, 1, 0, 0, 0, 1) + ) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 1))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 1) in leaves + + # verify old state + cg.cache = None + assert cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) == cg.get_root(to_label(cg, 1, 0, 0, 0, 1), time_stamp=fake_timestamp) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], + leaves_only=True, + ) + ) + assert len(leaves) == 2 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 0, 0, 0, 1) in leaves + + assert len(get_latest_roots(cg)) == 2 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + def test_split_nonexisting_edge(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (same chunk) + Expected: Different (new) parents for RG 1 and 2 on Layer two + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1━2 │ => │ 1━2 │ + │ | │ │ | │ + │ 3 │ │ 3 │ + └─────┘ └─────┘ + """ + cg = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + 
create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 0, 0, 0, 2), to_label(cg, 1, 0, 0, 0, 1), 0.5), + ], + timestamp=fake_timestamp, + ) + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 2), + mincut=False, + ).new_root_ids + assert len(new_root_ids) == 1 + + @pytest.mark.timeout(30) + def test_split_pair_neighboring_chunks(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) + ┌─────┬─────┐ ┌─────┬─────┐ + │ A¹ │ B¹ │ │ A¹ │ B¹ │ + │ 1━━┿━━2 │ => │ 1 │ 2 │ + │ │ │ │ │ │ + └─────┴─────┘ └─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0)], + edges=[(to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 1.0)], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 1.0)], + timestamp=fake_timestamp, + ) + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 2 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( + to_label(cg, 1, 1, 0, 0, 0) + ) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 1, 0, 0, 0))], leaves_only=True + ) + ) + assert len(leaves) == 1 and to_label(cg, 1, 1, 0, 0, 0) in leaves + + # verify old state + assert cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) == cg.get_root(to_label(cg, 1, 1, 0, 0, 0), time_stamp=fake_timestamp) + leaves = np.unique( + cg.get_subgraph( + [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], + leaves_only=True, + ) + ) + assert len(leaves) == 2 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 1, 0, 0, 0) in leaves + assert len(get_latest_roots(cg)) == 2 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_verify_cross_chunk_edges(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) + ┌─────┬─────┬─────┐ ┌─────┬─────┬─────┐ + | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ + | │ 1━━┿━━3 │ => | │ 1━━┿━━3 │ + | │ | │ │ | │ │ │ + | │ 2 │ │ | │ 2 │ │ + └─────┴─────┴─────┘ └─────┴─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=4) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 1), 0.5), + ], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 2, 0, 0, 0)], + edges=[(to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf)], + timestamp=fake_timestamp, + ) + + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 3, + 
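# NOTE: parent chunks are built for both layer-3 regions and then the layer-4 chunk + # above them, so get_root can resolve the supervoxels across the chunk boundary. +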
[1, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 4, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 1, 0, 0, 1) + ) + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 2, 0, 0, 0) + ) + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 1, 0, 0, 1), + mincut=False, + ).new_root_ids + + assert len(new_root_ids) == 2 + + svs2 = cg.get_subgraph([new_root_ids[0]], leaves_only=True) + svs1 = cg.get_subgraph([new_root_ids[1]], leaves_only=True) + len_set = {1, 2} + assert len(svs1) in len_set + len_set.remove(len(svs1)) + assert len(svs2) in len_set + + # verify new state + assert len(new_root_ids) == 2 + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) != cg.get_root( + to_label(cg, 1, 1, 0, 0, 1) + ) + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 2, 0, 0, 0) + ) + + # l2id = cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) + # cce = cg.get_atomic_cross_edges([l2id])[l2id] + # assert len(cce[3]) == 1 + # assert cce[3][0][0] == to_label(cg, 1, 1, 0, 0, 0) + # assert cce[3][0][1] == to_label(cg, 1, 2, 0, 0, 0) + + assert len(get_latest_roots(cg)) == 2 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_verify_loop(self, gen_graph): + """ + Remove edge between existing RG supervoxels 1 and 2 (neighboring chunks) + ┌─────┬────────┬─────┐ ┌─────┬────────┬─────┐ + | │ A¹ │ B¹ │ | │ A¹ │ B¹ │ + | │ 4━━1━━┿━━5 │ => | │ 4 1━━┿━━5 │ + | │ / │ | │ | │ │ | │ + | │ 3 2━━┿━━6 │ | │ 3 2━━┿━━6 │ + └─────┴────────┴─────┘ └─────┴────────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=4) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[ + to_label(cg, 1, 1, 0, 0, 0), + to_label(cg, 1, 1, 0, 0, 1), + to_label(cg, 1, 1, 0, 0, 2), + to_label(cg, 1, 1, 0, 0, 3), + ], + edges=[ + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 0), inf), + (to_label(cg, 1, 1, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 1), inf), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 2), 0.5), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 3), 0.5), + ], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 2, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 2, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), + (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 1), inf), + (to_label(cg, 1, 2, 0, 0, 1), to_label(cg, 1, 2, 0, 0, 0), 0.5), + ], + timestamp=fake_timestamp, + ) + + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 3, + [1, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + add_parent_chunk( + cg, + 4, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 1, 0, 0, 1) + ) + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 2, 0, 0, 0) + ) + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 1, 0, 0, 2), + mincut=False, + ).new_root_ids + assert len(new_root_ids) == 2 + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 1, 0, 0, 3), + mincut=False, + ).new_root_ids + assert 
len(new_root_ids) == 2 + + # l2id = cg.get_parent(to_label(cg, 1, 1, 0, 0, 0)) + # cce = cg.get_atomic_cross_edges([l2id]) + # assert len(cce[3]) == 1 + + assert len(get_latest_roots(cg)) == 3 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + # @pytest.mark.timeout(30) + # def test_split_pair_disconnected_chunks(self, gen_graph): + # """ + # Remove edge between existing RG supervoxels 1 and 2 (disconnected chunks) + # ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ + # │ A¹ │ ... │ Z¹ │ │ A¹ │ ... │ Z¹ │ + # │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ + # │ │ │ │ │ │ │ │ + # └─────┘ └─────┘ └─────┘ └─────┘ + # """ + # cg: ChunkedGraph = gen_graph(n_layers=9) + # fake_timestamp = datetime.now(UTC) - timedelta(days=10) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, 0, 0, 0, 0)], + # edges=[ + # ( + # to_label(cg, 1, 0, 0, 0, 0), + # to_label(cg, 1, 7, 7, 7, 0), + # 1.0, + # ) + # ], + # timestamp=fake_timestamp, + # ) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, 7, 7, 7, 0)], + # edges=[ + # ( + # to_label(cg, 1, 7, 7, 7, 0), + # to_label(cg, 1, 0, 0, 0, 0), + # 1.0, + # ) + # ], + # timestamp=fake_timestamp, + # ) + + # add_parent_chunk( + # cg, + # 3, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 3, + # [1, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 4, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 4, + # [1, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 5, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 5, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 6, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 6, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 7, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 7, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 8, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 8, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # 9, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + + # new_roots = cg.remove_edges( + # "Jane Doe", + # source_ids=to_label(cg, 1, 7, 7, 7, 0), + # sink_ids=to_label(cg, 1, 0, 0, 0, 0), + # mincut=False, + # ).new_root_ids + + # # verify new state + # assert len(new_roots) == 2 + # assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) != cg.get_root( + # to_label(cg, 1, 7, 7, 7, 0) + # ) + # leaves = np.unique( + # cg.get_subgraph( + # [cg.get_root(to_label(cg, 1, 0, 0, 0, 0))], leaves_only=True + # ) + # ) + # assert len(leaves) == 1 and to_label(cg, 1, 0, 0, 0, 0) in leaves + # leaves = np.unique( + # cg.get_subgraph( + # [cg.get_root(to_label(cg, 1, 7, 7, 7, 0))], leaves_only=True + # ) + # ) + # assert len(leaves) == 1 and to_label(cg, 1, 7, 7, 7, 0) in leaves + + # # verify old state + # assert cg.get_root( + # to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + # ) == cg.get_root(to_label(cg, 1, 7, 7, 7, 0), time_stamp=fake_timestamp) + # leaves = np.unique( + # cg.get_subgraph( + # [cg.get_root(to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp)], + # 
leaves_only=True, + # ) + # ) + # assert len(leaves) == 2 + # assert to_label(cg, 1, 0, 0, 0, 0) in leaves + # assert to_label(cg, 1, 7, 7, 7, 0) in leaves + + @pytest.mark.timeout(30) + def test_split_pair_already_disconnected(self, gen_graph): + """ + Try to remove edge between already disconnected RG supervoxels 1 and 2 (same chunk). + Expected: No change, no error + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1 2 │ => │ 1 2 │ + │ │ │ │ + └─────┘ └─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[], + timestamp=fake_timestamp, + ) + res_old = cg.client._table.read_rows() + res_old.consume_all() + + with pytest.raises(exceptions.PreconditionError): + cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 1), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ) + + res_new = cg.client._table.read_rows() + res_new.consume_all() + + if res_old.rows != res_new.rows: + warn( + "Rows were modified when splitting a pair of already disconnected supervoxels." + "While probably not an error, it is an unnecessary operation." + ) + + @pytest.mark.timeout(30) + def test_split_full_circle_to_triple_chain_same_chunk(self, gen_graph): + """ + Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (same chunk) + ┌─────┐ ┌─────┐ + │ A¹ │ │ A¹ │ + │ 1━2 │ => │ 1 2 │ + │ ┗3┛ │ │ ┗3┛ │ + └─────┘ └─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[ + to_label(cg, 1, 0, 0, 0, 0), + to_label(cg, 1, 0, 0, 0, 1), + to_label(cg, 1, 0, 0, 0, 2), + ], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 2), 0.5), + (to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 0, 0, 0, 2), 0.5), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.3), + ], + timestamp=fake_timestamp, + ) + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 1), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 1 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 2)) == new_root_ids[0] + leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) + assert len(leaves) == 3 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 0, 0, 0, 1) in leaves + assert to_label(cg, 1, 0, 0, 0, 2) in leaves + + # verify old state + old_root_id = cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) + assert new_root_ids[0] != old_root_id + assert len(get_latest_roots(cg)) == 1 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_full_circle_to_triple_chain_neighboring_chunks(self, gen_graph): + """ + Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (neighboring chunks) + ┌─────┬─────┐ ┌─────┬─────┐ + │ A¹ │ B¹ │ │ A¹ │ B¹ │ + │ 1━━┿━━2 │ => │ 1 │ 2 │ + │ ┗3━┿━━┛ │ │ ┗3━┿━━┛ │ + └─────┴─────┘ └─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + 
(to_label(cg, 1, 0, 0, 0, 1), to_label(cg, 1, 1, 0, 0, 0), 0.5), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), 0.3), + ], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[ + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 0.3), + ], + timestamp=fake_timestamp, + ) + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + + new_root_ids = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 1, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ).new_root_ids + + # verify new state + assert len(new_root_ids) == 1 + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] + assert cg.get_root(to_label(cg, 1, 1, 0, 0, 0)) == new_root_ids[0] + leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) + assert len(leaves) == 3 + assert to_label(cg, 1, 0, 0, 0, 0) in leaves + assert to_label(cg, 1, 0, 0, 0, 1) in leaves + assert to_label(cg, 1, 1, 0, 0, 0) in leaves + + # verify old state + old_root_id = cg.get_root( + to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + ) + assert new_root_ids[0] != old_root_id + assert len(get_latest_roots(cg)) == 1 + assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + # @pytest.mark.timeout(30) + # def test_split_full_circle_to_triple_chain_disconnected_chunks(self, gen_graph): + # """ + # Remove direct edge between RG supervoxels 1 and 2, but leave indirect connection (disconnected chunks) + # ┌─────┐ ┌─────┐ ┌─────┐ ┌─────┐ + # │ A¹ │ ... │ Z¹ │ │ A¹ │ ... │ Z¹ │ + # │ 1━━┿━━━━━┿━━2 │ => │ 1 │ │ 2 │ + # │ ┗3━┿━━━━━┿━━┛ │ │ ┗3━┿━━━━━┿━━┛ │ + # └─────┘ └─────┘ └─────┘ └─────┘ + # """ + # cg: ChunkedGraph = gen_graph(n_layers=9) + # loc = 2 + # fake_timestamp = datetime.now(UTC) - timedelta(days=10) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + # edges=[ + # (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + # ( + # to_label(cg, 1, 0, 0, 0, 1), + # to_label(cg, 1, loc, loc, loc, 0), + # 0.5, + # ), + # ( + # to_label(cg, 1, 0, 0, 0, 0), + # to_label(cg, 1, loc, loc, loc, 0), + # 0.3, + # ), + # ], + # timestamp=fake_timestamp, + # ) + # create_chunk( + # cg, + # vertices=[to_label(cg, 1, loc, loc, loc, 0)], + # edges=[ + # ( + # to_label(cg, 1, loc, loc, loc, 0), + # to_label(cg, 1, 0, 0, 0, 1), + # 0.5, + # ), + # ( + # to_label(cg, 1, loc, loc, loc, 0), + # to_label(cg, 1, 0, 0, 0, 0), + # 0.3, + # ), + # ], + # timestamp=fake_timestamp, + # ) + # for i_layer in range(3, 10): + # if loc // 2 ** (i_layer - 3) == 1: + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # elif loc // 2 ** (i_layer - 3) == 0: + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # else: + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + # add_parent_chunk( + # cg, + # i_layer, + # [0, 0, 0], + # time_stamp=fake_timestamp, + # n_threads=1, + # ) + + # assert ( + # cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) + # == cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) + # == cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) + # ) + # new_root_ids = cg.remove_edges( + # "Jane Doe", + # source_ids=to_label(cg, 1, loc, loc, loc, 0), + # 
sink_ids=to_label(cg, 1, 0, 0, 0, 0), + # mincut=False, + # ).new_root_ids + + # # verify new state + # assert len(new_root_ids) == 1 + # assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == new_root_ids[0] + # assert cg.get_root(to_label(cg, 1, 0, 0, 0, 1)) == new_root_ids[0] + # assert cg.get_root(to_label(cg, 1, loc, loc, loc, 0)) == new_root_ids[0] + # leaves = np.unique(cg.get_subgraph([new_root_ids[0]], leaves_only=True)) + # assert len(leaves) == 3 + # assert to_label(cg, 1, 0, 0, 0, 0) in leaves + # assert to_label(cg, 1, 0, 0, 0, 1) in leaves + # assert to_label(cg, 1, loc, loc, loc, 0) in leaves + + # # verify old state + # old_root_id = cg.get_root( + # to_label(cg, 1, 0, 0, 0, 0), time_stamp=fake_timestamp + # ) + # assert new_root_ids[0] != old_root_id + + # assert len(get_latest_roots(cg)) == 1 + # assert len(get_latest_roots(cg, fake_timestamp)) == 1 + + @pytest.mark.timeout(30) + def test_split_same_node(self, gen_graph): + """ + Try to remove (non-existing) edge between RG supervoxel 1 and itself + ┌─────┐ + │ A¹ │ + │ 1 │ => Reject + │ │ + └─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=2) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0)], + edges=[], + timestamp=fake_timestamp, + ) + + res_old = cg.client._table.read_rows() + res_old.consume_all() + with pytest.raises(exceptions.PreconditionError): + cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 0), + mincut=False, + ) + + res_new = cg.client._table.read_rows() + res_new.consume_all() + assert res_new.rows == res_old.rows + + @pytest.mark.timeout(30) + def test_split_pair_abstract_nodes(self, gen_graph): + """ + Try to remove (non-existing) edge between RG supervoxel 1 and abstract node "2" + ┌─────┐ + │ B² │ + │ "2" │ + │ │ + └─────┘ + ┌─────┐ => Reject + │ A¹ │ + │ 1 │ + │ │ + └─────┘ + """ + + cg: ChunkedGraph = gen_graph(n_layers=3) + fake_timestamp = datetime.now(UTC) - timedelta(days=10) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0)], + edges=[], + timestamp=fake_timestamp, + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[], + timestamp=fake_timestamp, + ) + + add_parent_chunk( + cg, + 3, + [0, 0, 0], + time_stamp=fake_timestamp, + n_threads=1, + ) + res_old = cg.client._table.read_rows() + res_old.consume_all() + with pytest.raises((exceptions.PreconditionError, AssertionError)): + cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 2, 1, 0, 0, 1), + mincut=False, + ) + + res_new = cg.client._table.read_rows() + res_new.consume_all() + assert res_new.rows == res_old.rows + + @pytest.mark.timeout(30) + def test_diagonal_connections(self, gen_graph): + """ + Create graph with edge between RG supervoxels 1 and 2 (same chunk) + and edge between RG supervoxels 1 and 3 (neighboring chunks) + ┌─────┬─────┐ + │ A¹ │ B¹ │ + │ 2━1━┿━━3 │ + │ / │ │ + ┌─────┬─────┐ + │ | │ │ + │ 4━━┿━━5 │ + │ C¹ │ D¹ │ + └─────┴─────┘ + """ + cg: ChunkedGraph = gen_graph(n_layers=3) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1)], + edges=[ + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 1), 0.5), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 1, 0, 0, 0), inf), + (to_label(cg, 1, 0, 0, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf), + ], + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 0, 0, 0)], + edges=[(to_label(cg, 1, 1, 0, 0, 0), to_label(cg, 1, 0, 0, 0, 0), 
inf)], + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 0, 1, 0, 0)], + edges=[ + (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 1, 1, 0, 0), inf), + (to_label(cg, 1, 0, 1, 0, 0), to_label(cg, 1, 0, 0, 0, 0), inf), + ], + ) + create_chunk( + cg, + vertices=[to_label(cg, 1, 1, 1, 0, 0)], + edges=[(to_label(cg, 1, 1, 1, 0, 0), to_label(cg, 1, 0, 1, 0, 0), inf)], + ) + add_parent_chunk( + cg, + 3, + [0, 0, 0], + n_threads=1, + ) + + rr = cg.range_read_chunk(chunk_id=cg.get_chunk_id(layer=3, x=0, y=0, z=0)) + root_ids_t0 = list(rr.keys()) + assert len(root_ids_t0) == 1 + + child_ids = [] + for root_id in root_ids_t0: + child_ids.extend([cg.get_subgraph([root_id], leaves_only=True)]) + + new_roots = cg.remove_edges( + "Jane Doe", + source_ids=to_label(cg, 1, 0, 0, 0, 0), + sink_ids=to_label(cg, 1, 0, 0, 0, 1), + mincut=False, + ).new_root_ids + + assert len(new_roots) == 2 + assert cg.get_root(to_label(cg, 1, 1, 1, 0, 0)) == cg.get_root( + to_label(cg, 1, 0, 1, 0, 0) + ) + assert cg.get_root(to_label(cg, 1, 0, 0, 0, 0)) == cg.get_root( + to_label(cg, 1, 0, 0, 0, 0) + ) diff --git a/pychunkedgraph/utils/general.py b/pychunkedgraph/utils/general.py index 719473c6f..ac4929660 100644 --- a/pychunkedgraph/utils/general.py +++ b/pychunkedgraph/utils/general.py @@ -1,7 +1,9 @@ """ generic helper funtions """ + from typing import Sequence +from itertools import islice import numpy as np @@ -24,11 +26,15 @@ def reverse_dictionary(dictionary): def chunked(l: Sequence, n: int): - """Yield successive n-sized chunks from l.""" + """ + Yield successive n-sized chunks from l. + NOTE: Use itertools.batched from python 3.12 + """ if n < 1: n = len(l) - for i in range(0, len(l), n): - yield l[i : i + n] + it = iter(l) + while batch := tuple(islice(it, n)): + yield batch def in2d(arr1: np.ndarray, arr2: np.ndarray) -> np.ndarray: diff --git a/pychunkedgraph/utils/redis.py b/pychunkedgraph/utils/redis.py index 420a849f1..fa43c867a 100644 --- a/pychunkedgraph/utils/redis.py +++ b/pychunkedgraph/utils/redis.py @@ -19,8 +19,8 @@ REDIS_PASSWORD = os.environ.get("REDIS_PASSWORD", "") REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/0" -keys_fields = ("INGESTION_MANAGER",) -keys_defaults = ("pcg:imanager",) +keys_fields = ("INGESTION_MANAGER", "JOB_TYPE") +keys_defaults = ("pcg:imanager", "pcg:job_type") Keys = namedtuple("keys", keys_fields, defaults=keys_defaults) keys = Keys() diff --git a/requirements.in b/requirements.in index 63e0b3472..1ec536a5c 100644 --- a/requirements.in +++ b/requirements.in @@ -15,6 +15,7 @@ rq<2 pyyaml cachetools werkzeug +tensorstore # PyPI only: cloud-files>=4.21.1 diff --git a/requirements.txt b/requirements.txt index 5a2f18adc..059b8fd91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -192,6 +192,8 @@ messagingclient==0.1.3 # via -r requirements.in middle-auth-client==3.16.1 # via -r requirements.in +ml-dtypes==0.3.2 + # via tensorstore multiprocess==0.70.15 # via pathos multiwrapper==0.1.1 @@ -210,11 +212,13 @@ numpy==1.26.0 # fastremap # fpzip # messagingclient + # ml-dtypes # multiwrapper # pandas # pyspng-seunglab # simplejpeg # task-queue + # tensorstore # zfpc # zmesh orderedmultidict==1.0.1 @@ -337,6 +341,8 @@ tenacity==8.2.3 # cloud-files # cloud-volume # task-queue +tensorstore==0.1.53 + # via -r requirements.in tqdm==4.66.1 # via # cloud-files diff --git a/tracker.py b/tracker.py deleted file mode 100644 index d2ae63cb3..000000000 --- a/tracker.py +++ /dev/null @@ -1,22 +0,0 @@ -import sys -from rq import Connection, Worker - -# Preload libraries 
from pychunkedgraph.ingest.cluster -from typing import Sequence, Tuple - -import numpy as np - -from pychunkedgraph.ingest.utils import chunk_id_str -from pychunkedgraph.ingest.manager import IngestionManager -from pychunkedgraph.ingest.common import get_atomic_chunk_data -from pychunkedgraph.ingest.ran_agglomeration import get_active_edges -from pychunkedgraph.ingest.create.atomic_layer import add_atomic_edges -from pychunkedgraph.ingest.create.abstract_layers import add_layer -from pychunkedgraph.graph.meta import ChunkedGraphMeta -from pychunkedgraph.graph.chunks.hierarchy import get_children_chunk_coords -from pychunkedgraph.utils.redis import keys as r_keys -from pychunkedgraph.utils.redis import get_redis_connection - -qs = sys.argv[1:] -w = Worker(qs, connection=get_redis_connection()) -w.work() \ No newline at end of file diff --git a/workers/mesh_worker.py b/workers/mesh_worker.py index 238bad7a9..52864a89b 100644 --- a/workers/mesh_worker.py +++ b/workers/mesh_worker.py @@ -37,9 +37,12 @@ def callback(payload): ) try: - mesh_dir = cg.meta.dataset_info["mesh"] - mesh_meta = cg.meta.dataset_info["mesh_metadata"] - cv_unsharded_mesh_dir = mesh_meta.get("unsharded_mesh_dir", "dynamic") + mesh_meta = cg.meta.custom_data["mesh"] + mesh_dir = mesh_meta["dir"] + layer = mesh_meta["max_layer"] + mip = mesh_meta["mip"] + err = mesh_meta["max_error"] + cv_unsharded_mesh_dir = mesh_meta.get("dynamic_mesh_dir", "dynamic") except KeyError: logging.warning(f"No metadata found for {cg.graph_id}; ignoring...") return @@ -48,15 +51,6 @@ def callback(payload): cg.meta.data_source.WATERSHED, mesh_dir, cv_unsharded_mesh_dir ) - try: - mesh_data = cg.meta.custom_data["mesh"] - layer = mesh_data["max_layer"] - mip = mesh_data["mip"] - err = mesh_data["max_error"] - except KeyError: - return - - logging.log(INFO_HIGH, f"remeshing {l2ids}; graph {table_id} operation {op_id}.") meshgen.remeshing( cg,