Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds support for jaccard_coefficient #62

Merged
merged 12 commits into from
Jan 30, 2025
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ Below is the list of algorithms that are currently supported in nx-cugraph.
│ └─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.hits_alg.hits.html#networkx.algorithms.link_analysis.hits_alg.hits">hits</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/link_analysis.html#module-networkx.algorithms.link_analysis.pagerank_alg">pagerank_alg</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank">pagerank</a>
<a href="https://networkx.org/documentation/stable/reference/algorithms/link_prediction.html#module-networkx.algorithms.link_prediction">link_prediction</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.link_prediction.jaccard_coefficient.html#networkx.algorithms.link_prediction.jaccard_coefficient">jaccard_coefficient</a>
<a href="https://networkx.org/documentation/stable/reference/algorithms/operators.html">operators</a>
└─ <a href="https://networkx.org/documentation/stable/reference/algorithms/operators.html#module-networkx.algorithms.operators.unary">unary</a>
├─ <a href="https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.operators.unary.complement.html#networkx.algorithms.operators.unary.complement">complement</a>
Expand Down
3 changes: 2 additions & 1 deletion _nx_cugraph/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -110,6 +110,7 @@
"is_tree",
"is_weakly_connected",
"isolates",
"jaccard_coefficient",
"k_truss",
"karate_club_graph",
"katz_centrality",
Expand Down
35 changes: 34 additions & 1 deletion benchmarks/pytest-based/bench_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,11 @@ def get_graph_obj_for_benchmark(graph_obj, backend_wrapper):
"""
G = graph_obj
if backend_wrapper.backend_name == "cugraph-preconverted":
G = nxcg.from_networkx(G, preserve_all_attrs=True)
G = nxcg.from_networkx(
G,
preserve_all_attrs=True,
use_compat_graph=True,
)
return G


Expand Down Expand Up @@ -898,6 +902,35 @@ def bench_bipartite_BC_n1000_m3000_k100000(benchmark, backend_wrapper):
assert type(result) is dict


def bench_jaccard(benchmark, graph_obj, backend_wrapper):
    """Benchmark ``nx.jaccard_coefficient`` on a bounded set of node pairs.

    Running Jaccard over every non-edge is impractical for large graphs, so
    the run is limited to an explicit ``ebunch``: each of the first three
    nodes paired with every node that follows it in ``G.nodes`` order.
    Graphs with fewer than three nodes simply produce fewer pairs instead of
    raising ``IndexError``.
    """
    G = get_graph_obj_for_benchmark(graph_obj, backend_wrapper)

    # ebunch is a list of node pairs to limit the jaccard run.
    nodes = list(G.nodes)
    ebunch = [
        (start, n)
        for i, start in enumerate(nodes[:3])
        for n in nodes[i + 1 :]
    ]

    # DiGraphs are not supported
    if G.is_directed():
        G = G.to_undirected()

    # NOTE: rounds, iterations and warmup_rounds are free names resolved at
    # module level (defined outside this function).
    result = benchmark.pedantic(
        target=backend_wrapper(nx.jaccard_coefficient, force_unlazy_eval=True),
        args=(G,),
        kwargs=dict(
            ebunch=ebunch,
        ),
        rounds=rounds,
        iterations=iterations,
        warmup_rounds=warmup_rounds,
    )
    assert type(result) is list


@pytest.mark.skip(reason="benchmark not implemented")
def bench_complete_bipartite_graph(benchmark, graph_obj, backend_wrapper):
    # Placeholder: keeps the benchmark name reserved while pytest skips it
    # unconditionally; the body is intentionally empty until implemented.
    pass
Expand Down
4 changes: 3 additions & 1 deletion nx_cugraph/algorithms/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -17,6 +17,7 @@
community,
components,
link_analysis,
link_prediction,
operators,
shortest_paths,
traversal,
Expand All @@ -30,6 +31,7 @@
from .dag import *
from .isolate import *
from .link_analysis import *
from .link_prediction import *
from .operators import *
from .reciprocity import *
from .shortest_paths import *
Expand Down
80 changes: 80 additions & 0 deletions nx_cugraph/algorithms/link_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import cupy as cp
import networkx as nx
import pylibcugraph as plc

from nx_cugraph.convert import _to_undirected_graph
from nx_cugraph.utils import index_dtype, networkx_algorithm, not_implemented_for

__all__ = [
"jaccard_coefficient",
]


@not_implemented_for("directed")
@not_implemented_for("multigraph")
@networkx_algorithm(version_added="25.02", _plc="jaccard_coefficients")
def jaccard_coefficient(G, ebunch=None):
    """Compute Jaccard coefficients for node pairs using pylibcugraph.

    Parameters
    ----------
    G : graph
        Undirected, non-multigraph input; converted via
        ``_to_undirected_graph`` before use.
    ebunch : iterable of (u, v) node pairs, optional
        Pairs to score. If None, every non-adjacent pair of distinct nodes
        is scored. An empty ``ebunch`` yields an empty result.

    Returns
    -------
    iterator of (u, v, p) tuples
        ``p`` is the Jaccard coefficient reported for the pair ``(u, v)``.

    Raises
    ------
    nx.NodeNotFound
        If ``ebunch`` references a node that is not in ``G``.
    """
    G = _to_undirected_graph(G)

    # If ebunch is not specified, create pairs representing all non-edges.
    # This can be an extremely large set and is not realistic for large graphs,
    # but this is required for NX compatibility.
    if ebunch is None:
        # cp.tri marks the lower triangle (diagonal included) as True, so
        # after also marking existing edges, ~A keeps only strictly
        # upper-triangle non-edges: each candidate pair once, no self-pairs.
        # NOTE(review): assumes src/dst_indices hold both directions of each
        # undirected edge -- confirm against the graph class.
        A = cp.tri(G._N, G._N, dtype=bool)
        A[G.src_indices, G.dst_indices] = True
        u_indices, v_indices = cp.nonzero(~A)
        if u_indices.size == 0:
            return iter([])
        u_indices = u_indices.astype(index_dtype)
        v_indices = v_indices.astype(index_dtype)

    else:
        # Materialize first: ebunch may be a one-shot iterator, and an empty
        # ebunch must give an empty result rather than failing to unpack
        # zip() into (u, v).
        pairs = list(ebunch)
        if not pairs:
            return iter([])
        (u, v) = zip(*pairs)
        try:
            # Convert the ebunch lists to cupy arrays for passing to PLC, possibly
            # mapping to integers if the Graph was renumbered.
            # Allow the Graph renumber lookup (if renumbering was done) to check
            # for invalid node IDs in ebunch.
            u_indices = G._list_to_nodearray(u)
            v_indices = G._list_to_nodearray(v)
        except KeyError as err:
            raise nx.NodeNotFound(f"Node {err} not in G.") from err

        # If G was not renumbered, then the ebunch nodes must be explicitly
        # checked. If not done, plc.jaccard_coefficients() will accept node IDs
        # not in the graph and return a coefficient of 0 for them, which is not
        # compatible with NX.
        # G is always the converted graph at this point, so key_to_id can be
        # read directly without a hasattr guard.
        if G.key_to_id is None and (
            (n := u_indices.max()) >= G._N
            or (n := v_indices.max()) >= G._N
            or (n := u_indices.min()) < 0
            or (n := v_indices.min()) < 0
        ):
            raise nx.NodeNotFound(f"Node {n} not in G.")

    (u, v, p) = plc.jaccard_coefficients(
        resource_handle=plc.ResourceHandle(),
        graph=G._get_plc_graph(),
        first=u_indices,
        second=v_indices,
        use_weight=False,
        do_expensive_check=False,
    )

    # Map the device arrays back to Python-level node keys and floats.
    u = G._nodearray_to_list(u)
    v = G._nodearray_to_list(v)
    p = p.tolist()

    return zip(u, v, p)
7 changes: 6 additions & 1 deletion nx_cugraph/classes/graph.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2023-2024, NVIDIA CORPORATION.
# Copyright (c) 2023-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand Down Expand Up @@ -1009,6 +1009,11 @@ def _get_plc_graph(
dst_indices = self.dst_indices
if switch_indices:
src_indices, dst_indices = dst_indices, src_indices

# FIXME: the SGGraph constructor arg "symmetrize" will perform all
# symmetrization steps required by libcugraph. The edge_array check
# should be kept, but all other code in this `if` block should be
# removed if possible.
if symmetrize is not None:
Comment on lines +1013 to +1016
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

symmetrize= was added in 24.10 here, rapidsai/cugraph#4649, so I think it makes a lot of sense to investigate using it and removing some code. Note that symmetrize here can be "union" and "intersection", but I think PLC only does "union", so we'd still need virtually all the code here. Perhaps we could use _get_int_dtype to determine what dtype we should cast to to make this more efficient. I'm also curious how the performance of this code compares to symmetrizing in PLC.

if edge_array is not None:
raise NotImplementedError(
Expand Down
31 changes: 31 additions & 0 deletions nx_cugraph/tests/test_link_prediction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Copyright (c) 2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections.abc import Iterable

import networkx as nx
import pytest

# The tests in this file cover use cases unique to nx-cugraph. If the coverage
# here is not unique to nx-cugraph, consider moving those tests to the NetworkX
# project.


def test_no_nonexistent_edges_no_ebunch():
    """A complete graph with no ebunch has no candidate pairs.

    Every pair of nodes is already connected, so the result must be an
    empty iterator (iter([]) or equivalent): the first ``next`` call raises
    StopIteration.
    """
    complete = nx.complete_graph(5)
    coefficients = nx.jaccard_coefficient(complete)
    assert isinstance(coefficients, Iterable)
    with pytest.raises(StopIteration):
        next(coefficients)
Loading