Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Greedy modularity optimization community detection algorithm #314

Open
wants to merge 21 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
c752c02
add greedy modularity
olegfafurin Nov 15, 2023
322cf70
fix algo
olegfafurin Nov 21, 2023
fee65af
remove debug output
olegfafurin Nov 21, 2023
f7c1652
add karate club test
olegfafurin Nov 21, 2023
d2d92e8
add weighted and type-generic modularity optimization
olegfafurin Nov 28, 2023
58fcc8c
slow algo: bugfixes, performance optimization, tests with SBM
olegfafurin Dec 6, 2023
40a381b
fix and add empty graph case
olegfafurin Dec 6, 2023
5412d41
remove history of modularity opt steps + add sparse matrix
olegfafurin Dec 13, 2023
ee65d63
Rename and export
gdalle Dec 13, 2023
6200ecf
Revert "Rename and export"
olegfafurin Dec 26, 2023
4fe505d
Revert "remove history of modularity opt steps + add sparse matrix"
olegfafurin Dec 26, 2023
bb85ea6
fix import and random test for old algo
olegfafurin Dec 28, 2023
353c6a4
buggy fast modularity implementation with PriorityQueue
olegfafurin Jan 3, 2024
4615d67
return history of modularity along with best community partitioning
olegfafurin Jan 4, 2024
cf80259
Revert "return history of modularity along with best community partit…
olegfafurin Jan 4, 2024
888ae41
return history of modularity along with best partitioning
olegfafurin Jan 4, 2024
019612a
fix edge case with no edges between communities left
olegfafurin Jan 4, 2024
0515202
add fast algorithm working (mind the precision)
olegfafurin Jan 22, 2024
e777f07
Fix type instability
gdalle Jan 22, 2024
8d42973
Merge branch 'greedy-modularity-fast' into greedy-modularity
olegfafurin Jan 22, 2024
ac77ee5
export fast algo
olegfafurin Jan 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/Graphs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ using DataStructures:
union!,
find_root!,
BinaryMaxHeap,
BinaryMinHeap
BinaryMinHeap,
DefaultDict
using LinearAlgebra: I, Symmetric, diagm, eigen, eigvals, norm, rmul!, tril, triu
import LinearAlgebra: Diagonal, issymmetric, mul!
using Random:
Expand Down Expand Up @@ -307,6 +308,8 @@ export

# community
modularity,
community_detection_greedy_modularity,
community_detection_greedy_modularity_fast,
core_periphery_deg,
local_clustering,
local_clustering_coefficient,
Expand Down Expand Up @@ -518,6 +521,8 @@ include("centrality/eigenvector.jl")
include("centrality/radiality.jl")
include("community/modularity.jl")
include("community/label_propagation.jl")
include("community/greedy_modularity.jl")
include("community/greedy_modularity_fast.jl")
include("community/core-periphery.jl")
include("community/clustering.jl")
include("community/cliques.jl")
Expand Down
106 changes: 106 additions & 0 deletions src/community/greedy_modularity.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
function community_detection_greedy_modularity(g::AbstractGraph; weights::AbstractMatrix=weights(g))
if is_directed(g)
throw(ArgumentError("The graph must not be directed"))
end
n = nv(g)
c = Vector{Int}(1:n)
cs = Vector{Vector{Int}}(undef, n)
olegfafurin marked this conversation as resolved.
Show resolved Hide resolved
T = float(eltype(weights))
qs = Vector{T}(undef, n)
olegfafurin marked this conversation as resolved.
Show resolved Hide resolved
Q, e, a = compute_modularity(g, c, weights)
m = sum(a)
cs[1] = copy(c)
qs[1] = Q
for i in 2:n
Q = modularity_greedy_step!(g, Q, e, a, c, m)
cs[i] = copy(c)
qs[i] = Q
end
imax = argmax(qs)
return rewrite_class_ids(cs[imax]), qs
end

function modularity_greedy_step!(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Test that this loop does not allocate

g::AbstractGraph,
Q::T,
e::AbstractMatrix{T},
a::AbstractVector{T},
c::AbstractVector{<:Integer},
m::T
) where {T}
n = nv(g)
dq_max::typeof(Q) = typemin(Q)
to_merge::Tuple{Int,Int} = (0, 0)
olegfafurin marked this conversation as resolved.
Show resolved Hide resolved
for edge in edges(g)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

investigate the case of self-loops

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Self-loops indeed appear in modularity computation (in a correct way)
They do participate in modularity optimization step and they can also impact which merge is optimal. Yet we never merge a cluster with itself as we check that ends of an edge belong to different clusters at traversal stage.

u, v = src(edge), dst(edge)
if c[u] != c[v]
dq = (e[c[u], c[v]] / m - a[c[u]] * a[c[v]] / m^2)
if dq > dq_max
dq_max = dq
to_merge = (c[u], c[v])
end
end
end
if to_merge == (0,0)
for i in vertices(g)
if c[i] != c[1]
to_merge = (c[1], c[i])
break
end
end
end
c1, c2 = to_merge
for i in 1:n
e[c1, i] += e[c2, i]
end
for i in 1:n
if i == c2
continue
end
e[i, c1] += e[i, c2]
end
a[c1] = a[c1] + a[c2]
for i in 1:n
if c[i] == c2
c[i] = c1
end
end
return Q + 2 * dq_max
end

function compute_modularity(g::AbstractGraph, c::AbstractVector{<:Integer}, w::AbstractArray)
modularity_type = float(eltype(w))
Q = zero(modularity_type)
m = sum(w[src(e), dst(e)] for e in edges(g); init=Q) * 2
n_groups = maximum(c)
a = zeros(modularity_type, n_groups)
e = zeros(modularity_type, n_groups, n_groups)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we try a dict or sparse matrix here?

m == 0 && return 0.0, e, a
for u in vertices(g)
for v in neighbors(g, u)
if c[u] == c[v]
Q += w[u, v]
end
e[c[u], c[v]] += w[u, v]
a[c[u]] += w[u, v]
end
end
Q *= m
for i in 1:n_groups
Q -= a[i]^2
end
Q /= m^2
return Q, e, a
end

function rewrite_class_ids(v::AbstractVector{<:Integer})
d = Dict{Int, Int}()
vn = zeros(Int64, length(v))
for i in eachindex(v)
if !(v[i] in keys(d))
d[v[i]] = length(d) + 1
end
vn[i] = d[v[i]]
end
return vn
end
127 changes: 127 additions & 0 deletions src/community/greedy_modularity_fast.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
function community_detection_greedy_modularity_fast(g::AbstractGraph; weights::AbstractMatrix=weights(g))
if is_directed(g)
throw(ArgumentError("The graph must not be directed"))
end
n = nv(g)
c = Vector{Int}(1:n)
dq_dict, dq_heap, dq_global_heap, a, m = compute_dq(g, c, weights)
modularity_type = float(eltype(weights))
empty_row_heap = PriorityQueue{Tuple{Int, Int}, Tuple{modularity_type, Tuple{Int, Int}}}(Base.Order.Reverse) # placeholder, lasts empty forever
while length(dq_global_heap) > 1
(u,v), (dq, _) = dequeue_pair!(dq_global_heap)
if dq <= zero(modularity_type)
return rewrite_class_ids(c)
end
dequeue!(dq_heap[u])
if !isempty(dq_heap[u])
enqueue!(dq_global_heap, peek(dq_heap[u]))
end
if peek(dq_heap[v])[1] == (v,u)
dequeue!(dq_heap[v])
delete!(dq_global_heap, (v,u))
if !isempty(dq_heap[v])
enqueue!(dq_global_heap, peek(dq_heap[v]))
end
else
delete!(dq_heap[v], (v,u))
end

c[c .== u] .= v

neighbors_u = setdiff(keys(dq_dict[u]), v)
neighbors_v = setdiff(keys(dq_dict[v]), u)
neighbors_all = union(neighbors_u, neighbors_v)
neighbors_common = intersect(neighbors_u, neighbors_v)

for w in neighbors_all
if w in neighbors_common
dq_w = dq_dict[v][w] + dq_dict[u][w]
elseif w in neighbors_v
dq_w = dq_dict[v][w] - a[u] * a[w] / m^2
else
dq_w = dq_dict[u][w] - a[v] * a[w] / m^2
end
for (row, column) in ((v, w), (w, v))
dq_heap_row = dq_heap[row]
dq_dict[row][column] = dq_w
if !isempty(dq_heap_row)
oldmax = peek(dq_heap_row)
else
oldmax = nothing
end
dq_heap_row[(row,column)] = (dq_w, (-row, -column)) # update or insert
if isnothing(oldmax)
dq_global_heap[(row, column)] = (dq_w, (-row, -column))
else
newmax = peek(dq_heap_row)
if newmax != oldmax
delete!(dq_global_heap, oldmax[1]) ## is it still there?
enqueue!(dq_global_heap, newmax)
end
end
end
end

for (w, _) in dq_dict[u]
delete!(dq_dict[w], u)
if w != v
for (row, column) in ((w,u), (u,w))
dq_heap_row = dq_heap[row]
if peek(dq_heap_row)[1] == (row, column)
dequeue!(dq_heap_row)
delete!(dq_global_heap, (row, column))
if !isempty(dq_heap_row)
enqueue!(dq_global_heap, peek(dq_heap_row))
end
else
delete!(dq_heap_row, (row, column))
end
end
end
end
delete!(dq_dict, u)
dq_heap[u] = empty_row_heap
a[v] += a[u]
a[u] = 0
end
return rewrite_class_ids(c)
end

function compute_dq(
g::AbstractGraph, c::AbstractVector{<:Integer}, w::AbstractArray
)
modularity_type = float(eltype(w))
Q_zero = zero(modularity_type)
m = sum(w[src(e), dst(e)] for e in edges(g); init=Q_zero) * 2
n_groups = maximum(c)
a = zeros(modularity_type, n_groups)

typical_dict = DefaultDict{Int, modularity_type}(Q_zero)
dq_dict = Dict{Int,typeof(typical_dict)}()
for v in vertices(g)
dq_dict[v] = DefaultDict{Int, modularity_type}(Q_zero)
end

for u in vertices(g)
for v in neighbors(g, u)
dq_dict[u][v] += w[u,v]
a[c[u]] += w[u, v]
end
end

for (u, dct) in dq_dict
for (v, w) in dct
dq_dict[u][v] = w / m - a[c[u]] * a[c[v]] / m^2
end
end

typical_queue = PriorityQueue{Tuple{Int, Int}, Tuple{modularity_type, Tuple{Int, Int}}}(Base.Order.Reverse)
dq_heap = Dict{Int,typeof(typical_queue)}()
for u in vertices(g)
dq_heap[u] = PriorityQueue{Tuple{Int, Int}, Tuple{modularity_type, Tuple{Int, Int}}}(Base.Order.Reverse, (u, v) => (dq, (-u, -v)) for (v, dq) in dq_dict[u])
end

v_connected = filter(v -> !isempty(dq_heap[v]), vertices(g))
global_heap = PriorityQueue{Tuple{Int, Int}, Tuple{modularity_type, Tuple{Int, Int}}}(Base.Order.Reverse, peek(dq_heap[v]) for v in v_connected)
return dq_dict, dq_heap, global_heap, a, m
end
114 changes: 114 additions & 0 deletions test/community/greedy_modularity.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
@testset "Greedy modularity: karate club" begin
g = smallgraph(:karate)

olegfafurin marked this conversation as resolved.
Show resolved Hide resolved
expected_c_w = [1, 2, 2, 2, 1, 1, 1, 2, 3, 2, 1, 1, 2, 2, 3, 3, 1, 2, 3, 1, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
expected_q_w = 0.3806706114398422

c_w, _ = community_detection_greedy_modularity(g)

@test c_w == expected_c_w

@test modularity(g, c_w) ≈ expected_q_w
end

@testset "Greedy modularity: weighted karate club" begin
g = smallgraph(:karate)
w = [
0 4 5 3 3 3 3 2 2 0 2 3 1 3 0 0 0 2 0 2 0 2 0 0 0 0 0 0 0 0 0 2 0 0;
4 0 6 3 0 0 0 4 0 0 0 0 0 5 0 0 0 1 0 2 0 2 0 0 0 0 0 0 0 0 2 0 0 0;
5 6 0 3 0 0 0 4 5 1 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2 0 0 0 2 0;
3 3 3 0 0 0 0 3 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
3 0 0 0 0 0 2 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
3 0 0 0 0 0 5 0 0 0 3 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
3 0 0 0 2 5 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
2 4 4 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
2 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 3 4;
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2;
2 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
1 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
3 5 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 2;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 4;
0 0 0 0 0 3 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2;
2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 1;
2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 3;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 0 4 0 3 0 0 5 4;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 3 0 0 0 2 0 0;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 2 0 0 0 0 0 0 7 0 0;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 2;
0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 3 0 0 0 0 0 0 0 0 4;
0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2;
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 4 0 0 0 0 0 4 2;
0 2 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3;
2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 7 0 0 2 0 0 0 4 4;
0 0 2 0 0 0 0 0 3 0 0 0 0 0 3 3 0 0 1 0 3 0 2 5 0 0 0 0 0 4 3 4 0 5;
0 0 0 0 0 0 0 0 4 2 0 0 0 3 2 4 0 0 2 1 1 0 3 4 0 0 2 4 2 2 3 4 5 0
]
expected_c_w = [1, 1, 1, 1, 2, 2, 2, 1, 3, 3, 2, 1, 1, 1, 3, 3, 2, 1, 3, 1, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
expected_q_w = 0.4345214669889994

c_w, _ = community_detection_greedy_modularity(g, weights=w)
@test c_w == expected_c_w
@test modularity(g,c_w, distmx=w) ≈ expected_q_w
end


@testset "Greedy modularity: disconnected graph" begin
olegfafurin marked this conversation as resolved.
Show resolved Hide resolved
g = SimpleGraph(10)
for i=1:5
add_edge!(g, 2*i - 1, 2*i)
end
c, _ = community_detection_greedy_modularity(g)
q = modularity(g, c)

expected_c = [1,1,2,2,3,3,4,4,5,5]
expected_q = 0.8

@test c == expected_c
@test q ≈ expected_q
end


@testset "Greedy modularity: complete graph" begin
g = complete_graph(10)
c, _ = community_detection_greedy_modularity(g)
q = modularity(g, c)

expected_c = ones(Int, 10)
expected_q = 0.0

@test c == expected_c
@test q ≈ expected_q
end


@testset "Greedy modularity: empty graph" begin
g = SimpleGraph(10)
c, _ = community_detection_greedy_modularity(g)
q = modularity(g, c)

expected_c = Vector(1:10)
expected_q = 0.0

@test c == expected_c
@test q ≈ expected_q
end


@testset "Greedy modularity: random stochastic block model graph" begin
g_sbm = stochastic_block_model(99,1,[500,1000])
expected_c = [i > 500 ? 2 : 1 for i=1:1500]

c, _ = community_detection_greedy_modularity(g_sbm)

matches1 = sum(c .== expected_c)
matches2 = sum((3 .- c) .== expected_c) # complementary cluster numbers assignment

@test matches1 ≥ 0.95 * nv(g_sbm) || matches2 ≥ 0.95 * nv(g_sbm)
end