COMP5313 Assignment 2 - Task 2 题解

Assignment 2 - Task 2:Community Detection

题目理解

Task 2 要求在给定网络中识别社区结构,分析网络的模块化程度,并比较不同社区检测算法的性能。

核心目标: 1. 实现或应用社区检测算法(Girvan-Newman、Louvain 等) 2. 计算 Modularity 评估社区质量 3. 可视化社区划分结果 4. 分析不同参数对检测结果的影响


关键概念回顾

Modularity (Q)

Modularity 衡量社区划分的好坏,定义为:

\[Q = \frac{1}{2m} \sum_{ij} \left[ A_{ij} - \frac{k_i k_j}{2m} \right] \delta(c_i, c_j)\]

其中: - \(m\) = 网络总边数 - \(A_{ij}\) = 邻接矩阵元素 - \(k_i, k_j\) = 节点 \(i, j\) 的度 - \(\delta(c_i, c_j)\) = 1 如果 \(i, j\) 在同一社区,否则为 0

Interpretation: - \(Q > 0.3\):显著的社区结构 - \(Q > 0.7\):很强的社区结构 - \(Q \approx 0\):没有明显的社区结构(接近随机网络)

Girvan-Newman Algorithm

核心思想: 逐步移除边介数(betweenness)最高的边,直到网络分裂成社区。

算法步骤: 1. 计算所有边的 betweenness centrality 2. 移除 betweenness 最高的边 3. 重新计算受影响边的 betweenness 4. 重复直到满足停止条件 5. 在 dendrogram 上选择使 Modularity 最大的划分

时间复杂度: \(O(m^2 n)\),适合中等规模网络。

Louvain Algorithm

核心思想: 贪心优化 Modularity,两阶段迭代:

Phase 1: 局部移动 - 每个节点初始为独立社区 - 对每个节点,尝试移动到邻居社区,选择使 ΔQ 最大的移动 - 重复直到无法改进

Phase 2: 网络聚合 - 将每个社区压缩为超级节点 - 重新计算边权重 - 重复 Phase 1

时间复杂度: \(O(n \log n)\),适合大规模网络。


Python 实现

基础设置与数据加载

import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict, deque
import community as community_louvain # python-louvain package

# 加载网络(根据实际数据格式调整)
G = nx.read_edgelist('task2_network.txt', nodetype=int)
print(f"Network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

Girvan-Newman 实现

def edge_betweenness(G):
"""
计算所有边的 betweenness(基于 Brandes 算法)
"""
betweenness = defaultdict(float)
nodes = list(G.nodes())

for s in nodes:
# BFS 计算最短路径
S = []
P = {w: [] for w in nodes}
sigma = dict.fromkeys(nodes, 0.0)
sigma[s] = 1.0
d = dict.fromkeys(nodes, -1)
d[s] = 0
Q = deque([s])

while Q:
v = Q.popleft()
S.append(v)
for w in G.neighbors(v):
if d[w] < 0:
Q.append(w)
d[w] = d[v] + 1
if d[w] == d[v] + 1:
sigma[w] += sigma[v]
P[w].append(v)

# 累积依赖度
delta = dict.fromkeys(nodes, 0.0)
while S:
w = S.pop()
for v in P[w]:
c = (sigma[v] / sigma[w]) * (1.0 + delta[w])
if (v, w) in betweenness:
betweenness[(v, w)] += c
else:
betweenness[(w, v)] += c
delta[v] += c

# 无向图需要除以 2
for edge in betweenness:
betweenness[edge] /= 2.0

return betweenness


def girvan_newman(G, max_iter=None):
"""
Girvan-Newman 算法主流程
返回:[(partition, Q, removed_edges), ...]
"""
G_copy = G.copy()
history = []
removed_edges = []

# 初始状态:所有节点在一个社区
initial_partition = [{n for n in G_copy.nodes()}]
Q_initial = modularity(G, initial_partition)
history.append((initial_partition, Q_initial, []))

iter_count = 0
while G_copy.number_of_edges() > 0:
if max_iter and iter_count >= max_iter:
break

# 计算并移除 betweenness 最高的边
betweenness = edge_betweenness(G_copy)
if not betweenness:
break

max_edge = max(betweenness.items(), key=lambda x: x[1])[0]
G_copy.remove_edge(*max_edge)
removed_edges.append(max_edge)

# 获取连通分量作为社区划分
partition = list(nx.connected_components(G_copy))
Q = modularity(G, partition)
history.append((partition, Q, removed_edges.copy()))

iter_count += 1
if iter_count % 10 == 0:
print(f"Iteration {iter_count}: {len(partition)} communities, Q={Q:.4f}")

return history


def modularity(G, partition):
"""
计算给定社区划分的 Modularity
"""
m = G.number_of_edges()
if m == 0:
return 0.0

Q = 0.0
node_community = {}
for comm_id, community in enumerate(partition):
for node in community:
node_community[node] = comm_id

for i in G.nodes():
for j in G.nodes():
if node_community[i] == node_community[j]:
A_ij = 1 if G.has_edge(i, j) else 0
k_i = G.degree(i)
k_j = G.degree(j)
Q += (A_ij - k_i * k_j / (2 * m))

return Q / (2 * m)

Louvain 算法(使用库)

# 使用 python-louvain 库
import community as community_louvain

# 检测社区
partition_louvain = community_louvain.best_partition(G)

# 转换为社区列表格式
communities_louvain = defaultdict(list)
for node, comm_id in partition_louvain.items():
communities_louvain[comm_id].append(node)
partition_list = [set(nodes) for nodes in communities_louvain.values()]

# 计算 Modularity
Q_louvain = community_louvain.modularity(partition_louvain, G)
print(f"Louvain: {len(partition_list)} communities, Q={Q_louvain:.4f}")

可视化

def visualize_communities(G, partition, title="Community Structure", 
figsize=(12, 10), save_path=None):
"""
可视化社区结构
"""
plt.figure(figsize=figsize)

# 为每个社区分配颜色
colors = plt.cm.Set3(np.linspace(0, 1, len(partition)))
node_color_map = {}
for comm_id, community in enumerate(partition):
for node in community:
node_color_map[node] = colors[comm_id]

# 使用 spring layout
pos = nx.spring_layout(G, k=1/np.sqrt(G.number_of_nodes()), iterations=50, seed=42)

# 绘制边
nx.draw_networkx_edges(G, pos, alpha=0.3, edge_color='gray', width=0.8)

# 绘制节点(按社区着色)
for comm_id, community in enumerate(partition):
community_nodes = list(community)
nx.draw_networkx_nodes(G, pos, nodelist=community_nodes,
node_color=[colors[comm_id]],
node_size=100 + 50 * len(community_nodes),
alpha=0.9, label=f"Community {comm_id+1}")

# 添加标签(可选,小网络时启用)
if G.number_of_nodes() <= 50:
nx.draw_networkx_labels(G, pos, font_size=8)

plt.title(title, fontsize=16)
plt.legend(loc='best', fontsize=10)
plt.axis('off')

if save_path:
plt.savefig(save_path, dpi=300, bbox_inches='tight')
print(f"Saved to {save_path}")

plt.show()


# 绘制不同算法结果对比
fig, axes = plt.subplots(1, 2, figsize=(20, 9))

# Girvan-Newman 最佳结果
best_gn = max(history, key=lambda x: x[1])
partition_gn = best_gn[0]
Q_gn = best_gn[1]

# 绘制 GN 结果
plt.sca(axes[0])
colors_gn = plt.cm.Set3(np.linspace(0, 1, len(partition_gn)))
pos = nx.spring_layout(G, seed=42)
for comm_id, community in enumerate(partition_gn):
nx.draw_networkx_nodes(G, pos, nodelist=list(community),
node_color=[colors_gn[comm_id]],
node_size=150, alpha=0.9)
nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.8)
axes[0].set_title(f'Girvan-Newman: {len(partition_gn)} communities, Q={Q_gn:.4f}', fontsize=14)
axes[0].axis('off')

# 绘制 Louvain 结果
plt.sca(axes[1])
partition_louvain_dict = {}
for node, comm_id in partition_louvain.items():
partition_louvain_dict[node] = comm_id
colors_louvain = [plt.cm.Set3(partition_louvain_dict[node] / max(partition_louvain.values()))
for node in G.nodes()]
nx.draw_networkx_nodes(G, pos, node_color=colors_louvain, node_size=150, alpha=0.9)
nx.draw_networkx_edges(G, pos, alpha=0.2, width=0.8)
axes[1].set_title(f'Louvain: {len(set(partition_louvain.values()))} communities, Q={Q_louvain:.4f}', fontsize=14)
axes[1].axis('off')

plt.tight_layout()
plt.savefig('community_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

结果分析

Modularity 随迭代变化(Girvan-Newman)

# 绘制 Q 值变化曲线
iterations = range(len(history))
Q_values = [h[1] for h in history]
n_communities = [len(h[0]) for h in history]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(iterations, Q_values, 'b-', linewidth=2)
ax1.axvline(x=np.argmax(Q_values), color='r', linestyle='--', label=f'Best Q at iter {np.argmax(Q_values)}')
ax1.set_xlabel('Iterations (edges removed)')
ax1.set_ylabel('Modularity Q')
ax1.set_title('Modularity vs. Iterations')
ax1.legend()
ax1.grid(True, alpha=0.3)

ax2.plot(iterations, n_communities, 'g-', linewidth=2)
ax2.set_xlabel('Iterations (edges removed)')
ax2.set_ylabel('Number of Communities')
ax2.set_title('Community Count vs. Iterations')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('modularity_evolution.png', dpi=300)
plt.show()

print(f"Maximum Q = {max(Q_values):.4f} at iteration {np.argmax(Q_values)}")
print(f"Optimal number of communities: {n_communities[np.argmax(Q_values)]}")

社区统计特征

def analyze_communities(G, partition):
"""
分析社区的各种统计特征
"""
stats = []
for comm_id, community in enumerate(partition):
subgraph = G.subgraph(community)
n_nodes = len(community)
n_edges = subgraph.number_of_edges()
density = nx.density(subgraph) if n_nodes > 1 else 0
avg_degree = 2 * n_edges / n_nodes if n_nodes > 0 else 0

# 社区内部的 clustering coefficient
avg_clustering = nx.average_clustering(subgraph) if n_nodes > 2 else 0

stats.append({
'community_id': comm_id + 1,
'n_nodes': n_nodes,
'n_edges': n_edges,
'density': density,
'avg_degree': avg_degree,
'avg_clustering': avg_clustering
})

import pandas as pd
df = pd.DataFrame(stats)
print(df)
return df

# 分析最佳 GN 划分
stats_gn = analyze_communities(G, partition_gn)

# 分析 Louvain 划分
partition_louvain_sets = [set() for _ in range(max(partition_louvain.values()) + 1)]
for node, comm_id in partition_louvain.items():
partition_louvain_sets[comm_id].add(node)
partition_louvain_sets = [s for s in partition_louvain_sets if s] # 去除空集合

stats_louvain = analyze_communities(G, partition_louvain_sets)

完整运行流程

def main():
# 1. 加载数据
G = nx.read_edgelist('task2_network.txt', nodetype=int)
print(f"Loaded network: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# 2. Girvan-Newman 算法
print("\n=== Running Girvan-Newman ===")
history = girvan_newman(G, max_iter=100)
best_idx = max(range(len(history)), key=lambda i: history[i][1])
best_partition_gn, best_Q_gn, _ = history[best_idx]
print(f"Best GN: {len(best_partition_gn)} communities, Q={best_Q_gn:.4f}")

# 3. Louvain 算法
print("\n=== Running Louvain ===")
partition_louvain = community_louvain.best_partition(G)
Q_louvain = community_louvain.modularity(partition_louvain, G)
n_comm_louvain = len(set(partition_louvain.values()))
print(f"Louvain: {n_comm_louvain} communities, Q={Q_louvain:.4f}")

# 4. 可视化对比
visualize_comparison(G, best_partition_gn, partition_louvain,
best_Q_gn, Q_louvain)

# 5. 保存结果
with open('task2_results.txt', 'w') as f:
f.write(f"Girvan-Newman:\n")
f.write(f" Communities: {len(best_partition_gn)}\n")
f.write(f" Modularity Q: {best_Q_gn:.4f}\n")
for i, comm in enumerate(best_partition_gn, 1):
f.write(f" Community {i}: {sorted(comm)}\n")

f.write(f"\nLouvain:\n")
f.write(f" Communities: {n_comm_louvain}\n")
f.write(f" Modularity Q: {Q_louvain:.4f}\n")

print("\nResults saved to task2_results.txt")

if __name__ == '__main__':
main()

常见问题与优化

Q1: Girvan-Newman 太慢怎么办?

优化策略: 1. 限制迭代次数:设置 max_iter 提前停止 2. 采样计算 betweenness:对大规模网络,随机采样节点计算 3. 使用近似算法:NetworkX 的 nx.edge_betweenness_centrality()k 参数可采样 4. 换用 Louvain:对于 \(n > 1000\) 的网络,Louvain 更适合

# 使用 NetworkX 内置的近似 betweenness(更快)
betweenness = nx.edge_betweenness_centrality(G, k=min(100, G.number_of_nodes()))

Q2: 如何确定最优社区数量?

方法: - Modularity 最大化:最直接的方法 - 肘部法则:Plot Q vs. #communities,找拐点 - 分辨率参数:Louvain 可调整 resolution 参数控制社区粒度

# Louvain 不同 resolution
for gamma in [0.5, 1.0, 1.5, 2.0]:
partition = community_louvain.best_partition(G, resolution=gamma)
Q = community_louvain.modularity(partition, G)
print(f"Resolution {gamma}: {len(set(partition.values()))} communities, Q={Q:.4f}")

Q3: 社区划分不稳定怎么办?

原因: 算法随机性或网络本身模糊的结构。

解决方案: 1. 多次运行取平均:集成多个划分结果 2. 使用确定性的初始化:设置随机种子 3. 考虑重叠社区:使用 OSLOM 或 COPRA 等支持重叠社区的算法


评分要点

根据 Assignment 2 常见评分标准,确保涵盖:

评分项 权重 检查点
算法实现 30% 代码正确性、效率、可读性
Modularity 计算 20% 正确实现公式,结果合理
结果分析 25% 社区统计、比较、可视化
报告质量 15% 结构清晰、解释充分
创新/优化 10% 额外分析、算法改进

加分项: - 对比 3+ 种不同算法 - 分析 resolution parameter 的影响 - 对结果进行统计显著性检验 - 实现可视化交互(如使用 Plotly)


参考资源

  1. NetworkX 文档: https://networkx.org/documentation/stable/
  2. python-louvain: https://github.com/taynaud/python-louvain
  3. 论文: Newman, M. E. J. (2004). "Fast algorithm for detecting community structure in networks"
  4. 论文: Blondel et al. (2008). "Fast unfolding of communities in large networks"

Last Updated: 2026-04-19
Status: ✅ Complete