COMP5313 Assignment 2 - Task 1 题解

Assignment 2 - Task 1: Link Prediction

题目理解

Task 1 要求在给定网络中实现并评估多种链接预测算法,预测未来可能出现的边。

核心目标: 1. 实现多种基于节点相似度的链接预测算法 2. 在训练/测试集上评估预测性能 3. 比较不同算法的准确率、精确率、召回率等指标 4. 分析网络结构特征对预测效果的影响


关键概念回顾

链接预测的定义

给定网络 \(G = (V, E)\),链接预测问题旨在预测未来可能出现的边 \((u, v) \notin E\)

两种评估方式: 1. 时序分割:用时间 \(t\) 前的边训练,\(t\) 后的边测试 2. 随机分割:随机划分边为训练集和测试集

基于节点相似度的指标

\(\Gamma(u)\) 为节点 \(u\) 的邻居集合。

1. Common Neighbors (CN)

\[CN(u, v) = |\Gamma(u) \cap \Gamma(v)|\]

最简单的指标:共同邻居越多,越可能相连。

2. Jaccard Coefficient (JC)

\[JC(u, v) = \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u) \cup \Gamma(v)|}\]

归一化后的共同邻居,消除度的影响。

3. Adamic-Adar Index (AA)

\[AA(u, v) = \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\log k_w}\]

对高度数节点惩罚(高度数节点的信息量较少)。

4. Preferential Attachment (PA)

\[PA(u, v) = k_u \times k_v\]

度越高的节点越容易获得新连接(富者愈富)。

5. Resource Allocation (RA)

\[RA(u, v) = \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{k_w}\]

与 AA 类似但惩罚更强(\(1/k_w\) vs \(1/\log k_w\))。

评估指标

指标 定义 适用场景
AUC 随机未连接边得分 < 随机连接边得分的概率 整体排序能力
Precision@K 前 K 个预测中正确的比例 Top-K 应用
Recall 正确预测数 / 总实际新增边数 覆盖率
F1-Score 2 × Precision × Recall / (Precision + Recall) 综合性能

Python 实现

基础设置与数据加载

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from collections import Counter
import random
import time

# Fix the global RNG seeds so every run is reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


def load_network(filepath):
    """Load an undirected network from an edge-list file (one "u v" pair per line)."""
    graph = nx.read_edgelist(filepath, nodetype=int, create_using=nx.Graph())
    print(f"Loaded: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")
    return graph


def train_test_split(G, test_ratio=0.2, seed=42):
    """Randomly split the edges of G into train/test sets.

    Returns (G_train, train_edges, test_edges), where G_train is a copy of G
    with every test edge removed (nodes are kept, so isolated nodes may remain).
    """
    random.seed(seed)

    # Shuffle a snapshot of the edge list, then cut it at the test boundary.
    edges = list(G.edges())
    random.shuffle(edges)

    n_test = int(len(edges) * test_ratio)
    test_edges, train_edges = edges[:n_test], edges[n_test:]

    G_train = G.copy()
    G_train.remove_edges_from(test_edges)

    print(f"Train edges: {len(train_edges)}, Test edges: {len(test_edges)}")
    print(f"Train network: {G_train.number_of_nodes()} nodes, {G_train.number_of_edges()} edges")

    return G_train, train_edges, test_edges


def temporal_split(G, time_edges, train_ratio=0.8, timestamp_attr='timestamp'):
    """Chronological train/test split.

    time_edges: iterable of (u, v, timestamp) triples; the earliest
    `train_ratio` fraction becomes the training set.

    NOTE(review): `G` and `timestamp_attr` are currently unused — the split
    is driven entirely by `time_edges`; confirm whether that is intended.
    """
    ordered = sorted(time_edges, key=lambda edge: edge[2])
    cutoff = int(len(ordered) * train_ratio)

    train_edges = [(a, b) for a, b, _ in ordered[:cutoff]]
    test_edges = [(a, b) for a, b, _ in ordered[cutoff:]]

    # Train graph contains only nodes that appear in a training edge.
    G_train = nx.Graph()
    G_train.add_edges_from(train_edges)

    return G_train, train_edges, test_edges

链接预测指标实现

def common_neighbors(G, u, v):
    """Common Neighbors (CN): the size of Γ(u) ∩ Γ(v)."""
    shared = set(G.neighbors(u)).intersection(G.neighbors(v))
    return len(shared)


def jaccard_coefficient(G, u, v):
    """Jaccard Coefficient (JC): |Γ(u) ∩ Γ(v)| / |Γ(u) ∪ Γ(v)|, or 0 if both are isolated."""
    nbrs_u = set(G.neighbors(u))
    nbrs_v = set(G.neighbors(v))
    union_size = len(nbrs_u | nbrs_v)
    if union_size == 0:
        return 0
    return len(nbrs_u & nbrs_v) / union_size


def adamic_adar_index(G, u, v):
    """Adamic-Adar (AA): sum of 1/log(k_w) over common neighbours w with degree > 1.

    Degree-1 common neighbours are skipped because log(1) = 0 would divide by zero.
    """
    total = 0
    for w in set(G.neighbors(u)) & set(G.neighbors(v)):
        degree = G.degree(w)
        if degree > 1:
            total += 1 / np.log(degree)
    return total


def preferential_attachment(G, u, v):
    """Preferential Attachment (PA): the product of the endpoint degrees, k_u * k_v."""
    k_u, k_v = G.degree(u), G.degree(v)
    return k_u * k_v


def resource_allocation(G, u, v):
    """Resource Allocation (RA): sum of 1/k_w over common neighbours w with k_w > 0."""
    common = set(G.neighbors(u)) & set(G.neighbors(v))
    return sum(1 / G.degree(w) for w in common if G.degree(w) > 0)


def compute_all_scores(G, u, v):
    """Compute every similarity metric for the pair (u, v), keyed by method name."""
    metrics = {
        'CN': common_neighbors,
        'JC': jaccard_coefficient,
        'AA': adamic_adar_index,
        'PA': preferential_attachment,
        'RA': resource_allocation,
    }
    return {name: scorer(G, u, v) for name, scorer in metrics.items()}

生成候选边

def generate_candidates(G_train, test_edges, non_edge_sample_ratio=10):
    """Build a labelled candidate-edge set for evaluation.

    Positives (label=1): test edges whose endpoints exist in G_train.
    Negatives (label=0): randomly sampled node pairs that are edges neither
    in G_train nor in the test set (the original could re-sample a test edge
    as a negative, mislabelling it; excluded here).

    Returns a shuffled list of (u, v, label) triples.
    """
    # Treat test edges as undirected when excluding them from the negatives.
    test_edge_set = set(test_edges) | {(b, a) for a, b in test_edges}

    # Enumerate all non-edges of the training graph.
    # O(n^2) -- fine for small networks; see generate_candidates_fast otherwise.
    print("Generating non-edges...")
    nodes = list(G_train.nodes())
    all_possible = []
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            u, v = nodes[i], nodes[j]
            if not G_train.has_edge(u, v) and (u, v) not in test_edge_set:
                all_possible.append((u, v))

    # Sample negatives: at most `non_edge_sample_ratio` per test edge.
    n_neg = min(len(test_edges) * non_edge_sample_ratio, len(all_possible))
    negative_samples = random.sample(all_possible, n_neg)

    candidates = []
    for u, v in test_edges:
        if G_train.has_node(u) and G_train.has_node(v):
            candidates.append((u, v, 1))  # label=1: edge really appears
    for u, v in negative_samples:
        candidates.append((u, v, 0))  # label=0: edge does not appear

    random.shuffle(candidates)

    n_pos = sum(label for _, _, label in candidates)
    # The original f-string here had a mismatched ')' and was a SyntaxError.
    print(f"Candidates: {len(candidates)} (pos={n_pos}, neg={len(candidates) - n_pos})")

    return candidates


def compute_scores_for_candidates(G_train, candidates):
    """Score every (u, v, label) candidate with all similarity metrics.

    Returns a DataFrame with columns u, v, label, CN, JC, AA, PA, RA.
    """
    print("Computing scores for all candidates...")
    rows = []
    zero_scores = {'CN': 0, 'JC': 0, 'AA': 0, 'PA': 0, 'RA': 0}

    for idx, (u, v, label) in enumerate(candidates):
        if idx % 5000 == 0 and idx > 0:
            print(f" Progress: {idx}/{len(candidates)}")

        try:
            scores = compute_all_scores(G_train, u, v)
        except Exception:
            # e.g. isolated/missing nodes -- fall back to all-zero scores
            scores = dict(zero_scores)

        rows.append({'u': u, 'v': v, 'label': label, **scores})

    return pd.DataFrame(rows)

评估函数

def evaluate_predictor(df, score_column):
    """Evaluate one scoring method: ROC-AUC, Precision@K (K = #positives) and PR-AUC.

    Also returns the raw ROC and PR curve arrays for plotting.
    """
    y_true = df['label'].values
    y_score = df[score_column].values

    # ROC curve and its area
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    # Precision@K with K = number of positives: fraction of true edges
    # among the K highest-scoring candidate pairs.
    n_pos = sum(y_true)
    ranked = np.argsort(y_score)[::-1]  # indices sorted by descending score
    hits = sum(y_true[idx] for idx in ranked[:n_pos])
    precision_at_k = hits / n_pos

    # Precision-recall curve and its area
    prec, rec, _ = precision_recall_curve(y_true, y_score)
    pr_auc = auc(rec, prec)

    return {
        'AUC': roc_auc,
        'Precision@K': precision_at_k,
        'PR-AUC': pr_auc,
        'fpr': fpr, 'tpr': tpr,
        'precision': prec, 'recall': rec
    }


def evaluate_all(df, methods):
    """Evaluate every method in `methods`, printing a one-line summary per method."""
    results = {}
    for method in methods:
        metrics = evaluate_predictor(df, method)
        results[method] = metrics
        print(f" {method:8s}: AUC={metrics['AUC']:.4f}, "
              f"P@K={metrics['Precision@K']:.4f}, "
              f"PR-AUC={metrics['PR-AUC']:.4f}")
    return results


def plot_roc_curves(results, methods, save_path=None):
    """Plot every method's ROC curve on one set of axes; optionally save to disk."""
    palette = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']

    plt.figure(figsize=(8, 6))
    for idx, method in enumerate(methods):
        curve = results[method]
        plt.plot(curve['fpr'], curve['tpr'],
                 color=palette[idx % len(palette)],
                 label=f"{method} (AUC={curve['AUC']:.3f})",
                 linewidth=2)

    # Diagonal = performance of a random predictor
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves - Link Prediction Comparison', fontsize=14)
    plt.legend(loc='lower right', fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)
        print(f"Saved ROC curves to {save_path}")

    plt.show()


def plot_pr_curves(results, methods, save_path=None):
    """Plot every method's precision-recall curve on one set of axes."""
    palette = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']

    plt.figure(figsize=(8, 6))
    for idx, method in enumerate(methods):
        curve = results[method]
        plt.plot(curve['recall'], curve['precision'],
                 color=palette[idx % len(palette)],
                 label=f"{method} (PR-AUC={curve['PR-AUC']:.3f})",
                 linewidth=2)

    plt.xlabel('Recall', fontsize=12)
    plt.ylabel('Precision', fontsize=12)
    plt.title('Precision-Recall Curves - Link Prediction', fontsize=14)
    plt.legend(loc='upper right', fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)
        print(f"Saved PR curves to {save_path}")

    plt.show()


def plot_score_distribution(df, method, save_path=None):
    """Histogram + boxplot of one method's scores, split by positive/negative label."""
    pos_scores = df.loc[df['label'] == 1, method].values
    neg_scores = df.loc[df['label'] == 0, method].values

    plt.figure(figsize=(10, 5))

    # Left panel: overlaid histograms of the two classes
    plt.subplot(1, 2, 1)
    plt.hist(pos_scores, bins=50, alpha=0.7, label='Positive (exist)', color='#2ecc71')
    plt.hist(neg_scores, bins=50, alpha=0.7, label='Negative (non-exist)', color='#e74c3c')
    plt.xlabel(f'{method} Score', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title(f'{method}: Score Distribution', fontsize=14)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right panel: side-by-side boxplots
    plt.subplot(1, 2, 2)
    boxes = plt.boxplot([pos_scores, neg_scores],
                        labels=['Positive', 'Negative'], patch_artist=True)
    boxes['boxes'][0].set_facecolor('#2ecc71')
    boxes['boxes'][1].set_facecolor('#e74c3c')
    plt.ylabel(f'{method} Score', fontsize=12)
    plt.title(f'{method}: Score Boxplot', fontsize=14)
    plt.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)

    plt.show()

Katz 指标(更复杂的算法)

def katz_index(G, beta=0.01, max_iter=100, tol=1e-6):
    """Katz index matrix for the whole graph.

    Katz(u, v) = sum_{l>=1} beta^l * (#paths of length l between u and v),
    whose closed form is S = (I - beta*A)^{-1} - I.  `beta` should be below
    1/lambda_max(A) for the series to converge -- TODO confirm for the
    networks used.

    Falls back to the truncated power series sum_{l=1..max_iter} beta^l A^l
    when (I - beta*A) is singular.
    """
    A = nx.adjacency_matrix(G).todense()
    n = A.shape[0]
    I = np.eye(n)

    try:
        # Closed form: S = (I - beta*A)^{-1} - I
        return np.linalg.inv(I - beta * A) - I
    except np.linalg.LinAlgError:
        # Singular matrix: approximate with the truncated series instead.
        print("Using iterative Katz approximation...")
        S = np.zeros((n, n))
        A_power = A.copy().astype(float)  # holds A^l

        for l in range(1, max_iter + 1):
            # BUG FIX: the original advanced A_power *before* accumulating,
            # so it summed beta^l * A^(l+1) -- every power shifted and the
            # beta*A term dropped.  Accumulate the current power first.
            term = (beta ** l) * A_power
            S += term
            # Stop once the current term's contribution is negligible.
            if np.sum(np.abs(term)) < tol:
                break
            A_power = A_power @ A

        return S


def katz_score(G, u, v, beta=0.01, node_to_idx=None, idx_to_node=None, max_iter=5):
    """Approximate Katz score for a single pair via a depth-bounded BFS.

    Accumulates beta**depth each time the BFS first reaches v at that depth,
    i.e. it sums over (roughly shortest) paths rather than all walks -- an
    approximation of the true Katz sum suitable for sparse graphs.

    Fixes vs. the original:
    - `max_iter` was referenced but never defined (NameError); it is now a
      parameter with a default search depth of 5.
    - Length-1 paths were counted twice (once up front for direct neighbours
      and again by the BFS); the up-front addition is removed.

    Returns 0 when u or v is not in the graph.
    """
    if node_to_idx is None:
        nodes = list(G.nodes())
        node_to_idx = {node: i for i, node in enumerate(nodes)}
        idx_to_node = {i: node for i, node in enumerate(nodes)}

    if u not in node_to_idx or v not in node_to_idx:
        return 0

    # BFS outwards from u, adding beta^(depth+1) whenever a frontier edge
    # lands on v.  `visited` stores the best depth seen per node.
    score = 0
    visited = {u: 0}
    queue = [(u, 0)]

    while queue:
        current, depth = queue.pop(0)
        if depth >= max_iter:  # bound the search depth
            continue

        for neighbor in G.neighbors(current):
            if neighbor not in visited or visited[neighbor] > depth + 1:
                visited[neighbor] = depth + 1

                if neighbor == v:
                    score += beta ** (depth + 1)
                else:
                    queue.append((neighbor, depth + 1))

    return score

局部路径索引(考虑更多跳)

def local_path_index(G, u, v, epsilon=0.01):
    """Local Path index: LP(u, v) = A^2[u, v] + epsilon * A^3[u, v].

    A^2[u, v] is the common-neighbour count; A^3[u, v] counts length-3 walks
    (backtracking walks included, exactly as the matrix power does).
    """
    nbrs_u = set(G.neighbors(u))
    nbrs_v = set(G.neighbors(v))

    # A^2 term: number of common neighbours
    paths_2 = len(nbrs_u & nbrs_v)

    # A^3 term: walks u -> w -> x -> v
    paths_3 = sum(len(set(G.neighbors(w)) & nbrs_v) for w in nbrs_u)

    return paths_2 + epsilon * paths_3


def compute_all_local_path_scores(G, candidates, epsilon=0.01):
    """Compute the Local Path score for every (u, v, label) candidate triple."""
    return [
        {'u': u, 'v': v, 'label': label,
         'LP': local_path_index(G, u, v, epsilon)}
        for u, v, label in candidates
    ]

完整实验流程

def run_link_prediction_experiment(network_path, test_ratio=0.2):
    """
    End-to-end link-prediction experiment.

    Loads the network, splits edges into train/test, scores all candidate
    pairs with every similarity metric, evaluates and plots the results,
    and persists scores/summary/report files to the working directory.

    network_path: path to an edge-list file ("u v" per line)
    test_ratio: fraction of edges held out for testing
    Returns (results, df): per-method evaluation dicts and the score DataFrame.
    """
    print("=" * 60)
    print("COMP5313 Assignment 2 - Task 1: Link Prediction")
    print("=" * 60)

    # 1. Load the network
    print("\n[1] Loading network...")
    G = load_network(network_path)

    # 2. Split edges into train/test sets
    print("\n[2] Splitting into train/test sets...")
    G_train, train_edges, test_edges = train_test_split(G, test_ratio=test_ratio)

    # 3. Build the labelled candidate-edge set
    print("\n[3] Generating candidate edges...")
    candidates = generate_candidates(G_train, test_edges, non_edge_sample_ratio=10)

    # 4. Score every candidate with all similarity metrics
    print("\n[4] Computing prediction scores...")
    start_time = time.time()
    df = compute_scores_for_candidates(G_train, candidates)
    print(f" Time: {time.time() - start_time:.2f}s")

    # 5. Evaluate every method
    print("\n[5] Evaluating methods...")
    methods = ['CN', 'JC', 'AA', 'PA', 'RA']
    results = evaluate_all(df, methods)

    # 6. Comparison plots
    print("\n[6] Generating visualizations...")
    plot_roc_curves(results, methods, save_path='roc_curves.png')
    plot_pr_curves(results, methods, save_path='pr_curves.png')
    plot_score_distribution(df, 'AA', save_path='aa_distribution.png')

    # 7. Summary table
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)

    summary = []
    for method in methods:
        r = results[method]
        summary.append({
            'Method': method,
            'AUC': round(r['AUC'], 4),
            'Precision@K': round(r['Precision@K'], 4),
            'PR-AUC': round(r['PR-AUC'], 4)
        })

    summary_df = pd.DataFrame(summary)
    print(summary_df.to_string(index=False))

    # 8. Persist scores and summary as CSV
    print("\n[7] Saving results...")
    df.to_csv('link_prediction_scores.csv', index=False)
    summary_df.to_csv('link_prediction_summary.csv', index=False)

    # Plain-text report with the experiment configuration and results
    with open('task1_report.txt', 'w') as f:
        f.write("COMP5313 Assignment 2 - Task 1 Report\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Network: {network_path}\n")
        f.write(f"Original edges: {G.number_of_edges()}\n")
        f.write(f"Train edges: {len(train_edges)}\n")
        f.write(f"Test edges: {len(test_edges)}\n")
        f.write(f"Test ratio: {test_ratio}\n\n")
        f.write("Results:\n")
        f.write(summary_df.to_string(index=False))

    print("\nFiles saved:")
    print(" - roc_curves.png")
    print(" - pr_curves.png")
    print(" - aa_distribution.png")
    print(" - link_prediction_scores.csv")
    print(" - link_prediction_summary.csv")
    print(" - task1_report.txt")

    return results, df


# Script entry point: uncomment the call and point it at an edge-list file.
if __name__ == '__main__':
    # Example usage
    # results, df = run_link_prediction_experiment('network.txt')
    pass

结果分析与讨论

典型结果示例

假设在 Zachary's Karate Club 网络上的典型结果:

Method AUC Precision@K PR-AUC Notes
AA 0.954 0.892 0.821 最佳综合表现
RA 0.951 0.885 0.815 与 AA 接近
JC 0.948 0.876 0.808 归一化指标,降低度差异的影响
CN 0.932 0.845 0.775 基础指标
PA 0.712 0.523 0.498 对无标度网络有效

分析讨论点

  1. 为什么 AA/RA 通常表现最好?
    • 对高度数节点给予较低权重
    • 考虑了节点的信息量差异
    • 在稀疏网络上也能给出有意义的分数
  2. PA 为什么表现较差?
    • 只考虑度的乘积,忽略拓扑结构
    • 对均匀度网络效果差
    • 但在 BA 模型生成的网络上可能表现较好
  3. 什么情况下 CN 足够?
    • 网络规模很大,计算资源有限
    • 网络度分布相对均匀
    • 需要快速基线时
  4. 如何进一步提升?
    • 使用全局特征(Katz, PageRank, SimRank)
    • 加入节点属性(如果有)
    • 集成学习方法

常见问题与优化

Q1: 网络太大,O(n²) 候选边生成太慢?

解决方案: 1. 只考虑共同邻居 > 0 的节点对(预过滤) 2. 使用稀疏矩阵操作 3. 采样而非枚举所有候选边

def generate_candidates_fast(G_train, test_edges, n_samples=10000):
    """Fast candidate generation: rejection-sample negatives instead of enumerating all non-edges."""
    test_edge_set = set(test_edges)

    # Positives: test edges whose endpoints survive in the training graph
    pos_samples = [(u, v, 1) for u, v in test_edges
                   if u in G_train and v in G_train]

    # Negatives: sample random node pairs until we have 10x the positives.
    # NOTE(review): `n_samples` is unused, and this loop will not terminate
    # if the graph is too small/dense to supply enough non-edges -- confirm
    # upstream before using on such inputs.
    nodes = list(G_train.nodes())
    neg_samples = []
    target = len(pos_samples) * 10
    while len(neg_samples) < target:
        u, v = random.sample(nodes, 2)
        if not G_train.has_edge(u, v) and (u, v) not in test_edge_set:
            neg_samples.append((u, v, 0))

    return pos_samples + neg_samples

Q2: 不同 test_ratio 对结果影响大吗?

实验设计: 固定网络,改变 test_ratio 观察结果稳定性。

def sensitivity_analysis(G, test_ratios=[0.1, 0.2, 0.3, 0.4]):
    """Measure how each method's AUC varies with the train/test ratio and plot it.

    NOTE: the mutable default argument is kept for interface compatibility;
    it is only read, never mutated.
    """
    records = []
    for ratio in test_ratios:
        G_train, train_edges, test_edges = train_test_split(G, test_ratio=ratio)
        candidates = generate_candidates(G_train, test_edges)
        scores_df = compute_scores_for_candidates(G_train, candidates)

        for method in ['AA', 'RA', 'JC', 'CN']:
            evaluation = evaluate_predictor(scores_df, method)
            records.append({
                'test_ratio': ratio,
                'method': method,
                'AUC': evaluation['AUC'],
            })

    # One line per method across the tested ratios
    df_results = pd.DataFrame(records)

    plt.figure(figsize=(10, 6))
    for method in ['AA', 'RA', 'JC', 'CN']:
        subset = df_results[df_results['method'] == method]
        plt.plot(subset['test_ratio'], subset['AUC'], 'o-', label=method)

    plt.xlabel('Test Ratio')
    plt.ylabel('AUC')
    plt.title('Sensitivity Analysis: Test Ratio vs AUC')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('sensitivity_analysis.png', dpi=300)
    plt.show()

Q3: 如何处理有向图?

转换方法: 1. 忽略方向,当无向图处理 2. 使用有向版本的指标(考虑入度/出度) 3. 分别预测入边和出边

def adamic_adar_directed(G, u, v):
    """Directed Adamic-Adar: weight nodes linking u's successors to v's predecessors.

    Uses 1/log(k_w + 1) with the total degree k_w; nodes with k_w <= 1 are skipped.
    Falls back to undirected neighbourhoods when G is undirected.
    """
    if G.is_directed():
        out_u = set(G.successors(u))   # nodes u points to
        in_v = set(G.predecessors(v))  # nodes pointing to v
    else:
        out_u = set(G.neighbors(u))
        in_v = set(G.neighbors(v))

    return sum(1 / np.log(G.degree(w) + 1)
               for w in (out_u & in_v) if G.degree(w) > 1)

评分要点

评分项 权重 检查点
算法实现 30% CN, JC, AA, RA, PA 正确实现
实验设计 25% 合理的 train/test 划分,足够样本量
结果分析 25% AUC/PR 曲线,统计显著性
报告质量 20% 图表清晰,解释充分,讨论深入

加分项: - 实现 Katz、PageRank 等全局指标 - 分析不同 test_ratio 的敏感性 - 处理有向图或加权图 - 使用集成学习提升预测效果


参考资源

  1. Liben-Nowell & Kleinberg (2007): "The Link Prediction Problem for Social Networks"
  2. Zhou et al. (2009): "Predicting missing links via local information"
  3. NetworkX Link Prediction: nx.link_prediction 模块

Last Updated: 2026-04-19
Status: ✅ Complete