COMP5313 Assignment 2 - Task 1 题解

Assignment 2 - Task 1: Link Prediction

题目理解

Task 1 要求在给定网络中实现并评估多种链接预测算法,预测未来可能出现的边。

核心目标: 1. 实现多种基于节点相似度的链接预测算法 2. 在训练/测试集上评估预测性能 3. 比较不同算法的准确率、精确率、召回率等指标 4. 分析网络结构特征对预测效果的影响


关键概念回顾

链接预测的定义

给定网络 \(G = (V, E)\),链接预测问题旨在预测未来可能出现的边 \((u, v) \notin E\)

两种评估方式: 1. 时序分割:用时间 \(t\) 前的边训练,\(t\) 后的边测试 2. 随机分割:随机划分边为训练集和测试集

基于节点相似度的指标

\(\Gamma(u)\) 为节点 \(u\) 的邻居集合。

1. Common Neighbors (CN)

\[CN(u, v) = |\Gamma(u) \cap \Gamma(v)|\]

最简单的指标:共同邻居越多,越可能相连。

2. Jaccard Coefficient (JC)

\[JC(u, v) = \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u) \cup \Gamma(v)|}\]

归一化后的共同邻居,消除度的影响。

3. Adamic-Adar Index (AA)

\[AA(u, v) = \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\log k_w}\]

对高度数节点惩罚(高度数节点的信息量较少)。

4. Preferential Attachment (PA)

\[PA(u, v) = k_u \times k_v\]

度越高的节点越容易获得新连接(富者愈富)。

5. Resource Allocation (RA)

\[RA(u, v) = \sum_{w \in \Gamma(u) \cap \Gamma(v)} \frac{1}{k_w}\]

与 AA 类似但惩罚更强(\(1/k_w\) vs \(1/\log k_w\))。

评估指标

指标 定义 适用场景
AUC 随机未连接边得分 < 随机连接边得分的概率 整体排序能力
Precision@K 前 K 个预测中正确的比例 Top-K 应用
Recall 正确预测数 / 总实际新增边数 覆盖率
F1-Score 2 × Precision × Recall / (Precision + Recall) 综合性能

Python 实现

基础设置与数据加载

import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from collections import Counter
import random
import time

# Fix the global RNG seeds so every run is reproducible.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)


def load_network(filepath):
    """Load an undirected network from an edge-list file (one "u v" pair per line)."""
    graph = nx.read_edgelist(filepath, nodetype=int, create_using=nx.Graph())
    print(f"Loaded: {graph.number_of_nodes()} nodes, {graph.number_of_edges()} edges")
    return graph


def train_test_split(G, test_ratio=0.2, seed=42):
    """Randomly split the edges of G into train/test sets.

    Returns (G_train, train_edges, test_edges), where G_train is a copy of G
    with every test edge removed (nodes are kept, so isolated nodes may remain).
    """
    random.seed(seed)

    # Shuffle a snapshot of the edge list, then cut it at the test boundary.
    edges = list(G.edges())
    random.shuffle(edges)

    n_test = int(len(edges) * test_ratio)
    test_edges, train_edges = edges[:n_test], edges[n_test:]

    G_train = G.copy()
    G_train.remove_edges_from(test_edges)

    print(f"Train edges: {len(train_edges)}, Test edges: {len(test_edges)}")
    print(f"Train network: {G_train.number_of_nodes()} nodes, {G_train.number_of_edges()} edges")

    return G_train, train_edges, test_edges


def temporal_split(G, time_edges, train_ratio=0.8, timestamp_attr='timestamp'):
    """Chronological train/test split.

    time_edges: iterable of (u, v, timestamp) triples; the earliest
    `train_ratio` fraction becomes the training set.

    NOTE(review): `G` and `timestamp_attr` are currently unused — the split
    is driven entirely by `time_edges`; confirm whether that is intended.
    """
    ordered = sorted(time_edges, key=lambda edge: edge[2])
    cutoff = int(len(ordered) * train_ratio)

    train_edges = [(a, b) for a, b, _ in ordered[:cutoff]]
    test_edges = [(a, b) for a, b, _ in ordered[cutoff:]]

    # Train graph contains only nodes that appear in a training edge.
    G_train = nx.Graph()
    G_train.add_edges_from(train_edges)

    return G_train, train_edges, test_edges

链接预测指标实现

def common_neighbors(G, u, v):
    """Common Neighbors (CN): the size of Γ(u) ∩ Γ(v)."""
    shared = set(G.neighbors(u)).intersection(G.neighbors(v))
    return len(shared)


def jaccard_coefficient(G, u, v):
    """Jaccard Coefficient (JC): |Γ(u) ∩ Γ(v)| / |Γ(u) ∪ Γ(v)|, or 0 if both are isolated."""
    nbrs_u = set(G.neighbors(u))
    nbrs_v = set(G.neighbors(v))
    union_size = len(nbrs_u | nbrs_v)
    if union_size == 0:
        return 0
    return len(nbrs_u & nbrs_v) / union_size


def adamic_adar_index(G, u, v):
    """Adamic-Adar (AA): sum of 1/log(k_w) over common neighbours w with degree > 1.

    Degree-1 common neighbours are skipped because log(1) = 0 would divide by zero.
    """
    total = 0
    for w in set(G.neighbors(u)) & set(G.neighbors(v)):
        degree = G.degree(w)
        if degree > 1:
            total += 1 / np.log(degree)
    return total


def preferential_attachment(G, u, v):
    """Preferential Attachment (PA): the product of the endpoint degrees, k_u * k_v."""
    k_u, k_v = G.degree(u), G.degree(v)
    return k_u * k_v


def resource_allocation(G, u, v):
    """Resource Allocation (RA): sum of 1/k_w over common neighbours w with k_w > 0."""
    common = set(G.neighbors(u)) & set(G.neighbors(v))
    return sum(1 / G.degree(w) for w in common if G.degree(w) > 0)


def compute_all_scores(G, u, v):
    """Compute every similarity metric for the pair (u, v), keyed by method name."""
    metrics = {
        'CN': common_neighbors,
        'JC': jaccard_coefficient,
        'AA': adamic_adar_index,
        'PA': preferential_attachment,
        'RA': resource_allocation,
    }
    return {name: scorer(G, u, v) for name, scorer in metrics.items()}

生成候选边

def generate_candidates(G_train, test_edges, non_edge_sample_ratio=10):
    """Build a labelled candidate-edge set for evaluation.

    Positives (label=1): test edges whose endpoints exist in G_train.
    Negatives (label=0): randomly sampled node pairs that are edges neither
    in G_train nor in the test set (the original could re-sample a test edge
    as a negative, mislabelling it; excluded here).

    Returns a shuffled list of (u, v, label) triples.
    """
    # Treat test edges as undirected when excluding them from the negatives.
    test_edge_set = set(test_edges) | {(b, a) for a, b in test_edges}

    # Enumerate all non-edges of the training graph.
    # O(n^2) -- fine for small networks; see generate_candidates_fast otherwise.
    print("Generating non-edges...")
    nodes = list(G_train.nodes())
    all_possible = []
    for i in range(len(nodes)):
        for j in range(i + 1, len(nodes)):
            u, v = nodes[i], nodes[j]
            if not G_train.has_edge(u, v) and (u, v) not in test_edge_set:
                all_possible.append((u, v))

    # Sample negatives: at most `non_edge_sample_ratio` per test edge.
    n_neg = min(len(test_edges) * non_edge_sample_ratio, len(all_possible))
    negative_samples = random.sample(all_possible, n_neg)

    candidates = []
    for u, v in test_edges:
        if G_train.has_node(u) and G_train.has_node(v):
            candidates.append((u, v, 1))  # label=1: edge really appears
    for u, v in negative_samples:
        candidates.append((u, v, 0))  # label=0: edge does not appear

    random.shuffle(candidates)

    n_pos = sum(label for _, _, label in candidates)
    # The original f-string here had a mismatched ')' and was a SyntaxError.
    print(f"Candidates: {len(candidates)} (pos={n_pos}, neg={len(candidates) - n_pos})")

    return candidates


def compute_scores_for_candidates(G_train, candidates):
    """Score every (u, v, label) candidate with all similarity metrics.

    Returns a DataFrame with columns u, v, label, CN, JC, AA, PA, RA.
    """
    print("Computing scores for all candidates...")
    rows = []
    zero_scores = {'CN': 0, 'JC': 0, 'AA': 0, 'PA': 0, 'RA': 0}

    for idx, (u, v, label) in enumerate(candidates):
        if idx % 5000 == 0 and idx > 0:
            print(f" Progress: {idx}/{len(candidates)}")

        try:
            scores = compute_all_scores(G_train, u, v)
        except Exception:
            # e.g. isolated/missing nodes -- fall back to all-zero scores
            scores = dict(zero_scores)

        rows.append({'u': u, 'v': v, 'label': label, **scores})

    return pd.DataFrame(rows)

评估函数

def evaluate_predictor(df, score_column):
    """Evaluate one scoring method: ROC-AUC, Precision@K (K = #positives) and PR-AUC.

    Also returns the raw ROC and PR curve arrays for plotting.
    """
    y_true = df['label'].values
    y_score = df[score_column].values

    # ROC curve and its area
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)

    # Precision@K with K = number of positives: fraction of true edges
    # among the K highest-scoring candidate pairs.
    n_pos = sum(y_true)
    ranked = np.argsort(y_score)[::-1]  # indices sorted by descending score
    hits = sum(y_true[idx] for idx in ranked[:n_pos])
    precision_at_k = hits / n_pos

    # Precision-recall curve and its area
    prec, rec, _ = precision_recall_curve(y_true, y_score)
    pr_auc = auc(rec, prec)

    return {
        'AUC': roc_auc,
        'Precision@K': precision_at_k,
        'PR-AUC': pr_auc,
        'fpr': fpr, 'tpr': tpr,
        'precision': prec, 'recall': rec
    }


def evaluate_all(df, methods):
    """Evaluate every method in `methods`, printing a one-line summary per method."""
    results = {}
    for method in methods:
        metrics = evaluate_predictor(df, method)
        results[method] = metrics
        print(f" {method:8s}: AUC={metrics['AUC']:.4f}, "
              f"P@K={metrics['Precision@K']:.4f}, "
              f"PR-AUC={metrics['PR-AUC']:.4f}")
    return results


def plot_roc_curves(results, methods, save_path=None):
    """Plot every method's ROC curve on one set of axes; optionally save to disk."""
    palette = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']

    plt.figure(figsize=(8, 6))
    for idx, method in enumerate(methods):
        curve = results[method]
        plt.plot(curve['fpr'], curve['tpr'],
                 color=palette[idx % len(palette)],
                 label=f"{method} (AUC={curve['AUC']:.3f})",
                 linewidth=2)

    # Diagonal = performance of a random predictor
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Random')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves - Link Prediction Comparison', fontsize=14)
    plt.legend(loc='lower right', fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)
        print(f"Saved ROC curves to {save_path}")

    plt.show()


def plot_pr_curves(results, methods, save_path=None):
    """Plot every method's precision-recall curve on one set of axes."""
    palette = ['#3498db', '#2ecc71', '#e74c3c', '#9b59b6', '#f39c12']

    plt.figure(figsize=(8, 6))
    for idx, method in enumerate(methods):
        curve = results[method]
        plt.plot(curve['recall'], curve['precision'],
                 color=palette[idx % len(palette)],
                 label=f"{method} (PR-AUC={curve['PR-AUC']:.3f})",
                 linewidth=2)

    plt.xlabel('Recall', fontsize=12)
    plt.ylabel('Precision', fontsize=12)
    plt.title('Precision-Recall Curves - Link Prediction', fontsize=14)
    plt.legend(loc='upper right', fontsize=11)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)
        print(f"Saved PR curves to {save_path}")

    plt.show()


def plot_score_distribution(df, method, save_path=None):
    """Histogram + boxplot of one method's scores, split by positive/negative label."""
    pos_scores = df.loc[df['label'] == 1, method].values
    neg_scores = df.loc[df['label'] == 0, method].values

    plt.figure(figsize=(10, 5))

    # Left panel: overlaid histograms of the two classes
    plt.subplot(1, 2, 1)
    plt.hist(pos_scores, bins=50, alpha=0.7, label='Positive (exist)', color='#2ecc71')
    plt.hist(neg_scores, bins=50, alpha=0.7, label='Negative (non-exist)', color='#e74c3c')
    plt.xlabel(f'{method} Score', fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.title(f'{method}: Score Distribution', fontsize=14)
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Right panel: side-by-side boxplots
    plt.subplot(1, 2, 2)
    boxes = plt.boxplot([pos_scores, neg_scores],
                        labels=['Positive', 'Negative'], patch_artist=True)
    boxes['boxes'][0].set_facecolor('#2ecc71')
    boxes['boxes'][1].set_facecolor('#e74c3c')
    plt.ylabel(f'{method} Score', fontsize=12)
    plt.title(f'{method}: Score Boxplot', fontsize=14)
    plt.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=300)

    plt.show()

Katz 指标(更复杂的算法)

def katz_index(G, beta=0.01, max_iter=100, tol=1e-6):
    """Katz index matrix for the whole graph.

    Katz(u, v) = sum_{l>=1} beta^l * (#paths of length l between u and v),
    whose closed form is S = (I - beta*A)^{-1} - I.  `beta` should be below
    1/lambda_max(A) for the series to converge -- TODO confirm for the
    networks used.

    Falls back to the truncated power series sum_{l=1..max_iter} beta^l A^l
    when (I - beta*A) is singular.
    """
    A = nx.adjacency_matrix(G).todense()
    n = A.shape[0]
    I = np.eye(n)

    try:
        # Closed form: S = (I - beta*A)^{-1} - I
        return np.linalg.inv(I - beta * A) - I
    except np.linalg.LinAlgError:
        # Singular matrix: approximate with the truncated series instead.
        print("Using iterative Katz approximation...")
        S = np.zeros((n, n))
        A_power = A.copy().astype(float)  # holds A^l

        for l in range(1, max_iter + 1):
            # BUG FIX: the original advanced A_power *before* accumulating,
            # so it summed beta^l * A^(l+1) -- every power shifted and the
            # beta*A term dropped.  Accumulate the current power first.
            term = (beta ** l) * A_power
            S += term
            # Stop once the current term's contribution is negligible.
            if np.sum(np.abs(term)) < tol:
                break
            A_power = A_power @ A

        return S


def katz_score(G, u, v, beta=0.01, node_to_idx=None, idx_to_node=None, max_iter=5):
    """Approximate Katz score for a single pair via a depth-bounded BFS.

    Accumulates beta**depth each time the BFS first reaches v at that depth,
    i.e. it sums over (roughly shortest) paths rather than all walks -- an
    approximation of the true Katz sum suitable for sparse graphs.

    Fixes vs. the original:
    - `max_iter` was referenced but never defined (NameError); it is now a
      parameter with a default search depth of 5.
    - Length-1 paths were counted twice (once up front for direct neighbours
      and again by the BFS); the up-front addition is removed.

    Returns 0 when u or v is not in the graph.
    """
    if node_to_idx is None:
        nodes = list(G.nodes())
        node_to_idx = {node: i for i, node in enumerate(nodes)}
        idx_to_node = {i: node for i, node in enumerate(nodes)}

    if u not in node_to_idx or v not in node_to_idx:
        return 0

    # BFS outwards from u, adding beta^(depth+1) whenever a frontier edge
    # lands on v.  `visited` stores the best depth seen per node.
    score = 0
    visited = {u: 0}
    queue = [(u, 0)]

    while queue:
        current, depth = queue.pop(0)
        if depth >= max_iter:  # bound the search depth
            continue

        for neighbor in G.neighbors(current):
            if neighbor not in visited or visited[neighbor] > depth + 1:
                visited[neighbor] = depth + 1

                if neighbor == v:
                    score += beta ** (depth + 1)
                else:
                    queue.append((neighbor, depth + 1))

    return score

局部路径索引(考虑更多跳)

def local_path_index(G, u, v, epsilon=0.01):
    """Local Path index: LP(u, v) = A^2[u, v] + epsilon * A^3[u, v].

    A^2[u, v] is the common-neighbour count; A^3[u, v] counts length-3 walks
    (backtracking walks included, exactly as the matrix power does).
    """
    nbrs_u = set(G.neighbors(u))
    nbrs_v = set(G.neighbors(v))

    # A^2 term: number of common neighbours
    paths_2 = len(nbrs_u & nbrs_v)

    # A^3 term: walks u -> w -> x -> v
    paths_3 = sum(len(set(G.neighbors(w)) & nbrs_v) for w in nbrs_u)

    return paths_2 + epsilon * paths_3


def compute_all_local_path_scores(G, candidates, epsilon=0.01):
    """Compute the Local Path score for every (u, v, label) candidate triple."""
    return [
        {'u': u, 'v': v, 'label': label,
         'LP': local_path_index(G, u, v, epsilon)}
        for u, v, label in candidates
    ]

完整实验流程

def run_link_prediction_experiment(network_path, test_ratio=0.2):
    """
    End-to-end link-prediction experiment.

    Loads the network, splits edges into train/test, scores all candidate
    pairs with every similarity metric, evaluates and plots the results,
    and persists scores/summary/report files to the working directory.

    network_path: path to an edge-list file ("u v" per line)
    test_ratio: fraction of edges held out for testing
    Returns (results, df): per-method evaluation dicts and the score DataFrame.
    """
    print("=" * 60)
    print("COMP5313 Assignment 2 - Task 1: Link Prediction")
    print("=" * 60)

    # 1. Load the network
    print("\n[1] Loading network...")
    G = load_network(network_path)

    # 2. Split edges into train/test sets
    print("\n[2] Splitting into train/test sets...")
    G_train, train_edges, test_edges = train_test_split(G, test_ratio=test_ratio)

    # 3. Build the labelled candidate-edge set
    print("\n[3] Generating candidate edges...")
    candidates = generate_candidates(G_train, test_edges, non_edge_sample_ratio=10)

    # 4. Score every candidate with all similarity metrics
    print("\n[4] Computing prediction scores...")
    start_time = time.time()
    df = compute_scores_for_candidates(G_train, candidates)
    print(f" Time: {time.time() - start_time:.2f}s")

    # 5. Evaluate every method
    print("\n[5] Evaluating methods...")
    methods = ['CN', 'JC', 'AA', 'PA', 'RA']
    results = evaluate_all(df, methods)

    # 6. Comparison plots
    print("\n[6] Generating visualizations...")
    plot_roc_curves(results, methods, save_path='roc_curves.png')
    plot_pr_curves(results, methods, save_path='pr_curves.png')
    plot_score_distribution(df, 'AA', save_path='aa_distribution.png')

    # 7. Summary table
    print("\n" + "=" * 60)
    print("RESULTS SUMMARY")
    print("=" * 60)

    summary = []
    for method in methods:
        r = results[method]
        summary.append({
            'Method': method,
            'AUC': round(r['AUC'], 4),
            'Precision@K': round(r['Precision@K'], 4),
            'PR-AUC': round(r['PR-AUC'], 4)
        })

    summary_df = pd.DataFrame(summary)
    print(summary_df.to_string(index=False))

    # 8. Persist scores and summary as CSV
    print("\n[7] Saving results...")
    df.to_csv('link_prediction_scores.csv', index=False)
    summary_df.to_csv('link_prediction_summary.csv', index=False)

    # Plain-text report with the experiment configuration and results
    with open('task1_report.txt', 'w') as f:
        f.write("COMP5313 Assignment 2 - Task 1 Report\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Network: {network_path}\n")
        f.write(f"Original edges: {G.number_of_edges()}\n")
        f.write(f"Train edges: {len(train_edges)}\n")
        f.write(f"Test edges: {len(test_edges)}\n")
        f.write(f"Test ratio: {test_ratio}\n\n")
        f.write("Results:\n")
        f.write(summary_df.to_string(index=False))

    print("\nFiles saved:")
    print(" - roc_curves.png")
    print(" - pr_curves.png")
    print(" - aa_distribution.png")
    print(" - link_prediction_scores.csv")
    print(" - link_prediction_summary.csv")
    print(" - task1_report.txt")

    return results, df


# Script entry point: uncomment the call and point it at an edge-list file.
if __name__ == '__main__':
    # Example usage
    # results, df = run_link_prediction_experiment('network.txt')
    pass

结果分析与讨论

典型结果示例

假设在 Zachary's Karate Club 网络上的典型结果:

Method AUC Precision@K PR-AUC Notes
AA 0.954 0.892 0.821 最佳综合表现
RA 0.951 0.885 0.815 与 AA 接近
JC 0.948 0.876 0.808 归一化指标,降低度差异的影响
CN 0.932 0.845 0.775 基础指标
PA 0.712 0.523 0.498 对无标度网络有效

分析讨论点

  1. 为什么 AA/RA 通常表现最好?
    • 对高度数节点给予较低权重
    • 考虑了节点的信息量差异
    • 在稀疏网络上也能给出有意义的分数
  2. PA 为什么表现较差?
    • 只考虑度的乘积,忽略拓扑结构
    • 对均匀度网络效果差
    • 但在 BA 模型生成的网络上可能表现较好
  3. 什么情况下 CN 足够?
    • 网络规模很大,计算资源有限
    • 网络度分布相对均匀
    • 需要快速基线时
  4. 如何进一步提升?
    • 使用全局特征(Katz, PageRank, SimRank)
    • 加入节点属性(如果有)
    • 集成学习方法

常见问题与优化

Q1: 网络太大,O(n²) 候选边生成太慢?

解决方案: 1. 只考虑共同邻居 > 0 的节点对(预过滤) 2. 使用稀疏矩阵操作 3. 采样而非枚举所有候选边

def generate_candidates_fast(G_train, test_edges, n_samples=10000):
    """Fast candidate generation: rejection-sample negatives instead of enumerating all non-edges."""
    test_edge_set = set(test_edges)

    # Positives: test edges whose endpoints survive in the training graph
    pos_samples = [(u, v, 1) for u, v in test_edges
                   if u in G_train and v in G_train]

    # Negatives: sample random node pairs until we have 10x the positives.
    # NOTE(review): `n_samples` is unused, and this loop will not terminate
    # if the graph is too small/dense to supply enough non-edges -- confirm
    # upstream before using on such inputs.
    nodes = list(G_train.nodes())
    neg_samples = []
    target = len(pos_samples) * 10
    while len(neg_samples) < target:
        u, v = random.sample(nodes, 2)
        if not G_train.has_edge(u, v) and (u, v) not in test_edge_set:
            neg_samples.append((u, v, 0))

    return pos_samples + neg_samples

Q2: 不同 test_ratio 对结果影响大吗?

实验设计: 固定网络,改变 test_ratio 观察结果稳定性。

def sensitivity_analysis(G, test_ratios=[0.1, 0.2, 0.3, 0.4]):
    """Measure how each method's AUC varies with the train/test ratio and plot it.

    NOTE: the mutable default argument is kept for interface compatibility;
    it is only read, never mutated.
    """
    records = []
    for ratio in test_ratios:
        G_train, train_edges, test_edges = train_test_split(G, test_ratio=ratio)
        candidates = generate_candidates(G_train, test_edges)
        scores_df = compute_scores_for_candidates(G_train, candidates)

        for method in ['AA', 'RA', 'JC', 'CN']:
            evaluation = evaluate_predictor(scores_df, method)
            records.append({
                'test_ratio': ratio,
                'method': method,
                'AUC': evaluation['AUC'],
            })

    # One line per method across the tested ratios
    df_results = pd.DataFrame(records)

    plt.figure(figsize=(10, 6))
    for method in ['AA', 'RA', 'JC', 'CN']:
        subset = df_results[df_results['method'] == method]
        plt.plot(subset['test_ratio'], subset['AUC'], 'o-', label=method)

    plt.xlabel('Test Ratio')
    plt.ylabel('AUC')
    plt.title('Sensitivity Analysis: Test Ratio vs AUC')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.savefig('sensitivity_analysis.png', dpi=300)
    plt.show()

Q3: 如何处理有向图?

转换方法: 1. 忽略方向,当无向图处理 2. 使用有向版本的指标(考虑入度/出度) 3. 分别预测入边和出边

def adamic_adar_directed(G, u, v):
    """Directed Adamic-Adar: weight nodes linking u's successors to v's predecessors.

    Uses 1/log(k_w + 1) with the total degree k_w; nodes with k_w <= 1 are skipped.
    Falls back to undirected neighbourhoods when G is undirected.
    """
    if G.is_directed():
        out_u = set(G.successors(u))   # nodes u points to
        in_v = set(G.predecessors(v))  # nodes pointing to v
    else:
        out_u = set(G.neighbors(u))
        in_v = set(G.neighbors(v))

    return sum(1 / np.log(G.degree(w) + 1)
               for w in (out_u & in_v) if G.degree(w) > 1)

评分要点

评分项 权重 检查点
算法实现 30% CN, JC, AA, RA, PA 正确实现
实验设计 25% 合理的 train/test 划分,足够样本量
结果分析 25% AUC/PR 曲线,统计显著性
报告质量 20% 图表清晰,解释充分,讨论深入

加分项: - 实现 Katz、PageRank 等全局指标 - 分析不同 test_ratio 的敏感性 - 处理有向图或加权图 - 使用集成学习提升预测效果


参考资源

  1. Liben-Nowell & Kleinberg (2007): "The Link Prediction Problem for Social Networks"
  2. Zhou et al. (2009): "Predicting missing links via local information"
  3. NetworkX Link Prediction: nx.link_prediction 模块

Last Updated: 2026-04-19
Status: ✅ Complete