Source code for PlanetAlign.algorithms.regal

from typing import List, Tuple, Union
import torch
import numpy as np
import torch.nn.functional as F
from torch_geometric.utils import to_dense_adj, degree
import time
import psutil
import os

from PlanetAlign.data import Dataset
from PlanetAlign.metrics import hits_ks_scores, mrr_score
from PlanetAlign.utils import get_anchor_pairs

from .base_model import BaseModel



[docs]
class REGAL(BaseModel):
    r"""Embedding-based method REGAL for unsupervised pairwise attributed network alignment.
    REGAL is proposed by the paper: "`REGAL: Representation Learning-based Graph Alignment. <https://dl.acm.org/doi/10.1145/3269206.3271788>`_"
    in CIKM 2018.

    Parameters
    ----------
    k : int, optional
        Hyperparameter for tuning the number of landmarks. Default is 10.
    num_layers : int, optional
        Number of layers for the neighborhood when generating structural embeddings. Default is 2.
    alpha : float, optional
        Hyperparameter for the decay factor of the structural embedding. Default is 0.01.
    gammastruc : float, optional
        Weight for the structural similarity. Default is 1.
    gammaattr : float, optional
        Weight for the attribute similarity. Default is 1.
    buckets : int, optional
        Number of buckets for the structural embedding learning. Default is 2.
    dtype : torch.dtype, optional
        Data type of the tensors, choose from torch.float32 or torch.float64. Default is torch.float32.
    """

    def __init__(self,
                 k: int = 10,
                 num_layers: int = 2,
                 alpha: float = 0.01,
                 gammastruc: float = 1,
                 gammaattr: float = 1,
                 buckets: int = 2,
                 dtype: torch.dtype = torch.float32):
        super(REGAL, self).__init__(dtype=dtype)

        assert isinstance(k, int) and k > 0, 'k must be a positive integer'        
        assert isinstance(num_layers, int) and num_layers > 0, 'Number of layers must be a positive integer'
        assert 0 < alpha < 1, 'Alpha must be in the range (0, 1)'
        assert gammastruc >= 0, 'Gamma for structural similarity must be non-negative'
        assert gammaattr >= 0, 'Gamma for attribute similarity must be non-negative'
        assert isinstance(buckets, int) and buckets > 1, 'Buckets must be an integer greater than 1'

        self.k = k
        self.num_layers = num_layers
        self.alpha = alpha
        self.gammastruc = gammastruc
        self.gammaattr = gammaattr
        self.buckets = buckets
    

[docs]
    def train(self,
              dataset: Dataset,
              gids: Union[Tuple[int, int], List[int]],
              use_attr: bool = True,
              save_log: bool = True,
              verbose: bool = True):
        """
        Parameters
        ----------
        dataset : Dataset
            The dataset containing the graphs to be aligned and the training/test data.
        gids : tuple[int, int] or list[int]
            The indices of the graphs in the dataset to be aligned.
        use_attr : bool, optional
            Whether to use node and edge attributes for alignment. Default is True.
        save_log : bool, optional
            Whether to save the log of the training process. Default is True.
        verbose : bool, optional
            Whether to print the progress during training. Default is True.
        """
        self.check_inputs(dataset, gids, plain_method=False, use_attr=use_attr, pairwise=True, supervised=False)
        gid1, gid2 = gids

        logger = self.init_training_logger(dataset, use_attr, additional_headers=['memory', 'infer_time'], save_log=save_log)
        process = psutil.Process(os.getpid())

        graph1, graph2 = dataset.pyg_graphs[gid1], dataset.pyg_graphs[gid2]
        anchor_links = get_anchor_pairs(dataset.train_data, gid1, gid2)
        test_pairs = get_anchor_pairs(dataset.test_data, gid1, gid2)

        t0 = time.time()
        emb1, emb2, loss = self.get_xnetmf_embedding(graph1, graph2, anchor_links, use_attr, verbose)
        t1 = time.time()

        # Evaluation
        S = emb1 @ emb2.T
        hits = hits_ks_scores(S, test_pairs, mode='mean')
        mrr = mrr_score(S, test_pairs, mode='mean')
        mem_gb = process.memory_info().rss / 1024 ** 3
        logger.log(epoch=1,
                   loss=loss,
                   epoch_time=t1-t0,
                   hits=hits,
                   mrr=mrr,
                   memory=round(mem_gb, 4),
                   infer_time=round(t1-t0, 4),
                   verbose=verbose)

        return emb1, emb2, logger


    def get_xnetmf_embedding(self, graph1, graph2, anchor_links, use_attr, verbose=True):
        degree1 = degree(graph1.edge_index[0], num_nodes=graph1.num_nodes)
        degree2 = degree(graph2.edge_index[0], num_nodes=graph2.num_nodes)
        max_degree = max(degree1.max().item(), degree2.max().item())

        if verbose:
            print('Generating structural embeddings...', end=' ')
        struct_emb1 = self.get_structural_embedding(graph1, max_degree)
        struct_emb2 = self.get_structural_embedding(graph2, max_degree)
        if verbose:
            print('Done')

        if verbose:
            print('Generating xNetMF embeddings...', end=' ')
        n1, n2 = graph1.num_nodes, graph2.num_nodes
        num_landmarks = min(int(self.k * np.log(n1 + n2) / np.log(2)), n1 + n2)
        sampled_landmarks = torch.randperm(n1 + n2)[:num_landmarks]

        # Merge embeddings of the two graphs for effective similarity computation
        struct_emb = torch.vstack([struct_emb1, struct_emb2])
        landmarks_struct_emb = struct_emb[sampled_landmarks]
        struct_dist = torch.norm(struct_emb[:, None, :] - landmarks_struct_emb[None, :, :], dim=2)

        num_anchors = anchor_links.shape[0]
        anchor_emb1 = torch.empty((n1, 0), dtype=self.dtype)
        anchor_emb2 = torch.empty((n2, 0), dtype=self.dtype)
        if num_anchors > 0:
            anchor_emb1 = torch.zeros((n1, num_anchors), dtype=self.dtype)
            anchor_emb2 = torch.zeros((n2, num_anchors), dtype=self.dtype)
            anchor_emb1[anchor_links[:, 0], torch.arange(num_anchors)] = 1
            anchor_emb2[anchor_links[:, 1], torch.arange(num_anchors)] = 1

        attr_emb1 = torch.empty((n1, 0), dtype=self.dtype)
        attr_emb2 = torch.empty((n2, 0), dtype=self.dtype)
        if use_attr and graph1.x is not None and graph2.x is not None:
            attr_emb1 = graph1.x
            attr_emb2 = graph2.x

        input_emb1 = torch.hstack([anchor_emb1, attr_emb1])
        input_emb2 = torch.hstack([anchor_emb2, attr_emb2])

        if input_emb1.shape[1] == 0 or input_emb2.shape[1] == 0:
            attribute_dist = 0
        else:
            input_emb = torch.vstack([input_emb1, input_emb2])
            landmarks_attr_emb = input_emb[sampled_landmarks]
            attribute_dist = (input_emb[:, None, :] != landmarks_attr_emb[None, :, :]).to(self.dtype).sum(dim=2)

        C = torch.exp(-(self.gammastruc * struct_dist + self.gammaattr * attribute_dist))

        W_pinv = torch.pinverse(C[sampled_landmarks])
        U, X, V = torch.svd(W_pinv)
        Wfac = U @ torch.diag(torch.sqrt(X))
        xnetmf_emb = C @ Wfac
        xnetmf_emb = F.normalize(xnetmf_emb, p=2, dim=1)
        if verbose:
            print('Done')

        W_reconstructed = U @ torch.diag(X) @ V.T
        svd_recon_loss = F.mse_loss(W_reconstructed, W_pinv)

        return xnetmf_emb[:n1], xnetmf_emb[n1:], svd_recon_loss.item()

    def get_structural_embedding(self, graph, max_degree):
        degrees = degree(graph.edge_index[0], num_nodes=graph.num_nodes)
        assert degrees.max().item() <= max_degree, 'Max degree is less than the maximum degree in the graph'

        neighborhood = self.get_k_hop_neighbors(graph, self.num_layers)
        num_features = int(np.log(max_degree) / np.log(self.buckets)) + 1

        structural_embedding = torch.zeros(graph.num_nodes, num_features)
        for n in range(graph.num_nodes):
            for layer in neighborhood[n].keys():
                neighbors = neighborhood[n][layer]
                if len(neighbors) > 0:
                    # get degree sequence
                    neighbors_degrees = degrees[neighbors]
                    filtered_neighbors_degrees = neighbors_degrees[neighbors_degrees > 0]
                    index_vec = (torch.log(filtered_neighbors_degrees) / np.log(self.buckets)).to(torch.int)
                    structural_embedding[n, :] += torch.bincount(index_vec, minlength=num_features) * (self.alpha ** layer)

        return structural_embedding

    @staticmethod
    def get_k_hop_neighbors(graph, k):
        adj = to_dense_adj(graph.edge_index, max_num_nodes=graph.num_nodes).squeeze().to(torch.bool)

        neighborhoods = {node: {0: torch.tensor([node])} for node in range(graph.num_nodes)}
        for node in range(graph.num_nodes):
            last_neighbor_vec = torch.zeros(graph.num_nodes, dtype=torch.bool)
            last_neighbor_vec[node] = True
            for layer in range(1, k + 1):
                neighbor_vec = (adj[neighborhoods[node][layer - 1]].sum(dim=0) > 0) & ~last_neighbor_vec
                if neighbor_vec.sum() == 0:
                    break
                neighborhoods[node][layer] = torch.where(neighbor_vec)[0]
                last_neighbor_vec |= neighbor_vec

        return neighborhoods