Source code for PlanetAlign.datasets.arxiv

from typing import Union, Optional
from pathlib import Path
import os
import torch

from PlanetAlign.data import Dataset
from .utils import download_file_from_google_drive



[docs]
class ArXiv(Dataset):
    """
    A pair of networks synthesized from the `Arxiv ASTRO-PH (Astro Physics) collaboration network
    <https://snap.stanford.edu/data/ca-AstroPh.html>`_. Nodes represent authors and an edge exists between two authors
    if they have co-authored a paper. The two networks are noisy permutations of the original network generated by
    randomly inserting 10% edges (ArXiv1) and deleting 15% edges (ArXiv2) from the original network, respectively.
    Node and edge attributes are not available. There are in total 18,722 common nodes across two networks.

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - ArXiv1
          - 18,722
          - 217,921
          - 0
          - 0
        * - ArXiv2
          - 18,722
          - 168,394
          - 0
          - 0

    Parameters
    ----------
    root : str or Path
        Directory that holds (or will hold) the serialized dataset file ``arxiv.pt``.
    download : bool, optional
        If ``True``, fetch ``arxiv.pt`` from Google Drive into ``root`` before the integrity check.
        Defaults to ``False``.
    train_ratio : float, optional
        Forwarded unchanged to :class:`Dataset`. Defaults to ``0.2``.
    dtype : torch.dtype, optional
        Forwarded unchanged to :class:`Dataset`. Defaults to ``torch.float32``.
    seed : int, optional
        Forwarded unchanged to :class:`Dataset`. Defaults to ``0``.

    Raises
    ------
    RuntimeError
        If ``arxiv.pt`` is not present as a regular file under ``root`` after the optional download step.
    """
    def __init__(self,
                 root: Union[str, Path],
                 download: Optional[bool] = False,
                 train_ratio: Optional[float] = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: Optional[int] = 0):

        if download:
            download_file_from_google_drive(
                remote_file_id='18SDQF7z1rUTmkl2-tzvUzpUg7JL2SvvV',
                save_filename='arxiv.pt',
                root=root)

        # Validate before delegating to the base class so that a missing file is reported with a
        # dataset-specific message that tells the user how to obtain it.
        if not self._check_integrity(root):
            raise RuntimeError('ArXiv dataset not found or corrupted. You can use download=True to download it')

        super().__init__(root=root, name='arxiv', train_ratio=train_ratio, dtype=dtype, seed=seed)

    def _check_integrity(self, root):
        """Return ``True`` if ``arxiv.pt`` exists as a regular file directly under ``root``.

        Only presence is verified; the file contents are not inspected.
        """
        # isfile (not exists): a directory that happens to be named 'arxiv.pt' must not pass the check.
        return os.path.isfile(os.path.join(root, 'arxiv.pt'))