# Source code for PlanetAlign.datasets.dbp15k

from typing import Union, Optional
from pathlib import Path
import os
import torch

from PlanetAlign.data import Dataset

from .utils import download_file_from_google_drive


class DBP15K_FR_EN(Dataset):
    """A pair of French to English version of multi-lingual DBpedia networks.

    The dataset is proposed by the `"Cross-lingual Entity Alignment via Joint
    Attribute-Preserving Embedding" <https://arxiv.org/abs/1708.05045>`_ paper,
    and the node attributes are given by pre-trained and aligned monolingual
    word embeddings from the `"Cross-lingual Knowledge Graph Alignment via
    Graph Matching Neural Network" <https://arxiv.org/abs/1905.11605>`_ paper.
    There are 15,000 pairs of aligned entities in DBP15K (French to English).

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - FR
          - 19,661
          - 105,997
          - 300
          - 0
        * - EN
          - 19,993
          - 115,722
          - 300
          - 0
    """

    def __init__(self,
                 root: Union[str, Path],
                 download: bool = False,
                 train_ratio: float = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: int = 0):
        """Load (and optionally fetch) the FR-EN pair of DBP15K.

        Args:
            root: Directory that contains (or will contain) ``dbp15k_fr-en.pt``.
            download: If True, download the serialized dataset from Google
                Drive into ``root`` before loading.
            train_ratio: Fraction of aligned anchor pairs used for training.
            dtype: Floating-point dtype for node attribute tensors.
            seed: Random seed for the train/test anchor split.

        Raises:
            RuntimeError: If the dataset file is absent or unreadable and
                ``download`` is False (or the download failed).
        """
        if download:
            download_file_from_google_drive(
                remote_file_id='1_uMPync1Y1yXsNPldtqGAG7VRLmtlt2b',
                save_filename='dbp15k_fr-en.pt',
                root=root)
        # Verify the file landed in `root` before handing off to the base
        # class, so a missing/failed download surfaces as a clear error here.
        if not self._check_integrity(root):
            raise RuntimeError('DBP15K_FR-EN dataset not found or corrupted. You can use download=True to download it')
        super().__init__(root=root,
                         name='dbp15k_fr-en',
                         train_ratio=train_ratio,
                         dtype=dtype,
                         seed=seed)

    def _check_integrity(self, root: Union[str, Path]) -> bool:
        # Presence check only: no checksum beyond file existence is verified.
        return os.path.exists(os.path.join(root, 'dbp15k_fr-en.pt'))
class DBP15K_JA_EN(Dataset):
    """A pair of Japanese to English version of multi-lingual DBpedia networks.

    The dataset is proposed by the `"Cross-lingual Entity Alignment via Joint
    Attribute-Preserving Embedding" <https://arxiv.org/abs/1708.05045>`_ paper,
    and the node attributes are given by pre-trained and aligned monolingual
    word embeddings from the `"Cross-lingual Knowledge Graph Alignment via
    Graph Matching Neural Network" <https://arxiv.org/abs/1905.11605>`_ paper.
    There are 15,000 pairs of aligned entities in DBP15K (Japanese to English).

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - JA
          - 19,814
          - 77,214
          - 300
          - 0
        * - EN
          - 19,780
          - 93,484
          - 300
          - 0
    """

    def __init__(self,
                 root: Union[str, Path],
                 download: bool = False,
                 train_ratio: float = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: int = 0):
        """Load (and optionally fetch) the JA-EN pair of DBP15K.

        Args:
            root: Directory that contains (or will contain) ``dbp15k_ja-en.pt``.
            download: If True, download the serialized dataset from Google
                Drive into ``root`` before loading.
            train_ratio: Fraction of aligned anchor pairs used for training.
            dtype: Floating-point dtype for node attribute tensors.
            seed: Random seed for the train/test anchor split.

        Raises:
            RuntimeError: If the dataset file is absent or unreadable and
                ``download`` is False (or the download failed).
        """
        if download:
            download_file_from_google_drive(
                remote_file_id='1Xc4sWilV7v804JZHRQqo2LzAM8O2_lib',
                save_filename='dbp15k_ja-en.pt',
                root=root)
        # Verify the file landed in `root` before handing off to the base
        # class, so a missing/failed download surfaces as a clear error here.
        if not self._check_integrity(root):
            raise RuntimeError('DBP15K_JA-EN dataset not found or corrupted. You can use download=True to download it')
        super().__init__(root=root,
                         name='dbp15k_ja-en',
                         train_ratio=train_ratio,
                         dtype=dtype,
                         seed=seed)

    def _check_integrity(self, root: Union[str, Path]) -> bool:
        # Presence check only: no checksum beyond file existence is verified.
        return os.path.exists(os.path.join(root, 'dbp15k_ja-en.pt'))
class DBP15K_ZH_EN(Dataset):
    """A pair of Chinese to English version of multi-lingual DBpedia networks.

    The dataset is proposed by the `"Cross-lingual Entity Alignment via Joint
    Attribute-Preserving Embedding" <https://arxiv.org/abs/1708.05045>`_ paper,
    and the node attributes are given by pre-trained and aligned monolingual
    word embeddings from the `"Cross-lingual Knowledge Graph Alignment via
    Graph Matching Neural Network" <https://arxiv.org/abs/1905.11605>`_ paper.
    There are 15,000 pairs of aligned entities in DBP15K (Chinese to English).

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - ZH
          - 19,388
          - 70,414
          - 300
          - 0
        * - EN
          - 19,572
          - 95,142
          - 300
          - 0
    """

    def __init__(self,
                 root: Union[str, Path],
                 download: bool = False,
                 train_ratio: float = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: int = 0):
        """Load (and optionally fetch) the ZH-EN pair of DBP15K.

        Args:
            root: Directory that contains (or will contain) ``dbp15k_zh-en.pt``.
            download: If True, download the serialized dataset from Google
                Drive into ``root`` before loading.
            train_ratio: Fraction of aligned anchor pairs used for training.
            dtype: Floating-point dtype for node attribute tensors.
            seed: Random seed for the train/test anchor split.

        Raises:
            RuntimeError: If the dataset file is absent or unreadable and
                ``download`` is False (or the download failed).
        """
        if download:
            download_file_from_google_drive(
                remote_file_id='1ccub0If-q6_6MotYVOyFZzSc5SDJ2Izr',
                save_filename='dbp15k_zh-en.pt',
                root=root)
        # Verify the file landed in `root` before handing off to the base
        # class, so a missing/failed download surfaces as a clear error here.
        if not self._check_integrity(root):
            raise RuntimeError('DBP15K_ZH-EN dataset not found or corrupted. You can use download=True to download it')
        super().__init__(root=root,
                         name='dbp15k_zh-en',
                         train_ratio=train_ratio,
                         dtype=dtype,
                         seed=seed)

    def _check_integrity(self, root: Union[str, Path]) -> bool:
        # Presence check only: no checksum beyond file existence is verified.
        return os.path.exists(os.path.join(root, 'dbp15k_zh-en.pt'))