# Source code for PlanetAlign.datasets.dbp15k

from typing import Union, Optional
from pathlib import Path
import os
import torch

from PlanetAlign.data import Dataset

from .utils import download_file_from_google_drive


class DBP15K_FR_EN(Dataset):
    """A pair of French to English version of multi-lingual DBpedia networks.

    The dataset is proposed by the `"Cross-lingual Entity Alignment via Joint
    Attribute-Preserving Embedding" <https://arxiv.org/abs/1708.05045>`_ paper,
    and the node attributes are given by pre-trained and aligned monolingual
    word embeddings from the `"Cross-lingual Knowledge Graph Alignment via
    Graph Matching Neural Network" <https://arxiv.org/abs/1905.11605>`_ paper.
    There are 15,000 pairs of aligned entities in DBP15K (French to English).

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - FR
          - 19,661
          - 105,997
          - 300
          - 0
        * - EN
          - 19,993
          - 115,722
          - 300
          - 0
    """

    def __init__(self,
                 root: Union[str, Path],
                 download: bool = False,
                 train_ratio: float = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: int = 0):
        """Load (and optionally fetch) the FR-EN pair of DBP15K.

        Args:
            root: Directory that contains (or will contain) ``dbp15k_fr-en.pt``.
            download: If True, download the serialized dataset from Google
                Drive into ``root`` before loading.
            train_ratio: Fraction of aligned anchor pairs used for training.
            dtype: Floating-point dtype for node attribute tensors.
            seed: Random seed for the train/test anchor split.

        Raises:
            RuntimeError: If the dataset file is absent or unreadable and
                ``download`` is False (or the download failed).
        """
        if download:
            download_file_from_google_drive(
                remote_file_id='1_uMPync1Y1yXsNPldtqGAG7VRLmtlt2b',
                save_filename='dbp15k_fr-en.pt',
                root=root)
        # Verify the file landed in `root` before handing off to the base
        # class, so a missing/failed download surfaces as a clear error here.
        if not self._check_integrity(root):
            raise RuntimeError('DBP15K_FR-EN dataset not found or corrupted. You can use download=True to download it')
        super().__init__(root=root,
                         name='dbp15k_fr-en',
                         train_ratio=train_ratio,
                         dtype=dtype,
                         seed=seed)

    def _check_integrity(self, root: Union[str, Path]) -> bool:
        # Presence check only: no checksum beyond file existence is verified.
        return os.path.exists(os.path.join(root, 'dbp15k_fr-en.pt'))
class DBP15K_JA_EN(Dataset):
    """A pair of Japanese to English version of multi-lingual DBpedia networks.

    The dataset is proposed by the `"Cross-lingual Entity Alignment via Joint
    Attribute-Preserving Embedding" <https://arxiv.org/abs/1708.05045>`_ paper,
    and the node attributes are given by pre-trained and aligned monolingual
    word embeddings from the `"Cross-lingual Knowledge Graph Alignment via
    Graph Matching Neural Network" <https://arxiv.org/abs/1905.11605>`_ paper.
    There are 15,000 pairs of aligned entities in DBP15K (Japanese to English).

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - JA
          - 19,814
          - 77,214
          - 300
          - 0
        * - EN
          - 19,780
          - 93,484
          - 300
          - 0
    """

    def __init__(self,
                 root: Union[str, Path],
                 download: bool = False,
                 train_ratio: float = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: int = 0):
        """Load (and optionally fetch) the JA-EN pair of DBP15K.

        Args:
            root: Directory that contains (or will contain) ``dbp15k_ja-en.pt``.
            download: If True, download the serialized dataset from Google
                Drive into ``root`` before loading.
            train_ratio: Fraction of aligned anchor pairs used for training.
            dtype: Floating-point dtype for node attribute tensors.
            seed: Random seed for the train/test anchor split.

        Raises:
            RuntimeError: If the dataset file is absent or unreadable and
                ``download`` is False (or the download failed).
        """
        if download:
            download_file_from_google_drive(
                remote_file_id='1Xc4sWilV7v804JZHRQqo2LzAM8O2_lib',
                save_filename='dbp15k_ja-en.pt',
                root=root)
        # Verify the file landed in `root` before handing off to the base
        # class, so a missing/failed download surfaces as a clear error here.
        if not self._check_integrity(root):
            raise RuntimeError('DBP15K_JA-EN dataset not found or corrupted. You can use download=True to download it')
        super().__init__(root=root,
                         name='dbp15k_ja-en',
                         train_ratio=train_ratio,
                         dtype=dtype,
                         seed=seed)

    def _check_integrity(self, root: Union[str, Path]) -> bool:
        # Presence check only: no checksum beyond file existence is verified.
        return os.path.exists(os.path.join(root, 'dbp15k_ja-en.pt'))
class DBP15K_ZH_EN(Dataset):
    """A pair of Chinese to English version of multi-lingual DBpedia networks.

    The dataset is proposed by the `"Cross-lingual Entity Alignment via Joint
    Attribute-Preserving Embedding" <https://arxiv.org/abs/1708.05045>`_ paper,
    and the node attributes are given by pre-trained and aligned monolingual
    word embeddings from the `"Cross-lingual Knowledge Graph Alignment via
    Graph Matching Neural Network" <https://arxiv.org/abs/1905.11605>`_ paper.
    There are 15,000 pairs of aligned entities in DBP15K (Chinese to English).

    .. list-table::
        :widths: 10 10 10 10 10
        :header-rows: 1

        * - Graph
          - #nodes
          - #edges
          - #node attrs
          - #edge attrs
        * - ZH
          - 19,388
          - 70,414
          - 300
          - 0
        * - EN
          - 19,572
          - 95,142
          - 300
          - 0
    """

    def __init__(self,
                 root: Union[str, Path],
                 download: bool = False,
                 train_ratio: float = 0.2,
                 dtype: torch.dtype = torch.float32,
                 seed: int = 0):
        """Load (and optionally fetch) the ZH-EN pair of DBP15K.

        Args:
            root: Directory that contains (or will contain) ``dbp15k_zh-en.pt``.
            download: If True, download the serialized dataset from Google
                Drive into ``root`` before loading.
            train_ratio: Fraction of aligned anchor pairs used for training.
            dtype: Floating-point dtype for node attribute tensors.
            seed: Random seed for the train/test anchor split.

        Raises:
            RuntimeError: If the dataset file is absent or unreadable and
                ``download`` is False (or the download failed).
        """
        if download:
            download_file_from_google_drive(
                remote_file_id='1ccub0If-q6_6MotYVOyFZzSc5SDJ2Izr',
                save_filename='dbp15k_zh-en.pt',
                root=root)
        # Verify the file landed in `root` before handing off to the base
        # class, so a missing/failed download surfaces as a clear error here.
        if not self._check_integrity(root):
            raise RuntimeError('DBP15K_ZH-EN dataset not found or corrupted. You can use download=True to download it')
        super().__init__(root=root,
                         name='dbp15k_zh-en',
                         train_ratio=train_ratio,
                         dtype=dtype,
                         seed=seed)

    def _check_integrity(self, root: Union[str, Path]) -> bool:
        # Presence check only: no checksum beyond file existence is verified.
        return os.path.exists(os.path.join(root, 'dbp15k_zh-en.pt'))