Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions configs/dataset/graph/MUTAG_link_pred.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Dataset loader config
loader:
  _target_: topobench.data.loaders.TUDatasetLoader
  parameters:
    data_domain: graph
    data_type: TUDataset
    data_name: MUTAG
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
  num_features:
    - 7 # initial node features
    - 4 # initial edge features
  num_classes: 2 # edge exists / does not exist
  task: classification
  loss_type: cross_entropy
  monitor_metric: auroc
  task_level: edge
  # Lifting parameters
  max_dim_if_lifted: 3 # maximum dimension of the lifted simplicial complex
  preserve_edge_attr_if_lifted: true

# Split parameters
split_params:
  learning_setting: inductive
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0
  split_type: random # either "k-fold" or "random" strategies
  k: 10 # for "k-fold" Cross-Validation
  train_prop: 0.5 # for "random" strategy splitting
  val_prop: 0.1
  test_prop: 0.1
  is_undirected: true
  neg_sampling_ratio: 1.0 # in val/test datasets
  neg_pos_ratio: 1.0 # in train dataset
  neg_sampling_method: sparse
  task_level: edge

# Dataloader parameters
dataloader_params:
  batch_size: 10
  num_workers: 0
  pin_memory: false
39 changes: 39 additions & 0 deletions configs/dataset/graph/PPI_link_pred.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Dataset loader config
loader:
  _target_: topobench.data.loaders.PPIDatasetLoader
  parameters:
    data_domain: graph
    data_type: PPI
    data_name: PPI
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
  num_features: 50
  num_classes: 2 # edge exists / does not exist
  task: classification
  loss_type: cross_entropy
  monitor_metric: auroc
  task_level: edge

# Split parameters
split_params:
  learning_setting: inductive
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0
  split_type: random # either "k-fold" or "random" strategies
  k: 10 # for "k-fold" Cross-Validation
  train_prop: 0.5 # for "random" strategy splitting
  val_prop: 0.1
  test_prop: 0.1
  is_undirected: true
  neg_sampling_ratio: 1.0 # in val/test datasets
  neg_pos_ratio: 1.0 # in train dataset
  neg_sampling_method: sparse
  task_level: edge

# Dataloader parameters
dataloader_params:
  batch_size: 10
  num_workers: 0
  pin_memory: false
40 changes: 40 additions & 0 deletions configs/dataset/graph/cocitation_cora_link_pred.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Dataset loader config
loader:
  _target_: topobench.data.loaders.PlanetoidDatasetLoader
  parameters:
    data_domain: graph
    data_type: cocitation
    data_name: Cora
    data_dir: ${paths.data_dir}/${dataset.loader.parameters.data_domain}/${dataset.loader.parameters.data_type}

# Dataset parameters
parameters:
  num_features: 1433
  num_classes: 2 # edge exists / does not exist
  num_nodes: 2708
  task: classification
  loss_type: cross_entropy
  monitor_metric: auroc
  task_level: edge

# Split parameters
split_params:
  learning_setting: transductive
  data_split_dir: ${paths.data_dir}/data_splits/${dataset.loader.parameters.data_name}
  data_seed: 0
  split_type: random # either "k-fold" or "random" strategies
  k: 10 # for "k-fold" Cross-Validation
  train_prop: 0.8 # for "random" strategy splitting
  val_prop: 0.1
  test_prop: 0.1
  is_undirected: true
  neg_sampling_ratio: 1.0 # in val/test datasets
  neg_pos_ratio: 1.0 # in train dataset
  neg_sampling_method: sparse
  task_level: edge

# Dataloader parameters
dataloader_params:
  batch_size: 1 # Fixed: transductive setting uses the single full graph
  num_workers: 1
  pin_memory: false
41 changes: 41 additions & 0 deletions configs/model/graph/gat_link_pred.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
_target_: topobench.model.TBModel

model_name: gat
model_domain: graph

# Encoder that projects input cell features into a common hidden dimension.
feature_encoder:
  _target_: topobench.nn.encoders.${model.feature_encoder.encoder_name}
  encoder_name: AllCellFeatureEncoder
  in_channels: ${infer_in_channels:${dataset},${oc.select:transforms,null}}
  out_channels: 32
  proj_dropout: 0.0

# Graph attention network backbone from PyTorch Geometric.
backbone:
  _target_: torch_geometric.nn.models.GAT
  in_channels: ${model.feature_encoder.out_channels}
  hidden_channels: ${model.feature_encoder.out_channels}
  num_layers: 1
  dropout: 0.0
  act: relu
  v2: true # use GATv2-style attention
  heads: 4 # number of attention heads
  concat: true # concatenate head outputs (NOTE(review): assumes hidden_channels is divisible by heads — confirm against PyG GAT)

backbone_wrapper:
  _target_: topobench.nn.wrappers.GNNWrapper
  _partial_: true
  wrapper_name: GNNWrapper
  out_channels: ${model.feature_encoder.out_channels}
  num_cell_dimensions: ${infer_num_cell_dimensions:${oc.select:model.feature_encoder.selected_dimensions,null},${model.feature_encoder.in_channels}}

# Readout head that scores node pairs for link prediction.
readout:
  _target_: topobench.nn.readouts.${model.readout.readout_name}
  readout_name: LinkPredictionReadOut # use NoReadOut if no readout is needed; other option: PropagateSignalDown
  num_cell_dimensions: ${infer_num_cell_dimensions:${oc.select:model.feature_encoder.selected_dimensions,null},${model.feature_encoder.in_channels}} # highest order of cell dimensions to consider
  hidden_dim: ${model.feature_encoder.out_channels}
  out_channels: ${dataset.parameters.num_classes}
  task_level: ${dataset.parameters.task_level}
  pooling_type: sum

# compile model for faster training with pytorch 2.0
compile: false
38 changes: 38 additions & 0 deletions configs/model/graph/gcn_link_pred.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
_target_: topobench.model.TBModel

model_name: gcn
model_domain: graph

# Encoder that projects input cell features into a common hidden dimension.
feature_encoder:
  _target_: topobench.nn.encoders.${model.feature_encoder.encoder_name}
  encoder_name: AllCellFeatureEncoder
  in_channels: ${infer_in_channels:${dataset},${oc.select:transforms,null}}
  out_channels: 64
  proj_dropout: 0.0

# Graph convolutional network backbone from PyTorch Geometric.
backbone:
  _target_: torch_geometric.nn.models.GCN
  in_channels: ${model.feature_encoder.out_channels}
  hidden_channels: ${model.feature_encoder.out_channels}
  num_layers: 2
  dropout: 0.0
  act: relu

backbone_wrapper:
  _target_: topobench.nn.wrappers.GNNWrapper
  _partial_: true
  wrapper_name: GNNWrapper
  out_channels: ${model.feature_encoder.out_channels}
  num_cell_dimensions: ${infer_num_cell_dimensions:${oc.select:model.feature_encoder.selected_dimensions,null},${model.feature_encoder.in_channels}}

# Readout head that scores node pairs for link prediction.
readout:
  _target_: topobench.nn.readouts.${model.readout.readout_name}
  readout_name: LinkPredictionReadOut # use NoReadOut if no readout is needed; other option: PropagateSignalDown
  num_cell_dimensions: ${infer_num_cell_dimensions:${oc.select:model.feature_encoder.selected_dimensions,null},${model.feature_encoder.in_channels}} # highest order of cell dimensions to consider
  hidden_dim: ${model.feature_encoder.out_channels}
  out_channels: ${dataset.parameters.num_classes}
  task_level: ${dataset.parameters.task_level}
  pooling_type: sum

# compile model for faster training with pytorch 2.0
compile: false
6 changes: 4 additions & 2 deletions test/_utils/simplified_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,17 @@ def run(cfg: DictConfig) -> DictConfig:
preprocessor.load_dataset_splits(cfg.dataset.split_params)
)
# Prepare datamodule
if cfg.dataset.parameters.task_level in ["node", "graph"]:
task_level = cfg.dataset.parameters.task_level

if task_level in ["node", "graph", "edge"]:
datamodule = TBDataloader(
dataset_train=dataset_train,
dataset_val=dataset_val,
dataset_test=dataset_test,
**cfg.dataset.get("dataloader_params", {}),
)
else:
raise ValueError("Invalid task_level")
raise ValueError(f"Invalid task_level: {task_level}")

# Model for us is Network + logic: inputs backbone, readout, losses
model = hydra.utils.instantiate(
Expand Down
114 changes: 114 additions & 0 deletions test/data/utils/test_split_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pytest
import numpy as np
import torch
from torch_geometric.data import Data
from unittest.mock import MagicMock, patch
from omegaconf import DictConfig

Expand All @@ -15,6 +16,11 @@
load_inductive_splits,
load_transductive_splits,
assign_train_val_test_mask_to_graphs,
load_edge_transductive_splits,
load_edge_inductive_splits,
)
from topobench.transforms.data_manipulations.negative_links_sampling import (
NegativeSamplingTransform,
)


Expand Down Expand Up @@ -473,3 +479,111 @@ def test_assign_masks(self):
assert len(train_ds) == 5
assert len(val_ds) == 3
assert len(test_ds) == 2

class TestEdgeSplits:
    """Tests for edge-level split utilities."""

    def test_load_edge_transductive_splits_basic(self):
        """Basic sanity check for transductive edge-level splits."""
        # Toy triangle graph: 3 nodes, directed edges 0->1, 1->2, 2->0.
        toy_graph = Data(
            edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]]),
            num_nodes=3,
        )

        # The split loader only needs `.dataset` exposing a single graph.
        fake_preprocessor = MagicMock()
        fake_preprocessor.dataset = [toy_graph]

        cfg = DictConfig(
            {
                "val_prop": 0.2,
                "test_prop": 0.2,
                "is_undirected": True,
                "neg_pos_ratio": 1.0,
                "neg_sampling_method": "sparse",
            }
        )

        train_ds, val_ds, test_ds = load_edge_transductive_splits(
            fake_preprocessor, cfg
        )

        # Transductive setting: each split wraps the same single graph
        # (inside a DataloadDataset).
        for split in (train_ds, val_ds, test_ds):
            assert len(split) == 1

        # Train split stores positive labels only; negatives are added by
        # the dynamic sampling transform at load time.
        train_graph = train_ds.data_lst[0]
        assert hasattr(train_graph, "edge_label_index")
        assert hasattr(train_graph, "edge_label")
        assert train_graph.edge_label.numel() > 0
        assert train_graph.edge_label.unique().tolist() == [1]
        assert isinstance(
            train_ds._dynamic_transform, NegativeSamplingTransform
        )

        # Val/test splits carry static negatives: any labels must be 0/1.
        for eval_split in (val_ds, test_ds):
            eval_graph = eval_split.data_lst[0]
            assert hasattr(eval_graph, "edge_label_index")
            assert hasattr(eval_graph, "edge_label")
            labels = eval_graph.edge_label.unique().tolist()
            assert all(label in (0, 1) for label in labels)

    def test_load_edge_inductive_splits_basic(self):
        """Basic sanity check for inductive edge-level splits."""
        # Nine tiny two-node graphs, each with one undirected edge and a
        # dummy alternating graph label.
        graph_count = 9
        fake_preprocessor = MagicMock()
        fake_preprocessor.dataset = [
            Data(
                edge_index=torch.tensor([[0, 1], [1, 0]]),
                num_nodes=2,
                y=torch.tensor([idx % 2]),
            )
            for idx in range(graph_count)
        ]

        # NOTE(review): relies on `tempfile` and `os` being imported at
        # module level — confirm they are present in the import block.
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictConfig(
                {
                    "split_type": "random",
                    "data_seed": 0,
                    "train_prop": 0.67,
                    "data_split_dir": os.path.join(tmpdir, "data_splits"),
                    "neg_pos_ratio": 1.0,
                    "neg_sampling_method": "sparse",
                    "neg_sampling_ratio": 1.0,
                }
            )

            train_ds, val_ds, test_ds = load_edge_inductive_splits(
                fake_preprocessor, cfg
            )

            # Every input graph must land in exactly one split.
            assert len(train_ds) + len(val_ds) + len(test_ds) == graph_count

            # Train and test should be populated for these proportions.
            assert len(train_ds) > 0
            assert len(test_ds) > 0

            # Train graphs: positives only; negatives sampled dynamically.
            for graph in train_ds.data_lst:
                assert hasattr(graph, "edge_label_index")
                assert hasattr(graph, "edge_label")
                assert graph.edge_label.numel() > 0
                assert graph.edge_label.unique().tolist() == [1]

            assert isinstance(
                train_ds._dynamic_transform, NegativeSamplingTransform
            )

            # Val/test graphs (if non-empty): static negatives merged in,
            # so labels must be 0/1.
            for split in (val_ds, test_ds):
                for graph in split.data_lst:
                    assert hasattr(graph, "edge_label_index")
                    assert hasattr(graph, "edge_label")
                    labels = graph.edge_label.unique().tolist()
                    assert all(label in (0, 1) for label in labels)
Loading