Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ source code at: `src\models`
| DA-MRS | [Improving Multi-modal Recommender Systems by Denoising and Aligning Multi-modal Content and User Feedback](https://dl.acm.org/doi/10.1145/3637528.3671703) | KDD'24 | damrs.py |
| SMORE | [Spectrum-based Modality Representation Fusion Graph Convolutional Network for Multimodal Recommendation](https://arxiv.org/abs/2412.14978) | WSDM'25 | smore.py |
| PGL | [Mind Individual Information! Principal Graph Learning for Multimedia Recommendation](https://ojs.aaai.org/index.php/AAAI/article/view/33429) | AAAI'25 | pgl.py |
| MMGCF | [Multimodal Graph Collaborative Filtering for Recommendation with Graph Convolutional Networks](https://dl.acm.org/doi/abs/10.1145/3774935.3806158) | UMAP'26 | mmgcf.py |


#### Please consider to cite our paper if this framework helps you, thanks:
Expand Down
39 changes: 39 additions & 0 deletions src/configs/model/MMGCF.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# MMGCF – Multimodal Graph Collaborative Filtering
# ──────────────────────────────────────────────────────────────────────────
# Model hyperparameters
# ──────────────────────────────────────────────────────────────────────────

# Dimension of user/item ID embeddings (LightGCN output size).
embedding_size: 64

# Output dimension of the visual/text feature projection layers.
# For non-concat fusion modes (mean, sum), this MUST equal
# embedding_size so that ID and modal embeddings can be combined
# element-wise. Only concat mode handles differing dimensions natively.
feat_embed_dim: 64

# Number of LightGCN propagation layers over the user-item graph.
n_ui_layers: [1,2,3]

# L2 regularisation weight applied to initial (un-propagated) ID embeddings.
reg_weight: [1e-1, 1e-2, 1e-3, 1e-4]

# How to combine the ID embedding with multimodal feature embeddings.
# Options: mean | sum | concat
fusion_mode: ['mean','sum','concat']

# How to weight ID vs. modal embeddings before/during fusion.
# equal – fuse modalities first (mean/sum/prod/concat), then fuse
# with ID embedding at equal weight [recommended default]
# alpha – learnable sigmoid gate α; ID emb scaled by α,
# each modal feat scaled by (1−α), then jointly fused
# normalized – L2-normalise every embedding before fusing; ID emb is
# additionally scaled by n_modalities for balance
weighting: ['equal', 'alpha', 'normalized']

# Edge dropout probability applied before each training epoch.
# Edges are sampled proportional to their normalised adjacency weight
# (degree-sensitive pruning). Set to 0.0 to disable.
dropout: [0.2, 0.5, 0.8]

hyper_parameters: ['n_ui_layers', 'reg_weight', 'fusion_mode', 'weighting', 'dropout']
289 changes: 289 additions & 0 deletions src/models/mmgcf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
# coding: utf-8
"""
MMGCF: Multimodal Graph Collaborative Filtering
Late-fusion of LightGCN ID embeddings with multimodal item features.

Fusion modes : mean | sum | concat
Weighting modes: equal | alpha | normalized

Adapted for the MMRec framework.
NOTE: For non-concat fusion modes, feat_embed_dim must equal embedding_size
so that ID and modal embeddings can be stacked/added element-wise.
"""

import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F

from common.abstract_recommender import GeneralRecommender


class MMGCF(GeneralRecommender):
def __init__(self, config, dataset):
super(MMGCF, self).__init__(config, dataset)

self.embedding_dim = config['embedding_size']
self.feat_embed_dim = config['feat_embed_dim']
self.n_ui_layers = config['n_ui_layers']
self.reg_weight = config['reg_weight']
self.fusion_mode = config['fusion_mode'] # mean | sum | concat
self.weighting = config['weighting'] # equal | alpha | normalized
self.dropout = config['dropout']

self.n_nodes = self.n_users + self.n_items

# ── interaction matrix & adjacency ────────────────────────────────
self.interaction_matrix = dataset.inter_matrix(form='coo').astype(np.float32)
self.norm_adj = self.get_norm_adj_mat().to(self.device)
self.masked_adj = None
self.edge_indices, self.edge_values = self.get_edge_info()
self.edge_indices = self.edge_indices.to(self.device)
self.edge_values = self.edge_values.to(self.device)

# ── ID embeddings ─────────────────────────────────────────────────
self.user_embedding = nn.Embedding(self.n_users, self.embedding_dim)
self.item_id_embedding = nn.Embedding(self.n_items, self.embedding_dim)
nn.init.xavier_uniform_(self.user_embedding.weight)
nn.init.xavier_uniform_(self.item_id_embedding.weight)

# ── multimodal feature embeddings & projection layers ─────────────
# Raw pretrained features are stored in freezed Embeddings;
# a learnable Linear layer projects them to feat_embed_dim on every forward pass.
self.n_modalities = 0
if self.v_feat is not None:
self.image_embedding = nn.Embedding.from_pretrained(self.v_feat, freeze=True)
self.image_trs = nn.Linear(self.v_feat.shape[1], self.feat_embed_dim)
self.n_modalities += 1
if self.t_feat is not None:
self.text_embedding = nn.Embedding.from_pretrained(self.t_feat, freeze=True)
self.text_trs = nn.Linear(self.t_feat.shape[1], self.feat_embed_dim)
self.n_modalities += 1

# ── weighting: learnable α scalar (only used when weighting='alpha') ──
if self.weighting == 'alpha':
# α = sigmoid(mm_alpha); item_emb weighted by α, mm feats by (1-α)
self.mm_alpha = nn.Parameter(torch.tensor(0.0))

# ── concat fusion layers ──────────────────────────────────────────
# Three separate Linear layers are used so each concat step has its own
# fixed input size, avoiding the LazyLinear shape-collision bug in the
# original code when equal+concat are combined.
if self.fusion_mode == 'concat' and self.n_modalities > 0:
# Used by 'alpha' and 'normalized': concat [id_emb, mm1, ..., mmN]
all_in = self.embedding_dim + self.n_modalities * self.feat_embed_dim
self.all_concat_layer = nn.Linear(all_in, self.embedding_dim)

# Used by 'equal' step-1: concat all modality feats (only when >1 mod)
if self.n_modalities > 1:
mm_in = self.n_modalities * self.feat_embed_dim
self.mm_concat_layer = nn.Linear(mm_in, self.feat_embed_dim)

# Used by 'equal' step-2: concat [id_emb, fused_mm]
id_mm_in = self.embedding_dim + self.feat_embed_dim
self.id_mm_concat_layer = nn.Linear(id_mm_in, self.embedding_dim)

# ══════════════════════════════════════════════════════════════════════
# Graph helpers
# ══════════════════════════════════════════════════════════════════════

def get_norm_adj_mat(self):
"""Build symmetric normalised Laplacian for the user-item bipartite graph."""
A = sp.dok_matrix(
(self.n_users + self.n_items, self.n_users + self.n_items),
dtype=np.float32,
)
inter_M = self.interaction_matrix
inter_M_t = self.interaction_matrix.transpose()
# fill upper-right and lower-left blocks
data_dict = dict(zip(zip(inter_M.row, inter_M.col + self.n_users),
[1] * inter_M.nnz))
data_dict.update(dict(zip(zip(inter_M_t.row + self.n_users, inter_M_t.col),
[1] * inter_M_t.nnz)))
# A._update(data_dict)
for (row, col), value in data_dict.items():
A[row, col] = value
# D^{-1/2} A D^{-1/2}
sumArr = (A > 0).sum(axis=1)
diag = np.power(np.array(sumArr.flatten())[0] + 1e-7, -0.5)
D = sp.diags(diag)
L = sp.coo_matrix(D * A * D)
i = torch.LongTensor(np.array([L.row, L.col]))
data = torch.FloatTensor(L.data)
return torch.sparse.FloatTensor(i, data, torch.Size((self.n_nodes, self.n_nodes)))

def _normalize_adj_m(self, indices, adj_size):
"""Compute D_u^{-1/2} * D_i^{-1/2} edge weights for a bipartite sub-graph."""
adj = torch.sparse.FloatTensor(
indices, torch.ones_like(indices[0], dtype=torch.float32), adj_size
)
row_sum = 1e-7 + torch.sparse.sum(adj, -1).to_dense()
col_sum = 1e-7 + torch.sparse.sum(adj.t(), -1).to_dense()
return torch.pow(row_sum, -0.5)[indices[0]] * torch.pow(col_sum, -0.5)[indices[1]]

def get_edge_info(self):
rows = torch.from_numpy(self.interaction_matrix.row)
cols = torch.from_numpy(self.interaction_matrix.col)
edges = torch.stack([rows, cols]).type(torch.LongTensor)
vals = self._normalize_adj_m(edges, torch.Size((self.n_users, self.n_items)))
return edges, vals

def pre_epoch_processing(self):
"""
Degree-sensitive edge dropout (disabled when dropout <= 0).
Edges with higher normalised weight are more likely to be kept,
encouraging the model to retain informative interactions.
"""
if self.dropout <= 0.0:
self.masked_adj = self.norm_adj
return
degree_len = int(self.edge_values.size(0) * (1.0 - self.dropout))
degree_idx = torch.multinomial(self.edge_values, degree_len)
keep_idx = self.edge_indices[:, degree_idx]
keep_vals = self._normalize_adj_m(keep_idx, torch.Size((self.n_users, self.n_items)))
all_vals = torch.cat((keep_vals, keep_vals))
keep_idx[1] += self.n_users # shift item indices into the joint space
all_idx = torch.cat((keep_idx, torch.flip(keep_idx, [0])), dim=1)
self.masked_adj = torch.sparse.FloatTensor(
all_idx, all_vals, self.norm_adj.shape
).to(self.device)

# ══════════════════════════════════════════════════════════════════════
# LightGCN propagation
# ══════════════════════════════════════════════════════════════════════

def lightgcn_propagate(self, adj):
"""
Standard LightGCN: layer-wise neighbourhood aggregation over the
user-item graph, followed by mean-pooling across all layer outputs.
Returns (user_emb, item_emb) each of shape (N, embedding_dim).
"""
ego = torch.cat(
[self.user_embedding.weight, self.item_id_embedding.weight], dim=0
)
layers = [ego]
for _ in range(self.n_ui_layers):
ego = torch.sparse.mm(adj, ego)
layers.append(ego)
# mean of all layer embeddings (LightGCN aggregation)
out = torch.stack(layers, dim=1).mean(dim=1)
return torch.split(out, [self.n_users, self.n_items], dim=0)

# ══════════════════════════════════════════════════════════════════════
# Multimodal fusion
# ══════════════════════════════════════════════════════════════════════

def _get_mm_feats(self):
"""Project raw pretrained features into feat_embed_dim."""
feats = []
if self.v_feat is not None:
feats.append(self.image_trs(self.image_embedding.weight))
if self.t_feat is not None:
feats.append(self.text_trs(self.text_embedding.weight))
return feats # list of (n_items, feat_embed_dim) tensors

def _apply_fusion(self, tensors, concat_layer=None):
"""
Fuse a list of tensors according to self.fusion_mode.

mean / sum — element-wise ops; all tensors must share shape.
concat — cat along last dim, then project via concat_layer.
"""
if self.fusion_mode == 'mean':
return torch.stack(tensors).mean(dim=0)
elif self.fusion_mode == 'sum':
return torch.stack(tensors).sum(dim=0)
else: # concat
return concat_layer(torch.cat(tensors, dim=-1))

def fuse_item_embeddings(self, item_emb):
"""
Late-fuse LightGCN item embeddings with multimodal features.

Weighting strategies
--------------------
alpha : learnable sigmoid gate; item_emb scaled by α,
each modal feat scaled by (1−α), then jointly fused.
normalized : L2-normalise every embedding before fusing; the ID
embedding is additionally scaled by n_modalities so
all modalities contribute roughly equally.
equal : two-stage — first fuse modalities together, then
fuse the result with the ID embedding (equal weight).
"""
mm_feats = self._get_mm_feats()
if not mm_feats:
return item_emb

if self.weighting == 'alpha':
alpha = torch.sigmoid(self.mm_alpha)
tensors = [item_emb * alpha] + [f * (1.0 - alpha) for f in mm_feats]
return self._apply_fusion(
tensors,
concat_layer=self.all_concat_layer if self.fusion_mode == 'concat' else None,
)

elif self.weighting == 'normalized':
tensors = (
[F.normalize(item_emb) * self.n_modalities]
+ [F.normalize(f) for f in mm_feats]
)
return self._apply_fusion(
tensors,
concat_layer=self.all_concat_layer if self.fusion_mode == 'concat' else None,
)

else: # equal
# Step 1: reduce multiple modalities to a single mm embedding
if len(mm_feats) > 1:
mm_fused = self._apply_fusion(
mm_feats,
concat_layer=self.mm_concat_layer if self.fusion_mode == 'concat' else None,
)
else:
mm_fused = mm_feats[0] # single modality — no reduction needed

# Step 2: combine ID embedding with the fused modal embedding
return self._apply_fusion(
[item_emb, mm_fused],
concat_layer=self.id_mm_concat_layer if self.fusion_mode == 'concat' else None,
)

# ══════════════════════════════════════════════════════════════════════
# Forward pass
# ══════════════════════════════════════════════════════════════════════

def forward(self, adj):
user_emb, item_emb = self.lightgcn_propagate(adj)
item_emb = self.fuse_item_embeddings(item_emb)
return user_emb, item_emb

# ══════════════════════════════════════════════════════════════════════
# Loss & prediction
# ══════════════════════════════════════════════════════════════════════

def bpr_loss(self, users, pos_items, neg_items):
pos_scores = (users * pos_items).sum(dim=1)
neg_scores = (users * neg_items).sum(dim=1)
return -F.logsigmoid(pos_scores - neg_scores).mean()

def calculate_loss(self, interaction):
users, pos_items, neg_items = interaction[0], interaction[1], interaction[2]

ua_emb, ia_emb = self.forward(self.masked_adj)
mf_loss = self.bpr_loss(ua_emb[users], ia_emb[pos_items], ia_emb[neg_items])

# L2 regularisation on the initial (un-propagated) ID embeddings only,
# consistent with the LightGCN / BPR-MF convention.
reg_loss = (
self.user_embedding.weight[users].norm(2).pow(2)
+ self.item_id_embedding.weight[pos_items].norm(2).pow(2)
+ self.item_id_embedding.weight[neg_items].norm(2).pow(2)
) / (2 * len(users))

return mf_loss + self.reg_weight * reg_loss

def full_sort_predict(self, interaction):
user = interaction[0]
user_emb, item_emb = self.forward(self.norm_adj)
return torch.matmul(user_emb[user], item_emb.t())