Source code for rectorch.samplers

r"""The ``samplers`` module that contains definitions of sampler classes useful when training neural
network-based models.

The ``samplers`` module is inspired by the :class:`torch.utils.data.DataLoader` class which,
however, is not really efficient because it outputs a single example at a time. The idea behind the
samplers defined in this module is to treat the data set at batches highly improving the efficiency.
Each new sampler must extend the base class :class:`Sampler` implementing all the abstract special
methods, in particular :meth:`samplers.Sampler.__len__` and :meth:`samplers.Sampler.__iter__`.
"""
import numpy as np
from scipy.sparse import csr_matrix, hstack
import torch
from torch.autograd import Variable

__all__ = ['Sampler', 'DataSampler', 'ConditionedDataSampler', 'EmptyConditionedDataSampler',
           'BalancedConditionedDataSampler', 'CFGAN_TrainingSampler', 'SVAE_Sampler']

class Sampler():
    r"""Sampler base class.

    A sampler is meant to be used as a generator of batches useful in training neural networks.

    Notes
    -----
    Each new sampler must extend this base class implementing all the abstract special methods,
    in particular :meth:`rectorch.samplers.Sampler.__len__` and
    :meth:`rectorch.samplers.Sampler.__iter__`.
    """
    def __init__(self, *args, **kargs):
        pass

    def __len__(self):
        """Return the number of batches.
        """
        raise NotImplementedError

    def __iter__(self):
        """Iterate through the batches yielding a batch at a time.
        """
        raise NotImplementedError
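# A minimal sketch (not part of the original module) of how a new sampler extends
# :class:`Sampler`: it only has to implement ``__len__`` and ``__iter__``. The toy
# list-based sampler below is hypothetical and purely illustrative.
def _example_custom_sampler():
    class ListSampler(Sampler):
        """Toy sampler that yields fixed-size chunks of a Python list."""
        def __init__(self, data, batch_size=2):
            super(ListSampler, self).__init__()
            self.data = data
            self.batch_size = batch_size

        def __len__(self):
            return int(np.ceil(len(self.data) / self.batch_size))

        def __iter__(self):
            for i in range(0, len(self.data), self.batch_size):
                yield self.data[i:i + self.batch_size]

    for batch in ListSampler([1, 2, 3, 4, 5]):
        print(batch)  # [1, 2], then [3, 4], then [5]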
class DataSampler(Sampler):
    r"""This is a standard sampler that returns batches without any particular constraint.

    Batches are randomly returned with the defined dimension (i.e., ``batch_size``). If
    ``shuffle`` is set to ``False`` then the sampler returns batches with the same order as in
    the original dataset. When ``sparse_data_te`` is defined then each returned batch is a
    :obj:`tuple` with the training part of the batch and its test/validation counterpart.
    Otherwise, if ``sparse_data_te`` is :obj:`None` then the second element of the yielded
    tuple will be :obj:`None`.

    Parameters
    ----------
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        The training sparse user-item rating matrix.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix` [optional]
        The test sparse user-item rating matrix. The shape of this matrix must be the same as
        ``sparse_data_tr``. By default :obj:`None`.
    batch_size : :obj:`int` [optional]
        The size of the batches, by default 1.
    shuffle : :obj:`bool` [optional]
        Whether the data set must be randomly shuffled before creating the batches, by default
        ``True``.

    Attributes
    ----------
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_tr`` parameter.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_te`` parameter.
    batch_size : :obj:`int`
        See ``batch_size`` parameter.
    shuffle : :obj:`bool`
        See ``shuffle`` parameter.
    """
    def __init__(self, sparse_data_tr, sparse_data_te=None, batch_size=1, shuffle=True):
        super(DataSampler, self).__init__()
        self.sparse_data_tr = sparse_data_tr
        self.sparse_data_te = sparse_data_te
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __len__(self):
        return int(np.ceil(self.sparse_data_tr.shape[0] / self.batch_size))

    def __iter__(self):
        n = self.sparse_data_tr.shape[0]
        idxlist = list(range(n))
        if self.shuffle:
            np.random.shuffle(idxlist)

        for start_idx in range(0, n, self.batch_size):
            end_idx = min(start_idx + self.batch_size, n)
            data_tr = self.sparse_data_tr[idxlist[start_idx:end_idx]]
            data_tr = torch.FloatTensor(data_tr.toarray())
            data_te = None
            if self.sparse_data_te is not None:
                data_te = self.sparse_data_te[idxlist[start_idx:end_idx]]
                data_te = torch.FloatTensor(data_te.toarray())
            yield data_tr, data_te
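# Usage sketch (not part of the original module): iterating a DataSampler over a
# hypothetical 4x3 binary rating matrix, two users per batch.
def _example_data_sampler():
    ratings = csr_matrix(np.array([[1., 0., 1.],
                                   [0., 1., 0.],
                                   [1., 1., 0.],
                                   [0., 0., 1.]]))
    sampler = DataSampler(ratings, batch_size=2, shuffle=False)
    print(len(sampler))  # 2 batches
    for data_tr, data_te in sampler:
        # ``data_tr`` is a dense torch.FloatTensor of shape [2, 3];
        # ``data_te`` is None because no test matrix was given.
        print(data_tr.shape, data_te)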
class ConditionedDataSampler(Sampler):
    r"""Data sampler with conditioned filtering used by the :class:`rectorch.models.CMultiVAE`
    model.

    This data sampler is useful when training the :class:`rectorch.models.CMultiVAE` model
    described in [CVAE]_. During the training, each user must be conditioned over all the
    possible conditions (actually, only the ones that the user knows) so the training set must
    be modified accordingly.

    Parameters
    ----------
    iid2cids : :obj:`dict` (key :obj:`int` - value :obj:`list` of :obj:`int`)
        Dictionary that maps each item to the list of all valid conditions for that item. Items
        are referred to with the inner id, and conditions with an integer in the range 0,
        ``n_cond`` -1.
    n_cond : :obj:`int`
        Number of possible conditions.
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        The training sparse user-item rating matrix.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix` [optional]
        The test sparse user-item rating matrix. The shape of this matrix must be the same as
        ``sparse_data_tr``. By default :obj:`None`.
    batch_size : :obj:`int` [optional]
        The size of the batches, by default 1.
    shuffle : :obj:`bool` [optional]
        Whether the data set must be randomly shuffled before creating the batches, by default
        ``True``.

    Attributes
    ----------
    iid2cids : :obj:`dict` (key :obj:`int` - value :obj:`list` of :obj:`int`)
        See ``iid2cids`` parameter.
    n_cond : :obj:`int`
        See ``n_cond`` parameter.
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_tr`` parameter.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_te`` parameter.
    batch_size : :obj:`int`
        See ``batch_size`` parameter.
    shuffle : :obj:`bool`
        See ``shuffle`` parameter.

    References
    ----------
    .. [CVAE] Tommaso Carraro, Mirko Polato and Fabio Aiolli. Conditioned Variational
       Autoencoder for top-N item recommendation, 2020. arXiv pre-print:
       https://arxiv.org/abs/2004.11141
    """
    def __init__(self, iid2cids, n_cond, sparse_data_tr, sparse_data_te=None, batch_size=1,
                 shuffle=True):
        super(ConditionedDataSampler, self).__init__()
        self.sparse_data_tr = sparse_data_tr
        self.sparse_data_te = sparse_data_te
        self.iid2cids = iid2cids
        self.batch_size = batch_size
        self.n_cond = n_cond
        self.shuffle = shuffle
        self._compute_conditions()

    def _compute_conditions(self):
        r2cond = {}
        for i, row in enumerate(self.sparse_data_tr):
            _, cols = row.nonzero()
            r2cond[i] = set.union(*[set(self.iid2cids[c]) for c in cols])

        self.examples = [(r, -1) for r in r2cond]
        self.examples += [(r, c) for r in r2cond for c in r2cond[r]]
        self.examples = np.array(self.examples)
        del r2cond

        rows = [m for m in self.iid2cids for _ in range(len(self.iid2cids[m]))]
        cols = [g for m in self.iid2cids for g in self.iid2cids[m]]
        values = np.ones(len(rows))
        self.M = csr_matrix((values, (rows, cols)), shape=(len(self.iid2cids), self.n_cond))

    def __len__(self):
        return int(np.ceil(len(self.examples) / self.batch_size))

    def __iter__(self):
        n = len(self.examples)
        idxlist = list(range(n))
        if self.shuffle:
            np.random.shuffle(idxlist)

        for start_idx in range(0, n, self.batch_size):
            end_idx = min(start_idx + self.batch_size, n)
            ex = self.examples[idxlist[start_idx:end_idx]]

            rows, cols = [], []
            for i, (_, c) in enumerate(ex):
                if c >= 0:
                    rows.append(i)
                    cols.append(c)

            values = np.ones(len(rows))
            cond_matrix = csr_matrix((values, (rows, cols)), shape=(len(ex), self.n_cond))

            rows_ = [r for r, _ in ex]
            data_tr = hstack([self.sparse_data_tr[rows_], cond_matrix], format="csr")

            if self.sparse_data_te is None:
                self.sparse_data_te = self.sparse_data_tr

            for i, (_, c) in enumerate(ex):
                if c < 0:
                    rows += [i] * self.n_cond
                    cols += range(self.n_cond)

            values = np.ones(len(rows))
            cond_matrix = csr_matrix((values, (rows, cols)), shape=(len(ex), self.n_cond))
            filtered = cond_matrix.dot(self.M.transpose().tocsr()) > 0
            data_te = self.sparse_data_te[rows_].multiply(filtered)

            filter_idx = np.diff(data_te.indptr) != 0
            data_te = data_te[filter_idx]
            data_tr = data_tr[filter_idx]

            data_te = torch.FloatTensor(data_te.toarray())
            data_tr = torch.FloatTensor(data_tr.toarray())
            yield data_tr, data_te
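# Usage sketch (not part of the original module) with hypothetical data: two items,
# item 0 tagged with condition 0 and item 1 with conditions 0 and 1 (``n_cond=2``).
# Each yielded training batch has the ``n_cond`` condition flags appended to the
# rating rows, so its width is ``n_items + n_cond``.
def _example_conditioned_sampler():
    iid2cids = {0: [0], 1: [0, 1]}
    ratings = csr_matrix(np.array([[1., 1.],
                                   [0., 1.]]))
    sampler = ConditionedDataSampler(iid2cids, 2, ratings, batch_size=2, shuffle=False)
    for data_tr, data_te in sampler:
        print(data_tr.shape[1])  # 4 = 2 items + 2 condition flags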
class BalancedConditionedDataSampler(ConditionedDataSampler):
    r"""Sub-sampled version of the :class:`ConditionedDataSampler`.

    This data sampler is useful when training the :class:`rectorch.models.CMultiVAE` model
    described in [CVAE]_. During the training, each user must be conditioned over all the
    possible conditions (actually, only the ones that the user knows) so the training set must
    be modified accordingly. This sampler avoids creating all possible user-condition pairs by
    sub-sampling. The extent of this sub-sampling is defined by the parameter ``subsample``.
    The prefix 'Balanced' is due to the way the sub-sampling is performed: given a user *u*,
    for each condition *c* only a ``subsample`` fraction of the conditioned training examples
    is created for *u* conditioned by *c*.

    Parameters
    ----------
    iid2cids : :obj:`dict` (key :obj:`int` - value :obj:`list` of :obj:`int`)
        Dictionary that maps each item to the list of all valid conditions for that item. Items
        are referred to with the inner id, and conditions with an integer in the range 0,
        ``n_cond`` -1.
    n_cond : :obj:`int`
        Number of possible conditions.
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        The training sparse user-item rating matrix.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix` [optional]
        The test sparse user-item rating matrix. The shape of this matrix must be the same as
        ``sparse_data_tr``. By default :obj:`None`.
    batch_size : :obj:`int` [optional]
        The size of the batches, by default 1.
    shuffle : :obj:`bool` [optional]
        Whether the data set must be randomly shuffled before creating the batches, by default
        ``True``.
    subsample : :obj:`float` [optional]
        The fraction of conditioned training examples to sample. It must be a float in
        (0, 1], by default 0.2.

    Attributes
    ----------
    iid2cids : :obj:`dict` (key :obj:`int` - value :obj:`list` of :obj:`int`)
        See ``iid2cids`` parameter.
    n_cond : :obj:`int`
        See ``n_cond`` parameter.
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_tr`` parameter.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_te`` parameter.
    batch_size : :obj:`int`
        See ``batch_size`` parameter.
    shuffle : :obj:`bool`
        See ``shuffle`` parameter.
    subsample : :obj:`float`
        See ``subsample`` parameter.

    References
    ----------
    .. [CVAE] Tommaso Carraro, Mirko Polato and Fabio Aiolli. Conditioned Variational
       Autoencoder for top-N item recommendation, 2020. arXiv pre-print:
       https://arxiv.org/abs/2004.11141
    """
    def __init__(self, iid2cids, n_cond, sparse_data_tr, sparse_data_te=None, batch_size=1,
                 shuffle=True, subsample=.2):
        # Note: ``shuffle`` was documented but missing from the original signature;
        # it is added here and forwarded to the parent class to match the docstring.
        super(BalancedConditionedDataSampler, self).__init__(iid2cids, n_cond, sparse_data_tr,
                                                             sparse_data_te, batch_size, shuffle)
        self.subsample = subsample
        self._compute_sampled_conditions()

    def _compute_conditions(self):
        r2cond = {}
        for i, row in enumerate(self.sparse_data_tr):
            _, cols = row.nonzero()
            r2cond[i] = set.union(*[set(self.iid2cids[c]) for c in cols])

        self.examples = {-1 : list(r2cond.keys())}
        for c in range(self.n_cond):
            self.examples[c] = []
            for r in r2cond:
                if c in r2cond[r]:
                    self.examples[c].append(r)
        del r2cond

        self.num_cond_examples = sum([len(self.examples[c]) for c in range(self.n_cond)])

        rows = [m for m in self.iid2cids for _ in range(len(self.iid2cids[m]))]
        cols = [g for m in self.iid2cids for g in self.iid2cids[m]]
        values = np.ones(len(rows))
        self.M = csr_matrix((values, (rows, cols)), shape=(len(self.iid2cids), self.n_cond))

    def _compute_sampled_conditions(self):
        data = [(r, -1) for r in self.examples[-1]]
        m = int(self.num_cond_examples * self.subsample / self.n_cond)
        for c in range(self.n_cond):
            data += [(r, c) for r in np.random.choice(self.examples[c], m)]
        self.examples = np.array(data)

    def __len__(self):
        m = int(self.num_cond_examples * self.subsample) + self.sparse_data_tr.shape[0]
        return int(np.ceil(m / self.batch_size))
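# Usage sketch (not part of the original module): same hypothetical data as above,
# but keeping only a 0.5 fraction of the user-condition pairs. With ``n_cond=2``,
# roughly ``num_cond_examples * 0.5 / 2`` examples are drawn per condition, so each
# condition contributes the same number of training examples (hence 'Balanced').
def _example_balanced_sampler():
    iid2cids = {0: [0], 1: [0, 1]}
    ratings = csr_matrix(np.array([[1., 1.],
                                   [0., 1.]]))
    sampler = BalancedConditionedDataSampler(iid2cids, 2, ratings, batch_size=2, subsample=0.5)
    print(len(sampler))  # 2 batches: 2 unconditioned + 2 sampled conditioned examples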
class EmptyConditionedDataSampler(Sampler):
    r"""Data sampler that returns unconditioned batches used by the
    :class:`rectorch.models.CMultiVAE` model.

    This data sampler is useful when training the :class:`rectorch.models.CMultiVAE` model
    described in [CVAE]_. This sampler is very similar to :class:`DataSampler` with the
    exception that the yielded batches have a zero matrix of size ``batch_size``
    :math:`\times` ``cond_size`` appended.

    Parameters
    ----------
    cond_size : :obj:`int`
        Number of possible conditions.
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        The training sparse user-item rating matrix.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix` [optional]
        The test sparse user-item rating matrix. The shape of this matrix must be the same as
        ``sparse_data_tr``. By default :obj:`None`.
    batch_size : :obj:`int` [optional]
        The size of the batches, by default 1.
    shuffle : :obj:`bool` [optional]
        Whether the data set must be randomly shuffled before creating the batches, by default
        ``True``.

    Attributes
    ----------
    cond_size : :obj:`int`
        See ``cond_size`` parameter.
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_tr`` parameter.
    sparse_data_te : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_te`` parameter.
    batch_size : :obj:`int`
        See ``batch_size`` parameter.
    shuffle : :obj:`bool`
        See ``shuffle`` parameter.

    References
    ----------
    .. [CVAE] Tommaso Carraro, Mirko Polato and Fabio Aiolli. Conditioned Variational
       Autoencoder for top-N item recommendation, 2020. arXiv pre-print:
       https://arxiv.org/abs/2004.11141
    """
    def __init__(self, cond_size, sparse_data_tr, sparse_data_te=None, batch_size=1,
                 shuffle=True):
        super(EmptyConditionedDataSampler, self).__init__()
        self.sparse_data_tr = sparse_data_tr
        self.sparse_data_te = sparse_data_te
        self.batch_size = batch_size
        self.cond_size = cond_size
        self.shuffle = shuffle

    def __len__(self):
        return int(np.ceil(self.sparse_data_tr.shape[0] / self.batch_size))

    def __iter__(self):
        n = self.sparse_data_tr.shape[0]
        idxlist = list(range(n))
        if self.shuffle:
            np.random.shuffle(idxlist)

        for start_idx in range(0, n, self.batch_size):
            end_idx = min(start_idx + self.batch_size, n)
            data_tr = self.sparse_data_tr[idxlist[start_idx:end_idx]]
            cond_matrix = csr_matrix((data_tr.shape[0], self.cond_size))
            data_tr = hstack([data_tr, cond_matrix], format="csr")
            data_tr = torch.FloatTensor(data_tr.toarray())

            if self.sparse_data_te is None:
                self.sparse_data_te = self.sparse_data_tr
            data_te = self.sparse_data_te[idxlist[start_idx:end_idx]]
            data_te = torch.FloatTensor(data_te.toarray())

            yield data_tr, data_te
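# Usage sketch (not part of the original module): the appended condition block is all
# zeros, so each training row is just the rating row padded with ``cond_size`` zeros.
# The identity rating matrix below is hypothetical.
def _example_empty_conditioned_sampler():
    ratings = csr_matrix(np.eye(3))
    sampler = EmptyConditionedDataSampler(2, ratings, batch_size=3, shuffle=False)
    data_tr, data_te = next(iter(sampler))
    print(data_tr.shape)  # torch.Size([3, 5]) = 3 items + 2 zeroed condition flags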
class CFGAN_TrainingSampler(Sampler):
    r"""Sampler used for training the generator and discriminator of the CFGAN model.

    The peculiarity of this sampler (see [CFGAN]_ for more details) is that batches are
    continuously picked at random from the whole training set.

    Parameters
    ----------
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        The training sparse user-item rating matrix.
    batch_size : :obj:`int` [optional]
        The size of the batches, by default 64.

    Attributes
    ----------
    sparse_data_tr : :obj:`scipy.sparse.csr_matrix`
        See ``sparse_data_tr`` parameter.
    batch_size : :obj:`int`
        See ``batch_size`` parameter.
    idxlist : :obj:`list` of :obj:`int`
        Shuffled list of indexes. After an iteration over the sampler, or after a call to the
        :func:`next` function, ``idxlist`` contains, in the first ``batch_size`` positions,
        the indexes of the examples that are contained in the current batch.

    References
    ----------
    .. [CFGAN] Dong-Kyu Chae, Jin-Soo Kang, Sang-Wook Kim, and Jung-Tae Lee. 2018. CFGAN: A
       Generic Collaborative Filtering Framework based on Generative Adversarial Networks.
       In Proceedings of the 27th ACM International Conference on Information and Knowledge
       Management (CIKM ’18). Association for Computing Machinery, New York, NY, USA, 137-146.
       DOI: https://doi.org/10.1145/3269206.3271743
    """
    def __init__(self, sparse_data_tr, batch_size=64):
        super(CFGAN_TrainingSampler, self).__init__()
        self.sparse_data_tr = sparse_data_tr
        self.batch_size = batch_size
        n = self.sparse_data_tr.shape[0]
        self.idxlist = list(range(n))

    def __len__(self):
        return int(np.ceil(self.sparse_data_tr.shape[0] / self.batch_size))

    def __iter__(self):
        return self

    def __next__(self):
        np.random.shuffle(self.idxlist)
        data_tr = self.sparse_data_tr[self.idxlist[:self.batch_size]]
        return torch.FloatTensor(data_tr.toarray())
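# Usage sketch (not part of the original module): because ``__next__`` re-shuffles and
# returns a fresh random batch on every call, the sampler can be queried indefinitely
# with :func:`next` during adversarial training. The toy matrix is hypothetical.
def _example_cfgan_sampler():
    ratings = csr_matrix(np.eye(4))
    sampler = CFGAN_TrainingSampler(ratings, batch_size=2)
    for _ in range(3):  # three independent random batches
        batch = next(sampler)
        print(batch.shape)  # torch.Size([2, 4])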
class SVAE_Sampler(Sampler):
    r"""Sampler used for training SVAE.

    This sampler yields pairs (``x``, ``y``) where ``x`` is the tensor of indexes of the
    positive items, and ``y`` the target tensor with the (multi-hot) ground truth items.
    This sampler is characterized by batches of size one (a single user at a time). Given a
    user (batch) *u* the returned ground truth tensor is a 3D tensor of dimension
    :math:`1 \times |\mathcal{I}_u|-1 \times m`, where :math:`|\mathcal{I}_u|` is the number
    of items rated by *u*, and *m* the number of items. This tensor represents the ground
    truth for *u* over time, and each slice of the tensor is a different time step across all
    the possible time units for this specific user.

    Parameters
    ----------
    num_items : :obj:`int`
        Number of items.
    dict_data_tr : :obj:`dict` (key - :obj:`int`, value - :obj:`list` of :obj:`int`)
        Dictionary containing the training set. Keys are the users, while the values are the
        lists of items rated by the users in a specific (often chronological) order.
    dict_data_te : :obj:`dict` (key - :obj:`int`, value - :obj:`list` of :obj:`int`) or :obj:`None` [optional]
        Dictionary containing the test part of the data set. Keys are the users, while the
        values are the lists of items rated by the users in a specific (often chronological)
        order, by default :obj:`None`. If :obj:`None` it is not considered in the batch
        creation, otherwise it is used in the construction of the ground truth. Note that
        ``dict_data_te`` must be valued only in the case of validation/test, i.e., when
        ``is_training`` is ``False``.
    pred_type : :obj:`str` in the set {``'next_k'``, ``'next'``, ``'postfix'``} [optional]
        The variant of loss used by the model, by default ``'next_k'``. If ``'next'`` then
        only the next item must be predicted, if ``'next_k'`` the next *k* items are
        considered in the ground truth, otherwise (= ``'postfix'``) all the remaining items
        are taken as ground truth.
    k : :obj:`int` [optional]
        The number of items to predict in the ``'next_k'`` variant, by default 1. This
        parameter is not considered when ``pred_type`` is not ``'next_k'``.
    shuffle : :obj:`bool` [optional]
        Whether the data set must be randomly shuffled before creating the batches, by default
        ``True``.
    is_training : :obj:`bool` [optional]
        Whether the sampler is used during training, by default ``True``.

    Attributes
    ----------
    See *Parameters* section.
    """
    def __init__(self, num_items, dict_data_tr, dict_data_te=None, pred_type="next_k", k=1,
                 shuffle=True, is_training=True):
        super(SVAE_Sampler, self).__init__()
        if pred_type == "next_k":
            assert k >= 1, "If pred_type == 'next_k' then 'k' must be a positive integer."
        self.pred_type = pred_type
        self.dict_data_tr = dict_data_tr
        self.dict_data_te = dict_data_te
        self.shuffle = shuffle
        self.num_items = num_items
        self.k = k
        self.is_training = is_training

    def __len__(self):
        return len(self.dict_data_tr)

    def __iter__(self):
        idxlist = list(range(len(self.dict_data_tr)))
        if self.shuffle:
            np.random.shuffle(idxlist)

        for user in idxlist:
            ulen = len(self.dict_data_tr[user])
            y_batch_s = torch.zeros(1, ulen - 1, self.num_items)

            if self.is_training:
                if self.pred_type == 'next':
                    for timestep in range(ulen - 1):
                        idx = self.dict_data_tr[user][timestep + 1]
                        y_batch_s[0, timestep, idx] = 1.
                elif self.pred_type == 'next_k':
                    for timestep in range(ulen - 1):
                        idx = self.dict_data_tr[user][timestep + 1:][:self.k]
                        y_batch_s[0, timestep, idx] = 1.
                elif self.pred_type == 'postfix':
                    for timestep in range(ulen - 1):
                        idx = self.dict_data_tr[user][timestep + 1:]
                        y_batch_s[0, timestep, idx] = 1.
            else:
                y_batch_s = torch.zeros(1, 1, self.num_items)
                y_batch_s[0, 0, self.dict_data_te[user]] = 1.

            x_batch = [self.dict_data_tr[user][:-1]]  # TODO check this

            x = Variable(torch.LongTensor(x_batch))  # .cuda()
            y = Variable(y_batch_s, requires_grad=False)  # .cuda()

            yield x, y
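# Usage sketch (not part of the original module) with a hypothetical catalogue of 5
# items and one user whose chronological sequence is [0, 2, 4]. With
# ``pred_type='next'`` the ground truth at each time step is the single next item.
def _example_svae_sampler():
    dict_data_tr = {0: [0, 2, 4]}
    sampler = SVAE_Sampler(5, dict_data_tr, pred_type="next", shuffle=False)
    for x, y in sampler:
        print(x.shape)  # torch.Size([1, 2]): the sequence without its last item
        print(y.shape)  # torch.Size([1, 2, 5]): one multi-hot target per time step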