diff --git a/README.md b/README.md index 5d3ab3b..e305d4b 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,20 @@ February 11–15, 2019, Melbourne, VIC, Australia. ACM, New York, NY, USA, ## Example -We provide an example of both variations of RecWalk discussed in the paper. The code is written in Julia version 0.6 (an updated version that runs in current versions of Julia >= 1.0 is coming soon). +We provide an example of both variations of RecWalk discussed in the paper. The repository now includes a Python implementation (`recwalk` package) in addition to the original Julia code. The Python version requires Python 3.8+ along with `numpy` and `scipy`. For simplicity we also provide a split (target item per user alongside 99 randomly sampled unseen items) (yahoo.mat) and a corresponding item model (example.model). The item model can be built by solving the optimization problems per item described in Section 2.3.1 in the paper. For the example.model we use the SLIM software. ## TODOS -Resolve issues to make the code compatible with Julia 1 (coming soon). Add a notebook with a more thorough example that includes other item models besides SLIM. 
+ +### Running the Python example + +After installing the required dependencies you can run `example.py` to reproduce +the results using the Python implementation: + +```bash +pip install -r requirements.txt # installs numpy and scipy +python example.py +``` diff --git a/RecWalkExample.jl b/RecWalkExample.jl index b51a6f4..3a6e3b2 100644 --- a/RecWalkExample.jl +++ b/RecWalkExample.jl @@ -1,5 +1,12 @@ +using Distributed addprocs() + +@everywhere using SharedArrays +@everywhere using LinearAlgebra +@everywhere using SparseArrays + using MAT + include("include.jl") TopN = 10 @@ -24,7 +31,7 @@ HR = SharedArray{Float64}(n) #Base Item Model PI = TrainSet*W -@sync @parallel for user = 1:n +@sync @distributed for user = 1:n HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(PI[user,:], vcat(Holdout[user], UW[:,user]), TopN) end println("Base Item Model: HR = $(mean(HR)) ARHR=$(mean(RR)) NDCG=$(mean(NDCG))") @@ -32,7 +39,7 @@ println("Base Item Model: HR = $(mean(HR)) ARHR=$(mean(RR)) NDCG=$(mean(NDCG) # RecWalk - K-Step K = 7 -@sync @parallel for user = 1:n +@sync @distributed for user = 1:n ru = sparse(reshape(P[user,:], 1, m+n)) [ru *= P for step=2:K] HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(ru[n+1:end], vcat(Holdout[user], UW[:,user]), TopN) @@ -43,7 +50,7 @@ println("RecWalk K-Step: HR = $(mean(HR)) ARHR=$(mean(RR)) NDCG=$(mean(NDCG) eta = 0.7 PI = inv(full(I-eta*P)) # due to the small size of the example data the recwalk ppr vectors can be computed in batch. 
from __future__ import annotations

import numpy as np
from scipy.io import loadmat
from scipy.sparse import csr_matrix, identity
from scipy.sparse.linalg import inv

from recwalk import RecWalk, read_item_model, single_hr_rr_ndcg


def _candidate_items(Holdout, UW, user):
    """Return the zero-based candidate item ids for ``user``.

    The held-out target item comes first, followed by the sampled unseen
    items, which is the order ``single_hr_rr_ndcg`` expects (it treats
    element 0 as the target).

    ``yahoo.mat`` is the same file the Julia example indexes with directly,
    so the stored ids are 1-based; they are shifted by one here because the
    NumPy score vectors are 0-based.
    """
    # ravel() makes the lookup independent of whether loadmat returned the
    # holdout vector as (n, 1) or (1, n).
    held_out = np.asarray(Holdout).ravel()[user]
    sampled = np.asarray(UW)[:, user]
    # Cast before subtracting: loadmat may return doubles (not valid NumPy
    # indices) or unsigned integers (which would wrap around on ``- 1``).
    ids = np.concatenate(([held_out], sampled)).astype(np.int64)
    return ids - 1


def _evaluate(label, scores_for_user, Holdout, UW, n_users, TopN):
    """Score every user with ``scores_for_user`` and print the mean metrics.

    ``scores_for_user(user)`` must return a 1-D array with one score per item.
    """
    metrics = np.zeros((n_users, 3))
    for user in range(n_users):
        metrics[user] = single_hr_rr_ndcg(
            scores_for_user(user), _candidate_items(Holdout, UW, user), TopN
        )
    hr, rr, ndcg = metrics.mean(axis=0)
    print(f"{label}: HR={hr} ARHR={rr} NDCG={ndcg}")


def main():
    """Reproduce the three evaluations of the RecWalk example.

    Loads the split from ``yahoo.mat`` and the item model from
    ``example.model``, then reports HR / ARHR / NDCG at ``TopN`` for the base
    item model, the K-step RecWalk variant and the PageRank-style variant.
    """
    TopN = 10

    data = loadmat("yahoo.mat")
    TrainSet = csr_matrix(data["TrainSet"])
    Holdout = data["Holdout"]
    UW = data["SampledUnwatched"]
    n, m = TrainSet.shape

    W = read_item_model("example.model", m)
    P = csr_matrix(RecWalk(TrainSet, W, alpha=0.005))

    # Base item model: score = user's interaction row times the item model.
    base_scores = csr_matrix(TrainSet.dot(W))
    _evaluate(
        "Base Item Model",
        lambda user: base_scores.getrow(user).toarray().ravel(),
        Holdout, UW, n, TopN,
    )

    # RecWalk K-step: start from the user's row of P and take K - 1 more steps.
    K = 7

    def k_step_scores(user):
        walk = P.getrow(user)
        for _ in range(K - 1):
            walk = walk.dot(P)
        # Columns n.. of the walk vector belong to the item nodes.
        return walk[:, n:].toarray().ravel()

    _evaluate("RecWalk K-Step", k_step_scores, Holdout, UW, n, TopN)

    # RecWalk PR: the example data is small enough to invert (I - eta * P)
    # in one batch.  SciPy's sparse solver works on CSC, so convert up front
    # instead of handing it a CSR matrix.
    eta = 0.7
    system = (identity(P.shape[0], format="csc") - eta * P).tocsc()
    ppr = inv(system).toarray()[:n, n:]
    _evaluate("RecWalk PR", lambda user: ppr[user, :], Holdout, UW, n, TopN)


if __name__ == "__main__":
    main()
--git a/include.jl b/include.jl index 351cb61..428c2d8 100644 --- a/include.jl +++ b/include.jl @@ -1,6 +1,10 @@ +using SparseArrays +using LinearAlgebra +using DelimitedFiles + function RecWalk(TrainSet, ItemModel, α=0.01) n,m = size(TrainSet) - Muu = speye(n) + Muu = spdiagm(0 => ones(n)) Mii = RowStochastic(ItemModel,"dmax") Hui = RowStochastic(TrainSet) Hiu = RowStochastic(TrainSet') @@ -28,10 +32,12 @@ function readItemModel(filename, m) A = readdlm(filename, skipblanks = false) A[A .== ""] = 0; A = Array{Float64}(A) s1, s2 = size(A) - rows = []; cols = []; vals = Float64[]; + rows = Int[] + cols = Int[] + vals = Float64[] for i = 1:s1 - items = Array{Int64}(A[i,1:2:s2 - 1]) - indx = find(items .> 0) + items = Array{Int64}(A[i,1:2:s2 - 1]) + indx = findall(items .> 0) items = items[indx] scores = A[i,2 * indx] append!(rows, i * ones(length(items))) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..59b93f1 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,18 @@ +[build-system] +requires = ["setuptools>=61"] +build-backend = "setuptools.build_meta" + +[project] +name = "recwalk" +version = "0.1.0" +description = "Python port of the RecWalk recommendation algorithm" +license = {text = "MIT"} +authors = [ {name="OpenAI Assistant"} ] +requires-python = ">=3.8" +dependencies = [ + "numpy", + "scipy", +] + +[project.optional-dependencies] +# add extras if needed diff --git a/recwalk/__init__.py b/recwalk/__init__.py new file mode 100644 index 0000000..4243a74 --- /dev/null +++ b/recwalk/__init__.py @@ -0,0 +1,8 @@ +from .core import RecWalk, RowStochastic, read_item_model, single_hr_rr_ndcg + +__all__ = [ + 'RecWalk', + 'RowStochastic', + 'read_item_model', + 'single_hr_rr_ndcg', +] diff --git a/recwalk/core.py b/recwalk/core.py new file mode 100644 index 0000000..58bedbf --- /dev/null +++ b/recwalk/core.py @@ -0,0 +1,65 @@ +from __future__ import annotations +import numpy as np +from scipy.sparse import csr_matrix, vstack, hstack, diags, 
import numpy as np
from scipy.sparse import csr_matrix, diags, hstack, identity, vstack
from scipy.sparse.linalg import inv  # noqa: F401 -- imported by the original module, unused below


def RowStochastic(A: csr_matrix, strategy: str = "standard") -> csr_matrix:
    """Normalize the rows of ``A``.

    Parameters
    ----------
    A : sparse matrix (anything ``csr_matrix`` accepts)
    strategy : str
        ``"dmax"`` divides every entry by the largest row sum and puts the
        mass each row is still missing on the diagonal, so every row sums
        to 1.  ``A`` must be square for this strategy.
        Any other value selects plain row normalization: each row is divided
        by its own sum, and rows whose sum is 0 are left as all-zero rows.

    Returns
    -------
    csr_matrix

    Raises
    ------
    ValueError
        If ``strategy == "dmax"`` and ``A`` is not square.
    """
    A = csr_matrix(A)
    row_sums = np.asarray(A.sum(axis=1), dtype=float).ravel()

    if strategy == "dmax":
        if A.shape[0] != A.shape[1]:
            raise ValueError(
                f"the 'dmax' strategy needs a square matrix, got shape {A.shape}"
            )
        dmax = row_sums.max()
        if dmax == 0:
            # Nothing to rescale.  The identity is what the general formula
            # (I - diag(rowsum) + A) gives for zero row sums, and unlike a
            # copy of the zero matrix it is actually row-stochastic.
            return identity(A.shape[0], format="csr")
        scaled = A / dmax
        # Row sums of ``scaled`` are row_sums / dmax; the remainder up to 1
        # goes on the diagonal.
        slack = 1.0 - row_sums / dmax
        return (diags(slack) + scaled).tocsr()

    # Avoid dividing by zero; such rows simply stay all-zero.
    divisors = np.where(row_sums == 0, 1.0, row_sums)
    return diags(1.0 / divisors).dot(A).tocsr()


def RecWalk(TrainSet: csr_matrix, ItemModel: csr_matrix, alpha: float = 0.01) -> csr_matrix:
    """Build the RecWalk transition matrix ``P = alpha * H + (1 - alpha) * M``.

    ``TrainSet`` is the ``n x m`` user-item matrix and ``ItemModel`` the
    ``m x m`` item model.  ``H`` is the row-normalized user<->item bipartite
    walk and ``M`` is block-diagonal with the identity for users and the
    ``"dmax"``-normalized item model for items.  Node order in ``P`` is the
    ``n`` users followed by the ``m`` items.

    Raises
    ------
    ValueError
        If ``ItemModel`` is not ``m x m``.
    """
    TrainSet = csr_matrix(TrainSet)
    ItemModel = csr_matrix(ItemModel)
    n, m = TrainSet.shape
    if ItemModel.shape != (m, m):
        raise ValueError(
            f"ItemModel has shape {ItemModel.shape}, expected {(m, m)} "
            "to match the number of items in TrainSet"
        )

    Muu = identity(n, format="csr")
    Mii = RowStochastic(ItemModel, "dmax")
    Hui = RowStochastic(TrainSet)
    Hiu = RowStochastic(TrainSet.transpose())

    H = vstack([
        hstack([csr_matrix((n, n)), Hui]),
        hstack([Hiu, csr_matrix((m, m))]),
    ])
    M = vstack([
        hstack([Muu, csr_matrix((n, m))]),
        hstack([csr_matrix((m, n)), Mii]),
    ])
    return (alpha * H + (1 - alpha) * M).tocsr()


def read_item_model(filename: str, m: int) -> csr_matrix:
    """Read an item model stored as one text line per item.

    Line ``i`` (counting from 1) holds whitespace-separated
    ``item score item score ...`` pairs for item ``i``.  Item numbers in the
    file are 1-based; they are converted to the 0-based indices SciPy uses,
    so line 1 / item 1 ends up at ``[0, 0]``.  Blank lines are kept as empty
    rows so that later lines keep their position.  Pairs whose item number
    is not positive are skipped, and a trailing token without a partner is
    ignored.

    Returns
    -------
    csr_matrix of shape ``(m, m)``

    Raises
    ------
    ValueError
        If an entry refers to a row or an item beyond ``m``.
    """
    rows = []
    cols = []
    vals = []
    with open(filename, "r") as f:
        for row, line in enumerate(f):
            tokens = line.split()
            for item_token, score_token in zip(tokens[0::2], tokens[1::2]):
                item = int(item_token)
                if item <= 0:
                    continue
                if row >= m or item > m:
                    raise ValueError(
                        f"{filename}: entry (line {row + 1}, item {item}) "
                        f"does not fit an item model with m={m}"
                    )
                rows.append(row)
                cols.append(item - 1)
                vals.append(float(score_token))

    # The explicit shape already gives an (m, m) matrix, so no padding entry
    # is needed.
    return csr_matrix(
        (
            np.asarray(vals, dtype=float),
            (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64)),
        ),
        shape=(m, m),
    )


def single_hr_rr_ndcg(pi: np.ndarray, T: np.ndarray, K: int):
    """Hit rate, reciprocal rank and NDCG at ``K`` for one user.

    Parameters
    ----------
    pi : 1-D array of scores, one per item.
    T : array of 0-based item indices into ``pi``; ``T[0]`` is the target
        item, the remaining entries are the items it is ranked against.
    K : cut-off of the ranked list.

    Returns
    -------
    (hr, rr, ndcg) : tuple of float
        The target's position is the number of candidates (itself included)
        scoring at least as high as it, so ties count against the target.
        All three values are 0.0 when that position is outside ``1..K``.
    """
    scores = np.asarray(pi)
    # Index arrays loaded from .mat files are often doubles, which NumPy
    # refuses to index with; coerce them to a platform integer type.
    candidates = np.asarray(T).ravel().astype(np.intp)

    target = scores[candidates[0]]
    pos = int(np.sum(scores[candidates] >= target))
    if 1 <= pos <= K:
        return 1.0, 1.0 / pos, float(1.0 / np.log2(pos + 1))
    return 0.0, 0.0, 0.0
b/requirements.txt @@ -0,0 +1,2 @@ +numpy +scipy