nikolakopoulos · nikolakopoulos · May 17, 2025
diff --git a/README.md b/README.md
@@ -8,11 +8,20 @@ February 11–15, 2019, Melbourne, VIC, Australia. ACM, New York, NY, USA,
 
 
 ## Example
-We provide an example of both variations of RecWalk discussed in the paper. The code is written in Julia version 0.6 (an updated version that runs in current versions of Julia >= 1.0 is coming soon). 
+We provide an example of both variations of RecWalk discussed in the paper. The repository now includes a Python implementation (`recwalk` package) in addition to the original Julia code. The Python version requires Python 3.8+ along with `numpy` and `scipy`.
 
 For simplicity we also provide a split (target item per user alongside 99 randomly sampled unseen items (yahoo.mat) and a  corresponding item model (example.model). The item model can be built by solving the optimization problems per item described in Section 2.3.1 in the paper. For the example.model we use the SLIM software.    
 
 ## TODOS
-Resolve issues to make the code compatible with Julia 1 (coming soon). 
 Add a notebook with a more thorough example that includes other item models besides SLIM.  
 
+
+### Running the Python example
+
+After installing the required dependencies, run `example.py` from the directory that contains
+`yahoo.mat` and `example.model` (it opens both by relative path) to reproduce the results with the Python implementation:
+
+```bash
+pip install -r requirements.txt  # installs numpy and scipy
+python example.py
+```
diff --git a/RecWalkExample.jl b/RecWalkExample.jl
@@ -1,5 +1,12 @@
+using Distributed
 addprocs()
+
+@everywhere using SharedArrays
+@everywhere using LinearAlgebra
+@everywhere using SparseArrays
+
 using MAT
+using Statistics  # provides mean(), which is no longer exported by Base on Julia >= 0.7
 include("include.jl")
 
 TopN = 10
@@ -24,15 +31,15 @@ HR = SharedArray{Float64}(n)
 
 #Base Item Model
 PI = TrainSet*W
-@sync @parallel for user = 1:n
+@sync @distributed for user = 1:n
 	HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(PI[user,:], vcat(Holdout[user], UW[:,user]), TopN)
 end
 println("Base Item Model:  HR = $(mean(HR))  ARHR=$(mean(RR))  NDCG=$(mean(NDCG))")
 
 
 # RecWalk - K-Step
 K = 7
-@sync @parallel for user = 1:n
+@sync @distributed for user = 1:n
     ru  = sparse(reshape(P[user,:], 1, m+n))
     [ru  *= P for step=2:K]
     HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(ru[n+1:end], vcat(Holdout[user], UW[:,user]), TopN)
@@ -43,7 +50,7 @@ println("RecWalk K-Step:   HR = $(mean(HR))  ARHR=$(mean(RR))  NDCG=$(mean(NDCG)
 eta = 0.7
-PI = inv(full(I-eta*P)) # due to the small size of the example data the recwalk ppr vectors can be computed in batch.
+PI = inv(Matrix(I-eta*P)) # Matrix() replaces full(), which was removed in Julia 1.0. Due to the small size of the example data the recwalk ppr vectors can be computed in batch.
 PI = PI[1:n,n+1:end]
-@sync @parallel for user = 1:n
+@sync @distributed for user = 1:n
 	HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(PI[user,:], vcat(Holdout[user], UW[:,user]), TopN)
 end
 println("RecWalk PR:       HR = $(mean(HR))  ARHR=$(mean(RR))  NDCG=$(mean(NDCG))")

diff --git a/example.py b/example.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+import numpy as np
+from scipy.io import loadmat
+from scipy.sparse import csr_matrix, identity
+from scipy.sparse.linalg import inv
+
+from recwalk import RecWalk, read_item_model, single_hr_rr_ndcg
+
+
+def main():
+    """Score the base item model, K-step RecWalk and RecWalk PR on the bundled split."""
+    TopN = 10
+    data = loadmat("yahoo.mat")
+    TrainSet = csr_matrix(data["TrainSet"])
+    Holdout = data["Holdout"]
+    UW = data["SampledUnwatched"]
+    n, m = TrainSet.shape
+
+    W = read_item_model("example.model", m)
+    P = RecWalk(TrainSet, W, alpha=0.005)
+
+    HR = np.zeros(n)
+    RR = np.zeros(n)
+    NDCG = np.zeros(n)
+
+    # Base item model.  yahoo.mat stores 1-based item ids (the Julia script indexes with
+    # them directly), so cast to integers and shift to 0-based before indexing NumPy arrays.
+    PI = TrainSet.dot(W)
+    for user in range(n):
+        items = np.concatenate([Holdout[user], UW[:, user]]).astype(np.intp) - 1
+        HR[user], RR[user], NDCG[user] = single_hr_rr_ndcg(
+            PI.getrow(user).toarray().ravel(), items, TopN
+        )
+    print(f"Base Item Model:  HR={HR.mean()}  ARHR={RR.mean()}  NDCG={NDCG.mean()}")
+
+    K = 7  # RecWalk K-step: start from the user's row of P and multiply by P up to K steps
+    for user in range(n):
+        ru = csr_matrix(P.getrow(user))
+        for _ in range(2, K + 1):
+            ru = ru.dot(P)
+        items = np.concatenate([Holdout[user], UW[:, user]]).astype(np.intp) - 1
+        HR[user], RR[user], NDCG[user] = single_hr_rr_ndcg(
+            ru[:, n:].toarray().ravel(), items, TopN
+        )
+    print(f"RecWalk K-Step:   HR={HR.mean()}  ARHR={RR.mean()}  NDCG={NDCG.mean()}")
+
+    eta = 0.7  # RecWalk PR: invert (I - eta * P) once and keep the user -> item block
+    I = identity(P.shape[0], format="csr")
+    PI = inv(I - eta * P).toarray()
+    PI = PI[:n, n:]
+    for user in range(n):
+        items = np.concatenate([Holdout[user], UW[:, user]]).astype(np.intp) - 1
+        HR[user], RR[user], NDCG[user] = single_hr_rr_ndcg(PI[user, :], items, TopN)
+    print(f"RecWalk PR:       HR={HR.mean()}  ARHR={RR.mean()}  NDCG={NDCG.mean()}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/include.jl b/include.jl
@@ -1,6 +1,10 @@
+using SparseArrays
+using LinearAlgebra
+using DelimitedFiles
+
 function RecWalk(TrainSet, ItemModel, α=0.01)
     n,m = size(TrainSet)
-    Muu = speye(n)
+    Muu = spdiagm(0 => ones(n))
     Mii = RowStochastic(ItemModel,"dmax")
     Hui = RowStochastic(TrainSet)
     Hiu = RowStochastic(TrainSet')
@@ -28,10 +32,12 @@ function readItemModel(filename, m)
     A = readdlm(filename, skipblanks = false)
     A[A .== ""] = 0; A = Array{Float64}(A) 
     s1, s2 = size(A)
-    rows = []; cols = []; vals = Float64[];
+    rows = Int[]
+    cols = Int[]
+    vals = Float64[]
     for i = 1:s1
-        items = Array{Int64}(A[i,1:2:s2 - 1]) 
-        indx = find(items .> 0) 
+        items = Array{Int64}(A[i,1:2:s2 - 1])
+        indx = findall(items .> 0)
         items = items[indx]
         scores =  A[i,2 * indx] 
         append!(rows, i * ones(length(items)))

diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,18 @@
+[build-system]
+requires = ["setuptools>=61"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "recwalk"
+version = "0.1.0"
+description = "Python port of the RecWalk recommendation algorithm"
+license = {text = "MIT"}
+authors = [ {name="OpenAI Assistant"} ]
+requires-python = ">=3.8"
+dependencies = [
+    "numpy",
+    "scipy",
+]
+
+[project.optional-dependencies]
+# add extras if needed
diff --git a/recwalk/__init__.py b/recwalk/__init__.py
@@ -0,0 +1,8 @@
+from .core import RecWalk, RowStochastic, read_item_model, single_hr_rr_ndcg
+
+__all__ = [
+    'RecWalk',
+    'RowStochastic',
+    'read_item_model',
+    'single_hr_rr_ndcg',
+]
diff --git a/recwalk/core.py b/recwalk/core.py
@@ -0,0 +1,65 @@
+from __future__ import annotations
+import numpy as np
+from scipy.sparse import csr_matrix, vstack, hstack, diags, identity
+from scipy.sparse.linalg import inv
+
+
+def RowStochastic(A: csr_matrix, strategy: str = "standard") -> csr_matrix:
+    """Row-normalise ``A``; ``"dmax"`` divides by the largest row sum and tops up the diagonal."""
+    sums = np.asarray(A.sum(axis=1)).ravel()
+    if strategy != "dmax":
+        sums[sums == 0] = 1  # all-zero rows are divided by 1, i.e. left as zeros
+        return diags(1.0 / sums).dot(A)
+    largest = sums.max()
+    if largest == 0:
+        return A.copy()  # nothing to scale: hand back an unnormalised copy
+    scaled = A / largest
+    scaled_sums = np.asarray(scaled.sum(axis=1)).ravel()
+    return identity(A.shape[0], format="csr") - diags(scaled_sums) + scaled
+
+
+def RecWalk(TrainSet: csr_matrix, ItemModel: csr_matrix, alpha: float = 0.01) -> csr_matrix:
+    """Return ``alpha * H + (1 - alpha) * M`` over the (n + m) user-then-item node set."""
+    n, m = TrainSet.shape
+    item_block = RowStochastic(ItemModel, "dmax")
+    user_to_item = RowStochastic(TrainSet)
+    item_to_user = RowStochastic(TrainSet.transpose())
+    # H: off-diagonal user<->item blocks.  M: identity for users, item model for items.
+    H = vstack([hstack([csr_matrix((n, n)), user_to_item]), hstack([item_to_user, csr_matrix((m, m))])])
+    M = vstack([hstack([diags(np.ones(n)), csr_matrix((n, m))]), hstack([csr_matrix((m, n)), item_block])])
+    return alpha * H + (1 - alpha) * M
+
+
+def read_item_model(filename: str, m: int) -> csr_matrix:
+    """Load an ``m`` x ``m`` sparse item model from whitespace-separated text.
+
+    Line ``r`` (counting from 1) holds alternating ``item score`` tokens.  Line
+    numbers and item ids are treated as 1-based, as in the Julia reader, and are
+    shifted to 0-based positions; tokens with a non-positive id are skipped.
+    """
+    rows, cols, vals = [], [], []
+    with open(filename, "r") as f:
+        for row, line in enumerate(f):
+            tokens = line.split()
+            # zip() drops a trailing id that has no score after it.
+            for item_tok, score_tok in zip(tokens[0::2], tokens[1::2]):
+                item = int(item_tok)
+                if item > 0:
+                    rows.append(row)
+                    cols.append(item - 1)
+                    vals.append(float(score_tok))
+    return csr_matrix((vals, (rows, cols)), shape=(m, m))
+
+
+def single_hr_rr_ndcg(pi: np.ndarray, T: np.ndarray, K: int):
+    """Return ``(hr, rr, ndcg)`` for one user.
+
+    ``T[0]`` indexes the target's score in ``pi``; its rank is the number of
+    entries of ``pi[T]`` that are >= that score (ties count against the target).
+    All three metrics are 0.0 when the rank falls outside ``1..K``.
+    """
+    target_score = pi[T[0]]
+    rank = np.sum(pi[T] >= target_score)
+    if rank < 1 or rank > K:
+        return 0.0, 0.0, 0.0
+    return 1.0, 1.0 / rank, 1.0 / np.log2(rank + 1)
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+numpy
+scipy