Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,20 @@ February 11–15, 2019, Melbourne, VIC, Australia. ACM, New York, NY, USA,


## Example
We provide an example of both variations of RecWalk discussed in the paper. The code is written in Julia version 0.6 (an updated version that runs in current versions of Julia >= 1.0 is coming soon).
We provide an example of both variations of RecWalk discussed in the paper. The repository now includes a Python implementation (`recwalk` package) in addition to the original Julia code. The Python version requires Python 3.8+ along with `numpy` and `scipy`.

For simplicity we also provide a split (a target item per user alongside 99 randomly sampled unseen items; `yahoo.mat`) and a corresponding item model (`example.model`). The item model can be built by solving the per-item optimization problems described in Section 2.3.1 of the paper. For `example.model` we use the SLIM software.

## TODOS
Resolve issues to make the code compatible with Julia 1 (coming soon).
Add a notebook with a more thorough example that includes other item models besides SLIM.


### Running the Python example

After installing the required dependencies you can run `example.py` to reproduce
the results using the Python implementation:

```bash
pip install -r requirements.txt # installs numpy and scipy
python example.py
```
13 changes: 10 additions & 3 deletions RecWalkExample.jl
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
using Distributed
addprocs()

@everywhere using SharedArrays
@everywhere using LinearAlgebra
@everywhere using SparseArrays

using MAT
# `mean` (used when printing the metrics) lives in Statistics since Julia 0.7.
using Statistics

# The evaluation helpers are called inside the @distributed loops, so they must
# be defined on every worker started by addprocs(), not only on the master.
@everywhere include("include.jl")

TopN = 10
Expand All @@ -24,15 +31,15 @@ HR = SharedArray{Float64}(n)

#Base Item Model
PI = TrainSet*W
@sync @parallel for user = 1:n
@sync @distributed for user = 1:n
HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(PI[user,:], vcat(Holdout[user], UW[:,user]), TopN)
end
println("Base Item Model: HR = $(mean(HR)) ARHR=$(mean(RR)) NDCG=$(mean(NDCG))")


# RecWalk - K-Step
K = 7
@sync @parallel for user = 1:n
@sync @distributed for user = 1:n
ru = sparse(reshape(P[user,:], 1, m+n))
[ru *= P for step=2:K]
HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(ru[n+1:end], vcat(Holdout[user], UW[:,user]), TopN)
Expand All @@ -43,7 +50,7 @@ println("RecWalk K-Step: HR = $(mean(HR)) ARHR=$(mean(RR)) NDCG=$(mean(NDCG)
eta = 0.7
# `full` was removed in Julia 1.0; `Matrix` performs the same sparse-to-dense conversion.
PI = inv(Matrix(I-eta*P)) # due to the small size of the example data the recwalk ppr vectors can be computed in batch.
# Keep only the user rows and item columns of the inverse.
PI = PI[1:n,n+1:end]
@sync @parallel for user = 1:n
@sync @distributed for user = 1:n
HR[user], RR[user], NDCG[user] = Single_HR_RR_NDCG(PI[user,:], vcat(Holdout[user], UW[:,user]), TopN)
end
println("RecWalk PR: HR = $(mean(HR)) ARHR=$(mean(RR)) NDCG=$(mean(NDCG))")
Expand Down
58 changes: 58 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations
import numpy as np
from scipy.io import loadmat
from scipy.sparse import csr_matrix, identity
from scipy.sparse.linalg import inv

from recwalk import RecWalk, read_item_model, single_hr_rr_ndcg


def main():
    """Evaluate the base item model and both RecWalk variants on the example data.

    Reads the split from ``yahoo.mat`` and the item model from
    ``example.model``, then prints the mean HR, ARHR and NDCG at ``TopN`` for:

    1. the base item model (``TrainSet * W``),
    2. RecWalk K-step (row ``user`` of ``P`` propagated to the ``K``-th power),
    3. RecWalk PR (user-to-item block of ``inv(I - eta * P)``).
    """
    TopN = 10

    data = loadmat("yahoo.mat")
    TrainSet = csr_matrix(data["TrainSet"])
    # loadmat returns 2-D arrays; flatten so Holdout[user] is a single id
    # regardless of whether the vector was stored as a row or a column.
    Holdout = np.asarray(data["Holdout"]).ravel(order="F")
    UW = np.asarray(data["SampledUnwatched"])
    n, m = TrainSet.shape

    W = read_item_model("example.model", m)
    P = RecWalk(TrainSet, W, alpha=0.005)

    HR = np.zeros(n)
    RR = np.zeros(n)
    NDCG = np.zeros(n)

    def candidates(user):
        # Target item first, followed by the sampled unseen items.
        ids = np.concatenate(([Holdout[user]], UW[:, user]))
        # The ids in the .mat file are assumed to be 1-based (the Julia example
        # indexes score vectors with them directly), so shift them to 0-based
        # positions. Cast first so unsigned/float dtypes index correctly.
        return ids.astype(np.intp) - 1

    def report(label, scores_for):
        # Score every user with `scores_for(user)` and print the averaged metrics.
        for user in range(n):
            HR[user], RR[user], NDCG[user] = single_hr_rr_ndcg(
                scores_for(user), candidates(user), TopN
            )
        print(f"{label}: HR={HR.mean()} ARHR={RR.mean()} NDCG={NDCG.mean()}")

    # Base item model.
    base_scores = TrainSet.dot(W)
    report("Base Item Model", lambda user: base_scores.getrow(user).toarray().ravel())

    # RecWalk K-step: start from the user's row of P and apply P a further K-1 times.
    K = 7

    def k_step_scores(user):
        ru = csr_matrix(P.getrow(user))
        for _ in range(2, K + 1):
            ru = ru.dot(P)
        # Columns n.. hold the item part of the walk distribution.
        return ru[:, n:].toarray().ravel()

    report("RecWalk K-Step", k_step_scores)

    # RecWalk PR: the example is small enough to invert (I - eta * P) in one
    # batch. The inverse is dense, so invert a dense array directly instead of
    # going through the sparse solver.
    eta = 0.7
    ppr = np.linalg.inv(np.eye(P.shape[0]) - eta * P.toarray())
    ppr = ppr[:n, n:]
    report("RecWalk PR", lambda user: ppr[user, :])


if __name__ == "__main__":
main()
14 changes: 10 additions & 4 deletions include.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
using SparseArrays
using LinearAlgebra
using DelimitedFiles

function RecWalk(TrainSet, ItemModel, α=0.01)
n,m = size(TrainSet)
Muu = speye(n)
Muu = spdiagm(0 => ones(n))
Mii = RowStochastic(ItemModel,"dmax")
Hui = RowStochastic(TrainSet)
Hiu = RowStochastic(TrainSet')
Expand Down Expand Up @@ -28,10 +32,12 @@ function readItemModel(filename, m)
A = readdlm(filename, skipblanks = false)
A[A .== ""] = 0; A = Array{Float64}(A)
s1, s2 = size(A)
rows = []; cols = []; vals = Float64[];
rows = Int[]
cols = Int[]
vals = Float64[]
for i = 1:s1
items = Array{Int64}(A[i,1:2:s2 - 1])
indx = find(items .> 0)
items = Array{Int64}(A[i,1:2:s2 - 1])
indx = findall(items .> 0)
items = items[indx]
scores = A[i,2 * indx]
append!(rows, i * ones(length(items)))
Expand Down
18 changes: 18 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"

[project]
name = "recwalk"
version = "0.1.0"
description = "Python port of the RecWalk recommendation algorithm"
license = {text = "MIT"}
authors = [ {name="OpenAI Assistant"} ]
requires-python = ">=3.8"
dependencies = [
"numpy",
"scipy",
]

[project.optional-dependencies]
# add extras if needed
8 changes: 8 additions & 0 deletions recwalk/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from .core import RecWalk, RowStochastic, read_item_model, single_hr_rr_ndcg

__all__ = [
'RecWalk',
'RowStochastic',
'read_item_model',
'single_hr_rr_ndcg',
]
65 changes: 65 additions & 0 deletions recwalk/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from __future__ import annotations
import numpy as np
from scipy.sparse import csr_matrix, vstack, hstack, diags, identity
from scipy.sparse.linalg import inv


def RowStochastic(A: csr_matrix, strategy: str = "standard") -> csr_matrix:
    """Scale the rows of a sparse matrix towards a row-stochastic matrix.

    Parameters
    ----------
    A : sparse matrix
        Input matrix. Must be square when ``strategy == "dmax"``.
    strategy : str
        ``"dmax"`` divides every entry by the largest row sum and puts the
        missing mass ``1 - row_sum / dmax`` on the diagonal. Any other value
        divides each row by its own sum.

    Returns
    -------
    Sparse matrix with the same shape as ``A``. With ``"dmax"`` on an
    all-zero (or empty) input the identity is returned, which is what the
    formula yields when no mass is available off the diagonal. With the
    standard strategy, rows that sum to zero are left as all-zero rows.
    """
    row_sums = np.asarray(A.sum(axis=1)).ravel()
    if strategy == "dmax":
        # Guard the empty case first: max() of an empty array raises.
        if row_sums.size == 0 or row_sums.max() == 0:
            # Nothing to distribute: every row keeps all of its mass on the
            # diagonal, so the result is still row-stochastic.
            return identity(A.shape[0], format="csr")
        A_temp = A / row_sums.max()
        scaled_sums = np.asarray(A_temp.sum(axis=1)).ravel()
        return identity(A.shape[0], format="csr") - diags(scaled_sums) + A_temp
    # Avoid division by zero; a zero row multiplied by 1 stays zero.
    row_sums[row_sums == 0] = 1
    return diags(1.0 / row_sums).dot(A)


def RecWalk(TrainSet: csr_matrix, ItemModel: csr_matrix, alpha: float = 0.01) -> csr_matrix:
    """Assemble the RecWalk transition matrix over users and items.

    With ``n, m = TrainSet.shape`` the result is an ``(n + m) x (n + m)``
    matrix ``alpha * H + (1 - alpha) * M`` where

    * ``H`` has the row-normalised ``TrainSet`` in its upper-right block and
      the row-normalised transpose in its lower-left block,
    * ``M`` is block diagonal with an ``n x n`` identity for the users and the
      ``"dmax"``-normalised ``ItemModel`` for the items.
    """
    n, m = TrainSet.shape

    # Off-diagonal blocks: users step to items, items step back to users.
    user_to_item = RowStochastic(TrainSet)
    item_to_user = RowStochastic(TrainSet.transpose())
    H = vstack([
        hstack([csr_matrix((n, n)), user_to_item]),
        hstack([item_to_user, csr_matrix((m, m))]),
    ])

    # Diagonal blocks: users stay in place, items follow the item model.
    user_block = diags(np.ones(n))
    item_block = RowStochastic(ItemModel, "dmax")
    M = vstack([
        hstack([user_block, csr_matrix((n, m))]),
        hstack([csr_matrix((m, n)), item_block]),
    ])

    return alpha * H + (1 - alpha) * M


def read_item_model(filename: str, m: int) -> csr_matrix:
    """Read an item model from a whitespace-separated text file.

    Line ``i`` of the file (counting from 1, blank lines included) describes
    row ``i`` of the model as alternating ``item score`` pairs. Entries whose
    item id is not positive are ignored, and a trailing item id without a
    score is dropped.

    Item ids and line numbers are treated as 1-based and shifted to the
    0-based indices SciPy expects.

    Parameters
    ----------
    filename : str
        Path of the model file.
    m : int
        Number of items; the result has shape ``(m, m)``.

    Raises
    ------
    ValueError
        From SciPy if a row or item id exceeds ``m``, or from ``int``/``float``
        if a token cannot be parsed.
    """
    rows = []
    cols = []
    vals = []
    with open(filename, "r") as f:
        # enumerate from 0 so the line number is already a 0-based row index.
        for row_idx, line in enumerate(f):
            tokens = line.split()
            # Pair each item id with the score that follows it.
            for item_tok, score_tok in zip(tokens[0::2], tokens[1::2]):
                item = int(item_tok)
                score = float(score_tok)
                if item > 0:
                    rows.append(row_idx)
                    cols.append(item - 1)
                    vals.append(score)
    # The explicit shape fixes the size, so no padding entry is needed.
    return csr_matrix((vals, (rows, cols)), shape=(m, m))


def single_hr_rr_ndcg(pi: np.ndarray, T: np.ndarray, K: int):
    """Compute hit rate, reciprocal rank and NDCG for one ranked candidate list.

    Parameters
    ----------
    pi : np.ndarray
        Score vector; the entries of ``T`` are used directly as indices into it.
    T : np.ndarray
        Candidate indices, with the target item at ``T[0]``.
    K : int
        Cut-off rank.

    Returns
    -------
    tuple
        ``(hr, rr, ndcg)``. The target's rank is the number of candidates
        scoring at least as high as the target (ties count against it). If
        that rank lies in ``1..K`` the result is
        ``(1.0, 1 / rank, 1 / log2(rank + 1))``, otherwise all three are 0.0.
    """
    candidate_scores = pi[T]
    rank = np.sum(candidate_scores >= pi[T[0]])
    if not (1 <= rank <= K):
        return 0.0, 0.0, 0.0
    return 1.0, 1.0 / rank, 1.0 / np.log2(rank + 1)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
numpy
scipy