Skip to content

Commit

Permalink
Merge pull request #2615 from metabrainz/test-similarity-fix
Browse files Browse the repository at this point in the history
Improve user similarity
  • Loading branch information
amCap1712 authored Dec 5, 2023
2 parents 1d9b2e7 + 6ab892d commit d842dc5
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 109 deletions.
14 changes: 6 additions & 8 deletions frontend/js/src/stats/SimilarityScore.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ export type SimilarityScoreProps = {

const getclassName = (similarityScore: number): string => {
let className = "";
if (similarityScore <= 0.3) {
if (similarityScore <= 0.15) {
className = "red";
} else if (similarityScore <= 0.7) {
} else if (similarityScore <= 0.3) {
className = "orange";
} else {
className = "purple";
Expand All @@ -21,10 +21,9 @@ const getclassName = (similarityScore: number): string => {
function SimilarityScore(props: SimilarityScoreProps) {
const { user, type, similarityScore } = props;

// We transform the similarity score from a scale 0-1 to 0-10
const adjustedSimilarityScore = Number((similarityScore * 10).toFixed(1));
// We transform the similarity score from a scale 0-1 to 0-100
const percentage = Number((similarityScore * 100).toFixed());
const className = getclassName(similarityScore);
const percentage = adjustedSimilarityScore * 10;

return (
<div
Expand All @@ -49,11 +48,10 @@ function SimilarityScore(props: SimilarityScoreProps) {
</div>
{type === "regular" ? (
<p className="text-muted">
Your compatibility with {user?.name} is {adjustedSimilarityScore}
/10
Your compatibility with {user?.name} is {percentage}%
</p>
) : (
<p className="small text-muted">{adjustedSimilarityScore}/10</p>
<p className="small text-muted">{percentage}%</p>
)}
</div>
);
Expand Down
17 changes: 12 additions & 5 deletions frontend/js/tests/stats/SimilarityScore.test.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,24 @@ describe("SimilarityScore", () => {
it("updates the class name based on similiarty score", async () => {
/* sets class orange for score 0.2 */
const wrapper = mount<SimilarityScoreProps>(<SimilarityScore {...props} />);
expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual(true);
expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual(true);

/* sets class orange for score 0.5 */
wrapper.setProps({ similarityScore: 0.57457 });
/* sets class red for score 0.15 */
wrapper.setProps({ similarityScore: 0.15 });
await waitForComponentToPaint(wrapper);
expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual(
true
);

/* sets class orange for score 0.3 */
wrapper.setProps({ similarityScore: 0.3 });
await waitForComponentToPaint(wrapper);
expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual(
true
);

/* sets class purple for score 0.9 */
wrapper.setProps({ similarityScore: 0.945792 });
/* sets class purple for score 0.6 */
wrapper.setProps({ similarityScore: 0.6 });
await waitForComponentToPaint(wrapper);
expect(wrapper.find(".progress").childAt(0).hasClass("purple")).toEqual(
true
Expand Down
33 changes: 13 additions & 20 deletions listenbrainz/db/similar_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,17 +42,14 @@ def import_user_similarities(data):
values.append((user, orjson.dumps(similar).decode("utf-8")))
user_count += 1
target_user_count += len(similar.keys())
if len(values) == ROWS_PER_BATCH:
execute_values(curs, query, values, template=None)
values = []
execute_values(curs, query, values, template=None)
execute_values(curs, query, values, page_size=ROWS_PER_BATCH, template=None)

conn.commit()

except psycopg2.errors.OperationalError as err:
conn.rollback()
current_app.logger.error(
"Error: Cannot import user similarites: %s" % str(err))
return (0, 0.0, "Error: Cannot import user similarites: %s" % str(err))
current_app.logger.error("Error: Cannot import user similarites: %s" % str(err))
return 0, 0.0, "Error: Cannot import user similarites: %s" % str(err)

# Next lookup user names and insert them into the new similar_users table
try:
Expand Down Expand Up @@ -84,9 +81,8 @@ def import_user_similarities(data):

except psycopg2.errors.OperationalError as err:
conn.rollback()
current_app.logger.error(
"Error: Cannot correlate user similarity user name: %s" % str(err))
return (0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err))
current_app.logger.error("Error: Cannot correlate user similarity user name: %s" % str(err))
return 0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err)

# Finally rotate the table into place
try:
Expand All @@ -98,24 +94,21 @@ def import_user_similarities(data):
conn.commit()
except psycopg2.errors.OperationalError as err:
conn.rollback()
current_app.logger.error(
"Error: Failed to rotate similar_users table into place: %s" % str(err))
return (0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err))
current_app.logger.error("Error: Failed to rotate similar_users table into place: %s" % str(err))
return 0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err)

# Last, delete the old table
try:
with conn.cursor() as curs:
curs.execute(
"""DROP TABLE recommendation.delete_similar_user CASCADE""")
curs.execute("""DROP TABLE recommendation.delete_similar_user CASCADE""")
conn.commit()

except psycopg2.errors.OperationalError as err:
conn.rollback()
current_app.logger.error(
"Error: Failed to clean up old similar user table: %s" % str(err))
return (0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err))
current_app.logger.error("Error: Failed to clean up old similar user table: %s" % str(err))
return 0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err)

return (user_count, target_user_count / user_count, "")
return user_count, target_user_count / user_count, ""


def get_top_similar_users(count: int = 200):
Expand All @@ -130,7 +123,7 @@ def get_top_similar_users(count: int = 200):
result = connection.execute(text("""
SELECT u.musicbrainz_id AS user_name
, ou.musicbrainz_id AS other_user_name
, value->1 AS similarity -- first element of array is similarity, second is global_similarity
, value AS similarity -- each jsonb value is the similarity score itself (a scalar, no longer an array)
FROM recommendation.similar_user r
JOIN jsonb_each(r.similar_users) j
ON TRUE
Expand Down
10 changes: 5 additions & 5 deletions listenbrainz/db/tests/test_similar_users.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def test_fetch_top_similar_users(self):
user_id_1 = db_user.create(1, "tom")
user_id_2 = db_user.create(2, "jerry")

similar_users_1 = {user_id_2: [0.42, 0.01]}
similar_users_2 = {user_id_1: [0.42, 0.02]}
similar_users_1 = {user_id_2: 0.42}
similar_users_2 = {user_id_1: 0.02}

with db.engine.begin() as connection:
connection.execute(sqlalchemy.text("""
Expand All @@ -30,6 +30,6 @@ def test_fetch_top_similar_users(self):

similar_users = get_top_similar_users()
assert len(similar_users) == 1
assert similar_users[0][0] == 'jerry'
assert similar_users[0][1] == 'tom'
assert similar_users[0][2] == "0.020"
assert similar_users[0][0] == "jerry"
assert similar_users[0][1] == "tom"
assert similar_users[0][2] == "0.420"
12 changes: 6 additions & 6 deletions listenbrainz/db/tests/test_user.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,9 @@ def test_get_similar_users(self):
user_id_22 = db_user.create(22, "twenty_two")
user_id_23 = db_user.create(23, "twenty_three")

similar_users_21 = {str(user_id_22): [0.4, .01], str(user_id_23): [0.7, 0.001]}
similar_users_22 = {str(user_id_21): [0.4, .01]}
similar_users_23 = {str(user_id_21): [0.7, .02]}
similar_users_21 = {str(user_id_22): 0.4, str(user_id_23): 0.7}
similar_users_22 = {str(user_id_21): 0.4}
similar_users_23 = {str(user_id_21): 0.7}

similar_users = {
str(user_id_21): similar_users_21,
Expand Down Expand Up @@ -230,9 +230,9 @@ def test_search(self):
{
"user_id": searcher_id,
"similar_users": json.dumps({
str(user_id_c): [0.42, 0.20],
str(user_id_l): [0.61, 0.25],
str(user_id_r): [0.87, 0.43]
str(user_id_c): 0.42,
str(user_id_l): 0.61,
str(user_id_r): 0.87
})
}
)
Expand Down
2 changes: 1 addition & 1 deletion listenbrainz/db/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]:
result = connection.execute(sqlalchemy.text("""
SELECT musicbrainz_id
, id
, value->0 AS similarity -- first element of array is local similarity, second is global_similarity
, value AS similarity -- each jsonb value is the similarity score itself (a scalar, no longer an array)
FROM recommendation.similar_user r
JOIN jsonb_each(r.similar_users) j
ON TRUE
Expand Down
4 changes: 2 additions & 2 deletions listenbrainz/tests/integration/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ def test_similar_users(self):

conn = db.engine.raw_connection()
with conn.cursor() as curs:
data = {self.user2['id']: (.123, 0.01)}
data = {self.user2['id']: 0.123}
curs.execute("""INSERT INTO recommendation.similar_user VALUES (%s, %s)""",
(self.user['id'], json.dumps(data)))
conn.commit()
Expand All @@ -676,7 +676,7 @@ def test_similar_users(self):
self.assert200(response)
data = json.loads(response.data)['payload']
self.assertEqual(data[0]['user_name'], self.user2['musicbrainz_id'])
self.assertEqual(data[0]['similarity'], .123)
self.assertEqual(data[0]['similarity'], 0.123)

response = self.client.get(url_for(
'api_v1.get_similar_to_user', user_name=self.user['musicbrainz_id'], other_user_name="muppet"))
Expand Down
10 changes: 5 additions & 5 deletions listenbrainz/tests/integration/test_feed_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ def create_and_follow_user(self, user: int, mb_row_id: int, name: str) -> dict:
db_user_relationship.insert(user, following_user['id'], 'follow')
return following_user

def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, global_similarity: float, name: str) -> dict:
def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, name: str) -> dict:
similar_user = db_user.get_or_create(mb_row_id, name)
self.similar_user_data[similar_user['id']] = (similarity, global_similarity)
self.similar_user_data[similar_user['id']] = similarity
with db.engine.begin() as connection:
connection.execute(text("""
INSERT INTO recommendation.similar_user (user_id, similar_users)
Expand Down Expand Up @@ -150,8 +150,8 @@ def test_it_sends_all_listens_for_users_that_are_similar(self):
payload = json.load(f)

self.similar_user_data = dict()
similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 0.1, 'similar_1')
similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 0.2, 'similar_2')
similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 'similar_1')
similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 'similar_2')

ts = int(time.time())
# Send 3 listens for the following_user_1
Expand All @@ -169,7 +169,7 @@ def test_it_sends_all_listens_for_users_that_are_similar(self):
self.assertEqual(response.json['status'], 'ok')

from datetime import timedelta
listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW/timedelta(seconds=1))
listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW / timedelta(seconds=1))

# Sending a listen with time difference slightly lesser than DEFAULT_LISTEN_EVENT_WINDOW_NEW
payload['payload'][0]['listened_at'] = ts - listenWindowMillisec + 1000
Expand Down
63 changes: 6 additions & 57 deletions listenbrainz_spark/similarity/user.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def create_messages(similar_users_df: DataFrame) -> dict:
message = {}
for row in itr:
message[row.user_id] = {
user.other_user_id: (user.similarity, user.global_similarity)
user.other_user_id: user.similarity
for user in row.similar_users
}
yield {
Expand All @@ -56,65 +56,14 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i
rows, cols = matrix.shape
similar_users = list()

# Calculate the global similarity scale
global_max_similarity = None
global_min_similarity = None
for x in range(rows):
row = []

# Calculate the minimum and maximum values for a user
for y in range(cols):

# Spark sometimes returns nan values and the way to get rid of them is to
# cast to a float and discard values that are non a number
value = float(matrix[x, y])
if x == y or math.isnan(value):
continue

if global_max_similarity is None:
global_max_similarity = value
global_min_similarity = value

global_max_similarity = max(value, global_max_similarity)
global_min_similarity = min(value, global_min_similarity)

global_similarity_range = global_max_similarity - global_min_similarity

for x in range(rows):
row = []
max_similarity = None
min_similarity = None

# Calculate the minimum and maximum values for a user
for y in range(cols):

# Spark sometimes returns nan values and the way to get rid of them is to
# cast to a float and discard values that are not a number
value = float(matrix[x, y])
if x == y or math.isnan(value):
if x == y or math.isnan(value) or value < 0:
continue

if max_similarity is None:
max_similarity = value
min_similarity = value

max_similarity = max(value, max_similarity)
min_similarity = min(value, min_similarity)

if max_similarity is not None and min_similarity is not None:
# Now apply the scale factor and flatten the results for a user
similarity_range = max_similarity - min_similarity
for y in range(cols):
value = float(matrix[x, y])
if x == y or math.isnan(value):
continue

row.append((x,
y,
(value - min_similarity) / similarity_range,
(value - global_min_similarity) / global_similarity_range))

similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users])
row.append((x, y, value))
similar_users.extend(sorted(row, key=itemgetter(2), reverse=True)[:max_num_users])

return similar_users

Expand Down Expand Up @@ -171,11 +120,11 @@ def get_similar_users_df(max_num_users: int):

similar_users_df = listenbrainz_spark.session.createDataFrame(
similar_users,
['spark_user_id', 'other_spark_user_id', 'similarity', 'global_similarity']
['spark_user_id', 'other_spark_user_id', 'similarity']
)\
.join(users_df, 'spark_user_id', 'inner')\
.join(other_users_df, 'other_spark_user_id', 'inner')\
.select('user_id', struct('other_user_id', 'similarity', 'global_similarity').alias('similar_user'))\
.select('user_id', struct('other_user_id', 'similarity').alias('similar_user'))\
.groupBy('user_id')\
.agg(collect_list('similar_user').alias('similar_users'))

Expand Down

0 comments on commit d842dc5

Please sign in to comment.