diff --git a/frontend/js/src/stats/SimilarityScore.tsx b/frontend/js/src/stats/SimilarityScore.tsx
index 2364c3d10f..cc08cab2c2 100644
--- a/frontend/js/src/stats/SimilarityScore.tsx
+++ b/frontend/js/src/stats/SimilarityScore.tsx
@@ -8,9 +8,9 @@ export type SimilarityScoreProps = {
const getclassName = (similarityScore: number): string => {
let className = "";
- if (similarityScore <= 0.3) {
+ if (similarityScore <= 0.15) {
className = "red";
- } else if (similarityScore <= 0.7) {
+ } else if (similarityScore <= 0.3) {
className = "orange";
} else {
className = "purple";
@@ -21,10 +21,9 @@ const getclassName = (similarityScore: number): string => {
function SimilarityScore(props: SimilarityScoreProps) {
const { user, type, similarityScore } = props;
- // We transform the similarity score from a scale 0-1 to 0-10
- const adjustedSimilarityScore = Number((similarityScore * 10).toFixed(1));
+ // We transform the similarity score from a scale 0-1 to 0-100
+ const percentage = Number((similarityScore * 100).toFixed());
const className = getclassName(similarityScore);
- const percentage = adjustedSimilarityScore * 10;
return (
{type === "regular" ? (
- Your compatibility with {user?.name} is {adjustedSimilarityScore}
- /10
+ Your compatibility with {user?.name} is {percentage}%
) : (
-
{adjustedSimilarityScore}/10
+
{percentage}%
)}
);
diff --git a/frontend/js/tests/stats/SimilarityScore.test.tsx b/frontend/js/tests/stats/SimilarityScore.test.tsx
index a95146f834..45e2e227f8 100644
--- a/frontend/js/tests/stats/SimilarityScore.test.tsx
+++ b/frontend/js/tests/stats/SimilarityScore.test.tsx
@@ -30,17 +30,24 @@ describe("SimilarityScore", () => {
it("updates the class name based on similiarty score", async () => {
/* sets class red for score 0.2 */
const wrapper = mount();
- expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual(true);
+ expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual(true);
- /* sets class orange for score 0.5 */
- wrapper.setProps({ similarityScore: 0.57457 });
+ /* sets class red for score 0.15 */
+ wrapper.setProps({ similarityScore: 0.15 });
+ await waitForComponentToPaint(wrapper);
+ expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual(
+ true
+ );
+
+ /* sets class orange for score 0.3 */
+ wrapper.setProps({ similarityScore: 0.3 });
await waitForComponentToPaint(wrapper);
expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual(
true
);
- /* sets class purple for score 0.9 */
- wrapper.setProps({ similarityScore: 0.945792 });
+ /* sets class purple for score 0.6 */
+ wrapper.setProps({ similarityScore: 0.6 });
await waitForComponentToPaint(wrapper);
expect(wrapper.find(".progress").childAt(0).hasClass("purple")).toEqual(
true
diff --git a/listenbrainz/db/similar_users.py b/listenbrainz/db/similar_users.py
index 197c06088c..bcd1d283f7 100644
--- a/listenbrainz/db/similar_users.py
+++ b/listenbrainz/db/similar_users.py
@@ -42,17 +42,14 @@ def import_user_similarities(data):
values.append((user, orjson.dumps(similar).decode("utf-8")))
user_count += 1
target_user_count += len(similar.keys())
- if len(values) == ROWS_PER_BATCH:
- execute_values(curs, query, values, template=None)
- values = []
- execute_values(curs, query, values, template=None)
+ execute_values(curs, query, values, page_size=ROWS_PER_BATCH, template=None)
+
conn.commit()
except psycopg2.errors.OperationalError as err:
conn.rollback()
- current_app.logger.error(
- "Error: Cannot import user similarites: %s" % str(err))
- return (0, 0.0, "Error: Cannot import user similarites: %s" % str(err))
+ current_app.logger.error("Error: Cannot import user similarites: %s" % str(err))
+ return 0, 0.0, "Error: Cannot import user similarites: %s" % str(err)
# Next lookup user names and insert them into the new similar_users table
try:
@@ -84,9 +81,8 @@ def import_user_similarities(data):
except psycopg2.errors.OperationalError as err:
conn.rollback()
- current_app.logger.error(
- "Error: Cannot correlate user similarity user name: %s" % str(err))
- return (0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err))
+ current_app.logger.error("Error: Cannot correlate user similarity user name: %s" % str(err))
+ return 0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err)
# Finally rotate the table into place
try:
@@ -98,24 +94,21 @@ def import_user_similarities(data):
conn.commit()
except psycopg2.errors.OperationalError as err:
conn.rollback()
- current_app.logger.error(
- "Error: Failed to rotate similar_users table into place: %s" % str(err))
- return (0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err))
+ current_app.logger.error("Error: Failed to rotate similar_users table into place: %s" % str(err))
+ return 0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err)
# Last, delete the old table
try:
with conn.cursor() as curs:
- curs.execute(
- """DROP TABLE recommendation.delete_similar_user CASCADE""")
+ curs.execute("""DROP TABLE recommendation.delete_similar_user CASCADE""")
conn.commit()
except psycopg2.errors.OperationalError as err:
conn.rollback()
- current_app.logger.error(
- "Error: Failed to clean up old similar user table: %s" % str(err))
- return (0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err))
+ current_app.logger.error("Error: Failed to clean up old similar user table: %s" % str(err))
+ return 0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err)
- return (user_count, target_user_count / user_count, "")
+ return user_count, target_user_count / user_count, ""
def get_top_similar_users(count: int = 200):
@@ -130,7 +123,7 @@ def get_top_similar_users(count: int = 200):
result = connection.execute(text("""
SELECT u.musicbrainz_id AS user_name
, ou.musicbrainz_id AS other_user_name
- , value->1 AS similarity -- first element of array is similarity, second is global_similarity
+ , value AS similarity -- value is now the scalar similarity score (no longer a [similarity, global_similarity] array)
FROM recommendation.similar_user r
JOIN jsonb_each(r.similar_users) j
ON TRUE
diff --git a/listenbrainz/db/tests/test_similar_users.py b/listenbrainz/db/tests/test_similar_users.py
index 8dbc9ce54a..4cc18bb7a4 100644
--- a/listenbrainz/db/tests/test_similar_users.py
+++ b/listenbrainz/db/tests/test_similar_users.py
@@ -14,8 +14,8 @@ def test_fetch_top_similar_users(self):
user_id_1 = db_user.create(1, "tom")
user_id_2 = db_user.create(2, "jerry")
- similar_users_1 = {user_id_2: [0.42, 0.01]}
- similar_users_2 = {user_id_1: [0.42, 0.02]}
+ similar_users_1 = {user_id_2: 0.42}
+ similar_users_2 = {user_id_1: 0.02}
with db.engine.begin() as connection:
connection.execute(sqlalchemy.text("""
@@ -30,6 +30,6 @@ def test_fetch_top_similar_users(self):
similar_users = get_top_similar_users()
assert len(similar_users) == 1
- assert similar_users[0][0] == 'jerry'
- assert similar_users[0][1] == 'tom'
- assert similar_users[0][2] == "0.020"
+ assert similar_users[0][0] == "jerry"
+ assert similar_users[0][1] == "tom"
+ assert similar_users[0][2] == "0.420"
diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py
index 4b8aa7017a..d30489882f 100644
--- a/listenbrainz/db/tests/test_user.py
+++ b/listenbrainz/db/tests/test_user.py
@@ -164,9 +164,9 @@ def test_get_similar_users(self):
user_id_22 = db_user.create(22, "twenty_two")
user_id_23 = db_user.create(23, "twenty_three")
- similar_users_21 = {str(user_id_22): [0.4, .01], str(user_id_23): [0.7, 0.001]}
- similar_users_22 = {str(user_id_21): [0.4, .01]}
- similar_users_23 = {str(user_id_21): [0.7, .02]}
+ similar_users_21 = {str(user_id_22): 0.4, str(user_id_23): 0.7}
+ similar_users_22 = {str(user_id_21): 0.4}
+ similar_users_23 = {str(user_id_21): 0.7}
similar_users = {
str(user_id_21): similar_users_21,
@@ -230,9 +230,9 @@ def test_search(self):
{
"user_id": searcher_id,
"similar_users": json.dumps({
- str(user_id_c): [0.42, 0.20],
- str(user_id_l): [0.61, 0.25],
- str(user_id_r): [0.87, 0.43]
+ str(user_id_c): 0.42,
+ str(user_id_l): 0.61,
+ str(user_id_r): 0.87
})
}
)
diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py
index 29daeb86c6..198fb903bf 100644
--- a/listenbrainz/db/user.py
+++ b/listenbrainz/db/user.py
@@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]:
result = connection.execute(sqlalchemy.text("""
SELECT musicbrainz_id
, id
- , value->0 AS similarity -- first element of array is local similarity, second is global_similarity
+ , value AS similarity -- value is now the scalar similarity score (no longer a [local, global] similarity array)
FROM recommendation.similar_user r
JOIN jsonb_each(r.similar_users) j
ON TRUE
diff --git a/listenbrainz/tests/integration/test_api.py b/listenbrainz/tests/integration/test_api.py
index 4ef87204be..df9b644e30 100644
--- a/listenbrainz/tests/integration/test_api.py
+++ b/listenbrainz/tests/integration/test_api.py
@@ -666,7 +666,7 @@ def test_similar_users(self):
conn = db.engine.raw_connection()
with conn.cursor() as curs:
- data = {self.user2['id']: (.123, 0.01)}
+ data = {self.user2['id']: 0.123}
curs.execute("""INSERT INTO recommendation.similar_user VALUES (%s, %s)""",
(self.user['id'], json.dumps(data)))
conn.commit()
@@ -676,7 +676,7 @@ def test_similar_users(self):
self.assert200(response)
data = json.loads(response.data)['payload']
self.assertEqual(data[0]['user_name'], self.user2['musicbrainz_id'])
- self.assertEqual(data[0]['similarity'], .123)
+ self.assertEqual(data[0]['similarity'], 0.123)
response = self.client.get(url_for(
'api_v1.get_similar_to_user', user_name=self.user['musicbrainz_id'], other_user_name="muppet"))
diff --git a/listenbrainz/tests/integration/test_feed_api.py b/listenbrainz/tests/integration/test_feed_api.py
index ae0a318987..2d050b758d 100644
--- a/listenbrainz/tests/integration/test_feed_api.py
+++ b/listenbrainz/tests/integration/test_feed_api.py
@@ -59,9 +59,9 @@ def create_and_follow_user(self, user: int, mb_row_id: int, name: str) -> dict:
db_user_relationship.insert(user, following_user['id'], 'follow')
return following_user
- def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, global_similarity: float, name: str) -> dict:
+ def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, name: str) -> dict:
similar_user = db_user.get_or_create(mb_row_id, name)
- self.similar_user_data[similar_user['id']] = (similarity, global_similarity)
+ self.similar_user_data[similar_user['id']] = similarity
with db.engine.begin() as connection:
connection.execute(text("""
INSERT INTO recommendation.similar_user (user_id, similar_users)
@@ -150,8 +150,8 @@ def test_it_sends_all_listens_for_users_that_are_similar(self):
payload = json.load(f)
self.similar_user_data = dict()
- similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 0.1, 'similar_1')
- similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 0.2, 'similar_2')
+ similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 'similar_1')
+ similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 'similar_2')
ts = int(time.time())
# Send 3 listens for the following_user_1
@@ -169,7 +169,7 @@ def test_it_sends_all_listens_for_users_that_are_similar(self):
self.assertEqual(response.json['status'], 'ok')
from datetime import timedelta
- listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW/timedelta(seconds=1))
+ listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW / timedelta(seconds=1))
# Sending a listen with time difference slightly lesser than DEFAULT_LISTEN_EVENT_WINDOW_NEW
payload['payload'][0]['listened_at'] = ts - listenWindowMillisec + 1000
diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py
index 1e7dc3d4c7..55abbf83aa 100644
--- a/listenbrainz_spark/similarity/user.py
+++ b/listenbrainz_spark/similarity/user.py
@@ -39,7 +39,7 @@ def create_messages(similar_users_df: DataFrame) -> dict:
message = {}
for row in itr:
message[row.user_id] = {
- user.other_user_id: (user.similarity, user.global_similarity)
+ user.other_user_id: user.similarity
for user in row.similar_users
}
yield {
@@ -56,65 +56,14 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i
rows, cols = matrix.shape
similar_users = list()
- # Calculate the global similarity scale
- global_max_similarity = None
- global_min_similarity = None
for x in range(rows):
row = []
-
- # Calculate the minimum and maximum values for a user
- for y in range(cols):
-
- # Spark sometimes returns nan values and the way to get rid of them is to
- # cast to a float and discard values that are non a number
- value = float(matrix[x, y])
- if x == y or math.isnan(value):
- continue
-
- if global_max_similarity is None:
- global_max_similarity = value
- global_min_similarity = value
-
- global_max_similarity = max(value, global_max_similarity)
- global_min_similarity = min(value, global_min_similarity)
-
- global_similarity_range = global_max_similarity - global_min_similarity
-
- for x in range(rows):
- row = []
- max_similarity = None
- min_similarity = None
-
- # Calculate the minimum and maximum values for a user
for y in range(cols):
-
- # Spark sometimes returns nan values and the way to get rid of them is to
- # cast to a float and discard values that are non a number
value = float(matrix[x, y])
- if x == y or math.isnan(value):
+ if x == y or math.isnan(value) or value < 0:
continue
-
- if max_similarity is None:
- max_similarity = value
- min_similarity = value
-
- max_similarity = max(value, max_similarity)
- min_similarity = min(value, min_similarity)
-
- if max_similarity is not None and min_similarity is not None:
- # Now apply the scale factor and flatten the results for a user
- similarity_range = max_similarity - min_similarity
- for y in range(cols):
- value = float(matrix[x, y])
- if x == y or math.isnan(value):
- continue
-
- row.append((x,
- y,
- (value - min_similarity) / similarity_range,
- (value - global_min_similarity) / global_similarity_range))
-
- similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users])
+ row.append((x, y, value))
+ similar_users.extend(sorted(row, key=itemgetter(2), reverse=True)[:max_num_users])
return similar_users
@@ -171,11 +120,11 @@ def get_similar_users_df(max_num_users: int):
similar_users_df = listenbrainz_spark.session.createDataFrame(
similar_users,
- ['spark_user_id', 'other_spark_user_id', 'similarity', 'global_similarity']
+ ['spark_user_id', 'other_spark_user_id', 'similarity']
)\
.join(users_df, 'spark_user_id', 'inner')\
.join(other_users_df, 'other_spark_user_id', 'inner')\
- .select('user_id', struct('other_user_id', 'similarity', 'global_similarity').alias('similar_user'))\
+ .select('user_id', struct('other_user_id', 'similarity').alias('similar_user'))\
.groupBy('user_id')\
.agg(collect_list('similar_user').alias('similar_users'))