diff --git a/frontend/js/src/stats/SimilarityScore.tsx b/frontend/js/src/stats/SimilarityScore.tsx index 2364c3d10f..cc08cab2c2 100644 --- a/frontend/js/src/stats/SimilarityScore.tsx +++ b/frontend/js/src/stats/SimilarityScore.tsx @@ -8,9 +8,9 @@ export type SimilarityScoreProps = { const getclassName = (similarityScore: number): string => { let className = ""; - if (similarityScore <= 0.3) { + if (similarityScore <= 0.15) { className = "red"; - } else if (similarityScore <= 0.7) { + } else if (similarityScore <= 0.3) { className = "orange"; } else { className = "purple"; @@ -21,10 +21,9 @@ const getclassName = (similarityScore: number): string => { function SimilarityScore(props: SimilarityScoreProps) { const { user, type, similarityScore } = props; - // We transform the similarity score from a scale 0-1 to 0-10 - const adjustedSimilarityScore = Number((similarityScore * 10).toFixed(1)); + // We transform the similarity score from a scale 0-1 to 0-100 + const percentage = Number((similarityScore * 100).toFixed()); const className = getclassName(similarityScore); - const percentage = adjustedSimilarityScore * 10; return (
{type === "regular" ? (

- Your compatibility with {user?.name} is {adjustedSimilarityScore} - /10 + Your compatibility with {user?.name} is {percentage}%

) : ( -

{adjustedSimilarityScore}/10

+

{percentage}%

)}
); diff --git a/frontend/js/tests/stats/SimilarityScore.test.tsx b/frontend/js/tests/stats/SimilarityScore.test.tsx index a95146f834..45e2e227f8 100644 --- a/frontend/js/tests/stats/SimilarityScore.test.tsx +++ b/frontend/js/tests/stats/SimilarityScore.test.tsx @@ -30,17 +30,24 @@ describe("SimilarityScore", () => { it("updates the class name based on similiarty score", async () => { /* sets class red for score 0.2 */ const wrapper = mount(); - expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual(true); + expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual(true); - /* sets class orange for score 0.5 */ - wrapper.setProps({ similarityScore: 0.57457 }); + /* sets class orange for score 0.15 */ + wrapper.setProps({ similarityScore: 0.15 }); + await waitForComponentToPaint(wrapper); + expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual( + true + ); + + /* sets class purple for score 0.3 */ + wrapper.setProps({ similarityScore: 0.3 }); await waitForComponentToPaint(wrapper); expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual( true ); - /* sets class purple for score 0.9 */ - wrapper.setProps({ similarityScore: 0.945792 }); + /* sets class purple for score 0.6 */ + wrapper.setProps({ similarityScore: 0.6 }); await waitForComponentToPaint(wrapper); expect(wrapper.find(".progress").childAt(0).hasClass("purple")).toEqual( true diff --git a/listenbrainz/db/similar_users.py b/listenbrainz/db/similar_users.py index 197c06088c..bcd1d283f7 100644 --- a/listenbrainz/db/similar_users.py +++ b/listenbrainz/db/similar_users.py @@ -42,17 +42,14 @@ def import_user_similarities(data): values.append((user, orjson.dumps(similar).decode("utf-8"))) user_count += 1 target_user_count += len(similar.keys()) - if len(values) == ROWS_PER_BATCH: - execute_values(curs, query, values, template=None) - values = [] - execute_values(curs, query, values, template=None) + execute_values(curs, query, values, page_size=ROWS_PER_BATCH, template=None) + conn.commit() except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Cannot import user similarites: %s" % str(err)) - return (0, 0.0, "Error: Cannot import user similarites: %s" % str(err)) + current_app.logger.error("Error: Cannot import user similarites: %s" % str(err)) + return 0, 0.0, "Error: Cannot import user similarites: %s" % str(err) # Next lookup user names and insert them into the new similar_users table try: @@ -84,9 +81,8 @@ def import_user_similarities(data): except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Cannot correlate user similarity user name: %s" % str(err)) - return (0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err)) + current_app.logger.error("Error: Cannot correlate user similarity user name: %s" % str(err)) + return 0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err) # Finally rotate the table into place try: @@ -98,24 +94,21 @@ def import_user_similarities(data): conn.commit() except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Failed to rotate similar_users table into place: %s" % str(err)) - return (0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err)) + current_app.logger.error("Error: Failed to rotate similar_users table into place: %s" % str(err)) + return 0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err) # Last, delete the old table try: with conn.cursor() as curs: - curs.execute( - """DROP TABLE recommendation.delete_similar_user CASCADE""") + curs.execute("""DROP TABLE recommendation.delete_similar_user CASCADE""") conn.commit() except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Failed to clean up old similar user table: %s" % str(err)) - return (0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err)) + current_app.logger.error("Error: Failed to clean up old similar user table: %s" % str(err)) + return 0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err) - return (user_count, target_user_count / user_count, "") + return user_count, target_user_count / user_count, "" def get_top_similar_users(count: int = 200): @@ -130,7 +123,7 @@ def get_top_similar_users(count: int = 200): result = connection.execute(text(""" SELECT u.musicbrainz_id AS user_name , ou.musicbrainz_id AS other_user_name - , value->1 AS similarity -- first element of array is similarity, second is global_similarity + , value AS similarity -- first element of array is similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE diff --git a/listenbrainz/db/tests/test_similar_users.py b/listenbrainz/db/tests/test_similar_users.py index 8dbc9ce54a..4cc18bb7a4 100644 --- a/listenbrainz/db/tests/test_similar_users.py +++ b/listenbrainz/db/tests/test_similar_users.py @@ -14,8 +14,8 @@ def test_fetch_top_similar_users(self): user_id_1 = db_user.create(1, "tom") user_id_2 = db_user.create(2, "jerry") - similar_users_1 = {user_id_2: [0.42, 0.01]} - similar_users_2 = {user_id_1: [0.42, 0.02]} + similar_users_1 = {user_id_2: 0.42} + similar_users_2 = {user_id_1: 0.02} with db.engine.begin() as connection: connection.execute(sqlalchemy.text(""" @@ -30,6 +30,6 @@ def test_fetch_top_similar_users(self): similar_users = get_top_similar_users() assert len(similar_users) == 1 - assert similar_users[0][0] == 'jerry' - assert similar_users[0][1] == 'tom' - assert similar_users[0][2] == "0.020" + assert similar_users[0][0] == "jerry" + assert similar_users[0][1] == "tom" + assert similar_users[0][2] == "0.420" diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py index 4b8aa7017a..d30489882f 100644 --- a/listenbrainz/db/tests/test_user.py +++ b/listenbrainz/db/tests/test_user.py @@ -164,9 +164,9 @@ def test_get_similar_users(self): user_id_22 = db_user.create(22, "twenty_two") user_id_23 = db_user.create(23, "twenty_three") - similar_users_21 = {str(user_id_22): [0.4, .01], str(user_id_23): [0.7, 0.001]} - similar_users_22 = {str(user_id_21): [0.4, .01]} - similar_users_23 = {str(user_id_21): [0.7, .02]} + similar_users_21 = {str(user_id_22): 0.4, str(user_id_23): 0.7} + similar_users_22 = {str(user_id_21): 0.4} + similar_users_23 = {str(user_id_21): 0.7} similar_users = { str(user_id_21): similar_users_21, @@ -230,9 +230,9 @@ def test_search(self): { "user_id": searcher_id, "similar_users": json.dumps({ - str(user_id_c): [0.42, 0.20], - str(user_id_l): [0.61, 0.25], - str(user_id_r): [0.87, 0.43] + str(user_id_c): 0.42, + str(user_id_l): 0.61, + str(user_id_r): 0.87 }) } ) diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index 29daeb86c6..198fb903bf 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]: result = connection.execute(sqlalchemy.text(""" SELECT musicbrainz_id , id - , value->0 AS similarity -- first element of array is local similarity, second is global_similarity + , value AS similarity -- first element of array is local similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE diff --git a/listenbrainz/tests/integration/test_api.py b/listenbrainz/tests/integration/test_api.py index 4ef87204be..df9b644e30 100644 --- a/listenbrainz/tests/integration/test_api.py +++ b/listenbrainz/tests/integration/test_api.py @@ -666,7 +666,7 @@ def test_similar_users(self): conn = db.engine.raw_connection() with conn.cursor() as curs: - data = {self.user2['id']: (.123, 0.01)} + data = {self.user2['id']: 0.123} curs.execute("""INSERT INTO recommendation.similar_user VALUES (%s, %s)""", (self.user['id'], json.dumps(data))) conn.commit() @@ -676,7 +676,7 @@ def test_similar_users(self): self.assert200(response) data = json.loads(response.data)['payload'] self.assertEqual(data[0]['user_name'], self.user2['musicbrainz_id']) - self.assertEqual(data[0]['similarity'], .123) + self.assertEqual(data[0]['similarity'], 0.123) response = self.client.get(url_for( 'api_v1.get_similar_to_user', user_name=self.user['musicbrainz_id'], other_user_name="muppet")) diff --git a/listenbrainz/tests/integration/test_feed_api.py b/listenbrainz/tests/integration/test_feed_api.py index ae0a318987..2d050b758d 100644 --- a/listenbrainz/tests/integration/test_feed_api.py +++ b/listenbrainz/tests/integration/test_feed_api.py @@ -59,9 +59,9 @@ def create_and_follow_user(self, user: int, mb_row_id: int, name: str) -> dict: db_user_relationship.insert(user, following_user['id'], 'follow') return following_user - def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, global_similarity: float, name: str) -> dict: + def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, name: str) -> dict: similar_user = db_user.get_or_create(mb_row_id, name) - self.similar_user_data[similar_user['id']] = (similarity, global_similarity) + self.similar_user_data[similar_user['id']] = similarity with db.engine.begin() as connection: connection.execute(text(""" INSERT INTO recommendation.similar_user (user_id, similar_users) @@ -150,8 +150,8 @@ def test_it_sends_all_listens_for_users_that_are_similar(self): payload = json.load(f) self.similar_user_data = dict() - similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 0.1, 'similar_1') - similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 0.2, 'similar_2') + similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 'similar_1') + similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 'similar_2') ts = int(time.time()) # Send 3 listens for the following_user_1 @@ -169,7 +169,7 @@ def test_it_sends_all_listens_for_users_that_are_similar(self): self.assertEqual(response.json['status'], 'ok') from datetime import timedelta - listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW/timedelta(seconds=1)) + listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW / timedelta(seconds=1)) # Sending a listen with time difference slightly lesser than DEFAULT_LISTEN_EVENT_WINDOW_NEW payload['payload'][0]['listened_at'] = ts - listenWindowMillisec + 1000 diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 1e7dc3d4c7..55abbf83aa 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -39,7 +39,7 @@ def create_messages(similar_users_df: DataFrame) -> dict: message = {} for row in itr: message[row.user_id] = { - user.other_user_id: (user.similarity, user.global_similarity) + user.other_user_id: user.similarity for user in row.similar_users } yield { @@ -56,65 +56,14 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i rows, cols = matrix.shape similar_users = list() - # Calculate the global similarity scale - global_max_similarity = None - global_min_similarity = None for x in range(rows): row = [] - - # Calculate the minimum and maximum values for a user - for y in range(cols): - - # Spark sometimes returns nan values and the way to get rid of them is to - # cast to a float and discard values that are non a number - value = float(matrix[x, y]) - if x == y or math.isnan(value): - continue - - if global_max_similarity is None: - global_max_similarity = value - global_min_similarity = value - - global_max_similarity = max(value, global_max_similarity) - global_min_similarity = min(value, global_min_similarity) - - global_similarity_range = global_max_similarity - global_min_similarity - - for x in range(rows): - row = [] - max_similarity = None - min_similarity = None - - # Calculate the minimum and maximum values for a user for y in range(cols): - - # Spark sometimes returns nan values and the way to get rid of them is to - # cast to a float and discard values that are non a number value = float(matrix[x, y]) - if x == y or math.isnan(value): + if x == y or math.isnan(value) or value < 0: continue - - if max_similarity is None: - max_similarity = value - min_similarity = value - - max_similarity = max(value, max_similarity) - min_similarity = min(value, min_similarity) - - if max_similarity is not None and min_similarity is not None: - # Now apply the scale factor and flatten the results for a user - similarity_range = max_similarity - min_similarity - for y in range(cols): - value = float(matrix[x, y]) - if x == y or math.isnan(value): - continue - - row.append((x, - y, - (value - min_similarity) / similarity_range, - (value - global_min_similarity) / global_similarity_range)) - - similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users]) + row.append((x, y, value)) + similar_users.extend(sorted(row, key=itemgetter(2), reverse=True)[:max_num_users]) return similar_users @@ -171,11 +120,11 @@ def get_similar_users_df(max_num_users: int): similar_users_df = listenbrainz_spark.session.createDataFrame( similar_users, - ['spark_user_id', 'other_spark_user_id', 'similarity', 'global_similarity'] + ['spark_user_id', 'other_spark_user_id', 'similarity'] )\ .join(users_df, 'spark_user_id', 'inner')\ .join(other_users_df, 'other_spark_user_id', 'inner')\ - .select('user_id', struct('other_user_id', 'similarity', 'global_similarity').alias('similar_user'))\ + .select('user_id', struct('other_user_id', 'similarity').alias('similar_user'))\ .groupBy('user_id')\ .agg(collect_list('similar_user').alias('similar_users'))