From 47618dc47490282b6095f3b7226e3d0417dabea8 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Tue, 31 Oct 2023 13:51:33 +0100 Subject: [PATCH 01/18] Copy over the core similarity fix into a clean branch for testing purposes. --- listenbrainz/db/similar_users.py | 2 +- listenbrainz_spark/similarity/user.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/listenbrainz/db/similar_users.py b/listenbrainz/db/similar_users.py index 197c06088c..f122452a61 100644 --- a/listenbrainz/db/similar_users.py +++ b/listenbrainz/db/similar_users.py @@ -130,7 +130,7 @@ def get_top_similar_users(count: int = 200): result = connection.execute(text(""" SELECT u.musicbrainz_id AS user_name , ou.musicbrainz_id AS other_user_name - , value->1 AS similarity -- first element of array is similarity, second is global_similarity + , value->0 AS similarity -- first element of array is similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 1e7dc3d4c7..65aea09e1c 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -109,9 +109,15 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i if x == y or math.isnan(value): continue + # scale from [-1, 1] to [0, 1], where closer is more similar + new_sim_value = (value / 2.0) + 0.5 + + # Append to the row, but invert first so that closer is less similar (a percentage) + new_sim_value = 1.0 - new_sim_value + row.append((x, y, - (value - min_similarity) / similarity_range, + new_sim_value, (value - global_min_similarity) / global_similarity_range)) similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users]) From 8e227bafde436c089144d4b9b15b0077dfc6ba4f Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Tue, 31 Oct 2023 15:43:16 +0100 Subject: [PATCH 02/18] Fix test. --- listenbrainz/db/tests/test_similar_users.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/listenbrainz/db/tests/test_similar_users.py b/listenbrainz/db/tests/test_similar_users.py index 8dbc9ce54a..149fa53937 100644 --- a/listenbrainz/db/tests/test_similar_users.py +++ b/listenbrainz/db/tests/test_similar_users.py @@ -32,4 +32,4 @@ def test_fetch_top_similar_users(self): assert len(similar_users) == 1 assert similar_users[0][0] == 'jerry' assert similar_users[0][1] == 'tom' - assert similar_users[0][2] == "0.020" + assert similar_users[0][2] == "0.420" From 1fa33cc0b43a20315e0a765922dda92007853978 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Tue, 31 Oct 2023 20:57:50 +0530 Subject: [PATCH 03/18] Update spark docker image to build pycurl and urllib3 --- docker/Dockerfile.spark.base | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.spark.base b/docker/Dockerfile.spark.base index a9359bf907..3bd10617ee 100644 --- a/docker/Dockerfile.spark.base +++ b/docker/Dockerfile.spark.base @@ -1,4 +1,4 @@ -ARG PYTHON_BASE_IMAGE_VERSION=3.8-20210115 +ARG PYTHON_BASE_IMAGE_VERSION=3.9-focal-20220315 FROM metabrainz/python:$PYTHON_BASE_IMAGE_VERSION ARG PYTHON_BASE_IMAGE_VERSION @@ -16,6 +16,7 @@ RUN apt-get update \ bsdmainutils \ xz-utils \ zip \ + libcurl4-openssl-dev \ && rm -rf /var/lib/apt/lists/* ENV DOCKERIZE_VERSION v0.6.1 From b845116276bce8ed9858643a20af1c36b6718e2b Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 2 Nov 2023 18:18:51 +0100 Subject: [PATCH 04/18] Add the UI portion to this PR --- frontend/js/src/stats/SimilarityScore.tsx | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/frontend/js/src/stats/SimilarityScore.tsx b/frontend/js/src/stats/SimilarityScore.tsx index 2364c3d10f..acc6289919 100644 --- a/frontend/js/src/stats/SimilarityScore.tsx +++ b/frontend/js/src/stats/SimilarityScore.tsx @@ -21,10 +21,9 @@ const getclassName = (similarityScore: number): string => { function SimilarityScore(props: SimilarityScoreProps) { const { user, type, similarityScore } = props; - // We transform the similarity score from a scale 0-1 to 0-10 - const adjustedSimilarityScore = Number((similarityScore * 10).toFixed(1)); + // We transform the similarity score from a scale 0-1 to 0-100 + const percentage = Number((similarityScore * 100).toFixed()); const className = getclassName(similarityScore); - const percentage = adjustedSimilarityScore * 10; return (
{type === "regular" ? (

- Your compatibility with {user?.name} is {adjustedSimilarityScore} - /10 + Your compatibility with {user?.name} is {percentage}%

) : ( -

{adjustedSimilarityScore}/10

+

{percentage}%

)}
); From 55b1ba82fce97a56d972e71d671bbc2407bb1a79 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 2 Nov 2023 20:37:53 +0100 Subject: [PATCH 05/18] Change the color scale to move it down a bit --- frontend/js/src/stats/SimilarityScore.tsx | 4 ++-- listenbrainz/db/user.py | 2 +- requirements.txt | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/frontend/js/src/stats/SimilarityScore.tsx b/frontend/js/src/stats/SimilarityScore.tsx index acc6289919..79ac5510d4 100644 --- a/frontend/js/src/stats/SimilarityScore.tsx +++ b/frontend/js/src/stats/SimilarityScore.tsx @@ -8,9 +8,9 @@ export type SimilarityScoreProps = { const getclassName = (similarityScore: number): string => { let className = ""; - if (similarityScore <= 0.3) { + if (similarityScore <= 0.2) { className = "red"; - } else if (similarityScore <= 0.7) { + } else if (similarityScore <= 0.4) { className = "orange"; } else { className = "purple"; diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index 29daeb86c6..f89392fbc5 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]: result = connection.execute(sqlalchemy.text(""" SELECT musicbrainz_id , id - , value->0 AS similarity -- first element of array is local similarity, second is global_similarity + , value->1 AS similarity -- first element of array is local similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE diff --git a/requirements.txt b/requirements.txt index 2723b901cc..2ec521e0f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -56,3 +56,4 @@ urllib3==1.26.18 orjson==3.8.7 dnspython==2.2.1 pyjwt[crypto]==2.7.0 +packaging~=23.1 From c03a2ce808cbf4e3be4dd9873f8f2df58951a73c Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 2 Nov 2023 20:46:22 +0100 Subject: [PATCH 06/18] Move tests over --- listenbrainz/db/tests/test_user.py | 12 ++++++------ listenbrainz/tests/integration/test_api.py | 2 +- listenbrainz/tests/integration/test_feed_api.py | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py index 4b8aa7017a..d30489882f 100644 --- a/listenbrainz/db/tests/test_user.py +++ b/listenbrainz/db/tests/test_user.py @@ -164,9 +164,9 @@ def test_get_similar_users(self): user_id_22 = db_user.create(22, "twenty_two") user_id_23 = db_user.create(23, "twenty_three") - similar_users_21 = {str(user_id_22): [0.4, .01], str(user_id_23): [0.7, 0.001]} - similar_users_22 = {str(user_id_21): [0.4, .01]} - similar_users_23 = {str(user_id_21): [0.7, .02]} + similar_users_21 = {str(user_id_22): 0.4, str(user_id_23): 0.7} + similar_users_22 = {str(user_id_21): 0.4} + similar_users_23 = {str(user_id_21): 0.7} similar_users = { str(user_id_21): similar_users_21, @@ -230,9 +230,9 @@ def test_search(self): { "user_id": searcher_id, "similar_users": json.dumps({ - str(user_id_c): [0.42, 0.20], - str(user_id_l): [0.61, 0.25], - str(user_id_r): [0.87, 0.43] + str(user_id_c): 0.42, + str(user_id_l): 0.61, + str(user_id_r): 0.87 }) } ) diff --git a/listenbrainz/tests/integration/test_api.py b/listenbrainz/tests/integration/test_api.py index f942e90b8b..8e614f8cdd 100644 --- a/listenbrainz/tests/integration/test_api.py +++ b/listenbrainz/tests/integration/test_api.py @@ -664,7 +664,7 @@ def test_similar_users(self): conn = db.engine.raw_connection() with conn.cursor() as curs: - data = {self.user2['id']: (.123, 0.01)} + data = {self.user2['id']: .123} curs.execute("""INSERT INTO recommendation.similar_user VALUES (%s, %s)""", (self.user['id'], json.dumps(data))) conn.commit() diff --git a/listenbrainz/tests/integration/test_feed_api.py b/listenbrainz/tests/integration/test_feed_api.py index ae0a318987..4eb2d6b60c 100644 --- a/listenbrainz/tests/integration/test_feed_api.py +++ b/listenbrainz/tests/integration/test_feed_api.py @@ -59,9 +59,9 @@ def create_and_follow_user(self, user: int, mb_row_id: int, name: str) -> dict: db_user_relationship.insert(user, following_user['id'], 'follow') return following_user - def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, global_similarity: float, name: str) -> dict: + def create_similar_user(self, similar_to_user: int, mb_row_id: int, similarity: float, name: str) -> dict: similar_user = db_user.get_or_create(mb_row_id, name) - self.similar_user_data[similar_user['id']] = (similarity, global_similarity) + self.similar_user_data[similar_user['id']] = similarity with db.engine.begin() as connection: connection.execute(text(""" INSERT INTO recommendation.similar_user (user_id, similar_users) @@ -150,8 +150,8 @@ def test_it_sends_all_listens_for_users_that_are_similar(self): payload = json.load(f) self.similar_user_data = dict() - similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 0.1, 'similar_1') - similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 0.2, 'similar_2') + similar_user_1 = self.create_similar_user(self.main_user['id'], 104, 0.1, 'similar_1') + similar_user_2 = self.create_similar_user(self.main_user['id'], 105, 0.2, 'similar_2') ts = int(time.time()) # Send 3 listens for the following_user_1 From a6068139cbe1e42df8fe5d050738f3b1030dc8b5 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 2 Nov 2023 20:59:08 +0100 Subject: [PATCH 07/18] Fix coloring tests --- .../js/tests/stats/SimilarityScore.test.tsx | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/frontend/js/tests/stats/SimilarityScore.test.tsx b/frontend/js/tests/stats/SimilarityScore.test.tsx index a95146f834..45e2e227f8 100644 --- a/frontend/js/tests/stats/SimilarityScore.test.tsx +++ b/frontend/js/tests/stats/SimilarityScore.test.tsx @@ -30,17 +30,24 @@ describe("SimilarityScore", () => { it("updates the class name based on similiarty score", async () => { /* sets class red for score 0.2 */ const wrapper = mount(); - expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual(true); + expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual(true); - /* sets class orange for score 0.5 */ - wrapper.setProps({ similarityScore: 0.57457 }); + /* sets class orange for score 0.15 */ + wrapper.setProps({ similarityScore: 0.15 }); + await waitForComponentToPaint(wrapper); + expect(wrapper.find(".progress").childAt(0).hasClass("red")).toEqual( + true + ); + + /* sets class purple for score 0.3 */ + wrapper.setProps({ similarityScore: 0.3 }); await waitForComponentToPaint(wrapper); expect(wrapper.find(".progress").childAt(0).hasClass("orange")).toEqual( true ); - /* sets class purple for score 0.9 */ - wrapper.setProps({ similarityScore: 0.945792 }); + /* sets class purple for score 0.6 */ + wrapper.setProps({ similarityScore: 0.6 }); await waitForComponentToPaint(wrapper); expect(wrapper.find(".progress").childAt(0).hasClass("purple")).toEqual( true From c259405630d10deff2f83453d036693150a0f591 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 9 Nov 2023 16:09:17 +0100 Subject: [PATCH 08/18] Attempt to fix tests --- listenbrainz/db/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index f89392fbc5..29daeb86c6 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]: result = connection.execute(sqlalchemy.text(""" SELECT musicbrainz_id , id - , value->1 AS similarity -- first element of array is local similarity, second is global_similarity + , value->0 AS similarity -- first element of array is local similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE From 0c6315e6e062e7ad6914c721f1cd3902eb442f2e Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Fri, 10 Nov 2023 18:38:30 +0100 Subject: [PATCH 09/18] Do not invert the cosine result. --- listenbrainz_spark/similarity/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 65aea09e1c..37257238c2 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -113,7 +113,7 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i new_sim_value = (value / 2.0) + 0.5 # Append to the row, but invert first so that closer is less similar (a percentage) - new_sim_value = 1.0 - new_sim_value + #new_sim_value = 1.0 - new_sim_value row.append((x, y, From 011a87967158c0b2fcb630691b31568ac8af8fbe Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Fri, 10 Nov 2023 19:21:32 +0100 Subject: [PATCH 10/18] Swap the two columns. --- listenbrainz_spark/similarity/user.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 37257238c2..3b3f12ed12 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -117,8 +117,8 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i row.append((x, y, - new_sim_value, - (value - global_min_similarity) / global_similarity_range)) + (value - global_min_similarity) / global_similarity_range, + new_sim_value)) similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users]) From 735a0c6be0354a202fc359208e3aba16351c9a04 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 30 Nov 2023 13:01:34 +0100 Subject: [PATCH 11/18] Lets use the value without inverting, that seems to work well. --- listenbrainz_spark/similarity/user.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 3b3f12ed12..929ef4ec71 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -109,16 +109,13 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i if x == y or math.isnan(value): continue - # scale from [-1, 1] to [0, 1], where closer is more similar + # scale from [-1, 1] to [0, 1] new_sim_value = (value / 2.0) + 0.5 - # Append to the row, but invert first so that closer is less similar (a percentage) - #new_sim_value = 1.0 - new_sim_value - row.append((x, y, - (value - global_min_similarity) / global_similarity_range, - new_sim_value)) + new_sim_value, + (value - global_min_similarity) / global_similarity_range)) similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users]) From 1c3338805ea9b7967959f3f121db099a5d6d80e6 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 30 Nov 2023 14:41:55 +0100 Subject: [PATCH 12/18] Put data in the right places. again. #fuckdyslexia --- listenbrainz_spark/similarity/user.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 929ef4ec71..80ecf45778 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -114,8 +114,8 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i row.append((x, y, - new_sim_value, - (value - global_min_similarity) / global_similarity_range)) + (value - global_min_similarity) / global_similarity_range, + new_sim_value)) similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users]) From 107f2a6725148395c56ff9775e2cbec5b6781927 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 30 Nov 2023 14:17:55 +0000 Subject: [PATCH 13/18] Use the right field. --- listenbrainz/db/user.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index 29daeb86c6..f89392fbc5 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]: result = connection.execute(sqlalchemy.text(""" SELECT musicbrainz_id , id - , value->0 AS similarity -- first element of array is local similarity, second is global_similarity + , value->1 AS similarity -- first element of array is local similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE From fe3d4454bcb870a2e133aa14a052fae945ab9afa Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 30 Nov 2023 14:42:56 +0000 Subject: [PATCH 14/18] Use the similarity value without any changes. --- frontend/js/src/stats/SimilarityScore.tsx | 4 ++-- listenbrainz_spark/similarity/user.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/frontend/js/src/stats/SimilarityScore.tsx b/frontend/js/src/stats/SimilarityScore.tsx index 79ac5510d4..28c5b8a207 100644 --- a/frontend/js/src/stats/SimilarityScore.tsx +++ b/frontend/js/src/stats/SimilarityScore.tsx @@ -8,9 +8,9 @@ export type SimilarityScoreProps = { const getclassName = (similarityScore: number): string => { let className = ""; - if (similarityScore <= 0.2) { + if (similarityScore <= 0.25) { className = "red"; - } else if (similarityScore <= 0.4) { + } else if (similarityScore <= 0.5) { className = "orange"; } else { className = "purple"; diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 80ecf45778..5105f85253 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -110,8 +110,8 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i continue # scale from [-1, 1] to [0, 1] - new_sim_value = (value / 2.0) + 0.5 - +# new_sim_value = (value / 2.0) + 0.5 + new_sim_value = value row.append((x, y, (value - global_min_similarity) / global_similarity_range, From 1f3eee5e373eaf5160205defe29a47d454abe0f7 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Thu, 30 Nov 2023 15:56:08 +0000 Subject: [PATCH 15/18] Change the similarity score range --- frontend/js/src/stats/SimilarityScore.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frontend/js/src/stats/SimilarityScore.tsx b/frontend/js/src/stats/SimilarityScore.tsx index 28c5b8a207..cc08cab2c2 100644 --- a/frontend/js/src/stats/SimilarityScore.tsx +++ b/frontend/js/src/stats/SimilarityScore.tsx @@ -8,9 +8,9 @@ export type SimilarityScoreProps = { const getclassName = (similarityScore: number): string => { let className = ""; - if (similarityScore <= 0.25) { + if (similarityScore <= 0.15) { className = "red"; - } else if (similarityScore <= 0.5) { + } else if (similarityScore <= 0.3) { className = "orange"; } else { className = "purple"; From d5490a3637b652ab38dc379d4e05f9daf55b7139 Mon Sep 17 00:00:00 2001 From: Robert Kaye Date: Fri, 1 Dec 2023 12:11:14 +0100 Subject: [PATCH 16/18] test test fix --- listenbrainz/db/tests/test_user.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py index d30489882f..f732ae6953 100644 --- a/listenbrainz/db/tests/test_user.py +++ b/listenbrainz/db/tests/test_user.py @@ -164,9 +164,9 @@ def test_get_similar_users(self): user_id_22 = db_user.create(22, "twenty_two") user_id_23 = db_user.create(23, "twenty_three") - similar_users_21 = {str(user_id_22): 0.4, str(user_id_23): 0.7} - similar_users_22 = {str(user_id_21): 0.4} - similar_users_23 = {str(user_id_21): 0.7} + similar_users_21 = {str(user_id_22): [0.4, .4], str(user_id_23): [0.7,.07]} + similar_users_22 = {str(user_id_21): [0.4, .4]} + similar_users_23 = {str(user_id_21): [0.7, .07]} similar_users = { str(user_id_21): similar_users_21, From 1790de78ed66713ec6fd3ff2f974319d87f778af Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 6 Dec 2023 01:31:49 +0530 Subject: [PATCH 17/18] do not scale but filter negative values from similarity (cherry picked from commit 933068bf09e8c3849219ff811e1f480459634941) --- listenbrainz_spark/similarity/user.py | 62 ++------------------------- 1 file changed, 4 insertions(+), 58 deletions(-) diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 5105f85253..5d3dbe11ab 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -39,7 +39,7 @@ def create_messages(similar_users_df: DataFrame) -> dict: message = {} for row in itr: message[row.user_id] = { - user.other_user_id: (user.similarity, user.global_similarity) + user.other_user_id: user.similarity for user in row.similar_users } yield { @@ -56,68 +56,14 @@ def threshold_similar_users(matrix: ndarray, max_num_users: int) -> List[Tuple[i rows, cols = matrix.shape similar_users = list() - # Calculate the global similarity scale - global_max_similarity = None - global_min_similarity = None for x in range(rows): row = [] - - # Calculate the minimum and maximum values for a user - for y in range(cols): - - # Spark sometimes returns nan values and the way to get rid of them is to - # cast to a float and discard values that are non a number - value = float(matrix[x, y]) - if x == y or math.isnan(value): - continue - - if global_max_similarity is None: - global_max_similarity = value - global_min_similarity = value - - global_max_similarity = max(value, global_max_similarity) - global_min_similarity = min(value, global_min_similarity) - - global_similarity_range = global_max_similarity - global_min_similarity - - for x in range(rows): - row = [] - max_similarity = None - min_similarity = None - - # Calculate the minimum and maximum values for a user for y in range(cols): - - # Spark sometimes returns nan values and the way to get rid of them is to - # cast to a float and discard values that are non a number value = float(matrix[x, y]) - if x == y or math.isnan(value): + if x == y or math.isnan(value) or value < 0: continue - - if max_similarity is None: - max_similarity = value - min_similarity = value - - max_similarity = max(value, max_similarity) - min_similarity = min(value, min_similarity) - - if max_similarity is not None and min_similarity is not None: - # Now apply the scale factor and flatten the results for a user - similarity_range = max_similarity - min_similarity - for y in range(cols): - value = float(matrix[x, y]) - if x == y or math.isnan(value): - continue - - # scale from [-1, 1] to [0, 1] -# new_sim_value = (value / 2.0) + 0.5 - new_sim_value = value - row.append((x, - y, - (value - global_min_similarity) / global_similarity_range, - new_sim_value)) - - similar_users.extend(sorted(row, key = itemgetter(2), reverse = True)[:max_num_users]) + row.append((x, y, value)) + similar_users.extend(sorted(row, key=itemgetter(2), reverse=True)[:max_num_users]) return similar_users From 6ab892d7458a5d02e26c30b429d406ffb69947e4 Mon Sep 17 00:00:00 2001 From: Kartik Ohri Date: Wed, 6 Dec 2023 02:04:36 +0530 Subject: [PATCH 18/18] keep only one similarity value (cherry picked from commit 6a2f4d1de56b65d97f0f94e1c17a76e0b745f896) --- listenbrainz/db/similar_users.py | 33 ++++++++----------- listenbrainz/db/tests/test_similar_users.py | 8 ++--- listenbrainz/db/tests/test_user.py | 6 ++-- listenbrainz/db/user.py | 2 +- listenbrainz/tests/integration/test_api.py | 4 +-- .../tests/integration/test_feed_api.py | 2 +- listenbrainz_spark/similarity/user.py | 4 +-- 7 files changed, 26 insertions(+), 33 deletions(-) diff --git a/listenbrainz/db/similar_users.py b/listenbrainz/db/similar_users.py index f122452a61..bcd1d283f7 100644 --- a/listenbrainz/db/similar_users.py +++ b/listenbrainz/db/similar_users.py @@ -42,17 +42,14 @@ def import_user_similarities(data): values.append((user, orjson.dumps(similar).decode("utf-8"))) user_count += 1 target_user_count += len(similar.keys()) - if len(values) == ROWS_PER_BATCH: - execute_values(curs, query, values, template=None) - values = [] - execute_values(curs, query, values, template=None) + execute_values(curs, query, values, page_size=ROWS_PER_BATCH, template=None) + conn.commit() except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Cannot import user similarites: %s" % str(err)) - return (0, 0.0, "Error: Cannot import user similarites: %s" % str(err)) + current_app.logger.error("Error: Cannot import user similarites: %s" % str(err)) + return 0, 0.0, "Error: Cannot import user similarites: %s" % str(err) # Next lookup user names and insert them into the new similar_users table try: @@ -84,9 +81,8 @@ def import_user_similarities(data): except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Cannot correlate user similarity user name: %s" % str(err)) - return (0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err)) + current_app.logger.error("Error: Cannot correlate user similarity user name: %s" % str(err)) + return 0, 0.0, "Error: Cannot correlate user similarity user name: %s" % str(err) # Finally rotate the table into place try: @@ -98,24 +94,21 @@ def import_user_similarities(data): conn.commit() except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Failed to rotate similar_users table into place: %s" % str(err)) - return (0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err)) + current_app.logger.error("Error: Failed to rotate similar_users table into place: %s" % str(err)) + return 0, 0.0, "Error: Failed to rotate similar_users table into place: %s" % str(err) # Last, delete the old table try: with conn.cursor() as curs: - curs.execute( - """DROP TABLE recommendation.delete_similar_user CASCADE""") + curs.execute("""DROP TABLE recommendation.delete_similar_user CASCADE""") conn.commit() except psycopg2.errors.OperationalError as err: conn.rollback() - current_app.logger.error( - "Error: Failed to clean up old similar user table: %s" % str(err)) - return (0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err)) + current_app.logger.error("Error: Failed to clean up old similar user table: %s" % str(err)) + return 0, 0.0, "Error: Failed to clean up old similar user table: %s" % str(err) - return (user_count, target_user_count / user_count, "") + return user_count, target_user_count / user_count, "" def get_top_similar_users(count: int = 200): @@ -130,7 +123,7 @@ def get_top_similar_users(count: int = 200): result = connection.execute(text(""" SELECT u.musicbrainz_id AS user_name , ou.musicbrainz_id AS other_user_name - , value->0 AS similarity -- first element of array is similarity, second is global_similarity + , value AS similarity -- first element of array is similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE diff --git a/listenbrainz/db/tests/test_similar_users.py b/listenbrainz/db/tests/test_similar_users.py index 149fa53937..4cc18bb7a4 100644 --- a/listenbrainz/db/tests/test_similar_users.py +++ b/listenbrainz/db/tests/test_similar_users.py @@ -14,8 +14,8 @@ def test_fetch_top_similar_users(self): user_id_1 = db_user.create(1, "tom") user_id_2 = db_user.create(2, "jerry") - similar_users_1 = {user_id_2: [0.42, 0.01]} - similar_users_2 = {user_id_1: [0.42, 0.02]} + similar_users_1 = {user_id_2: 0.42} + similar_users_2 = {user_id_1: 0.02} with db.engine.begin() as connection: connection.execute(sqlalchemy.text(""" @@ -30,6 +30,6 @@ def test_fetch_top_similar_users(self): similar_users = get_top_similar_users() assert len(similar_users) == 1 - assert similar_users[0][0] == 'jerry' - assert similar_users[0][1] == 'tom' + assert similar_users[0][0] == "jerry" + assert similar_users[0][1] == "tom" assert similar_users[0][2] == "0.420" diff --git a/listenbrainz/db/tests/test_user.py b/listenbrainz/db/tests/test_user.py index f732ae6953..d30489882f 100644 --- a/listenbrainz/db/tests/test_user.py +++ b/listenbrainz/db/tests/test_user.py @@ -164,9 +164,9 @@ def test_get_similar_users(self): user_id_22 = db_user.create(22, "twenty_two") user_id_23 = db_user.create(23, "twenty_three") - similar_users_21 = {str(user_id_22): [0.4, .4], str(user_id_23): [0.7,.07]} - similar_users_22 = {str(user_id_21): [0.4, .4]} - similar_users_23 = {str(user_id_21): [0.7, .07]} + similar_users_21 = {str(user_id_22): 0.4, str(user_id_23): 0.7} + similar_users_22 = {str(user_id_21): 0.4} + similar_users_23 = {str(user_id_21): 0.7} similar_users = { str(user_id_21): similar_users_21, diff --git a/listenbrainz/db/user.py b/listenbrainz/db/user.py index f89392fbc5..198fb903bf 100644 --- a/listenbrainz/db/user.py +++ b/listenbrainz/db/user.py @@ -462,7 +462,7 @@ def get_similar_users(user_id: int) -> Optional[list[dict]]: result = connection.execute(sqlalchemy.text(""" SELECT musicbrainz_id , id - , value->1 AS similarity -- first element of array is local similarity, second is global_similarity + , value AS similarity -- first element of array is local similarity, second is global_similarity FROM recommendation.similar_user r JOIN jsonb_each(r.similar_users) j ON TRUE diff --git a/listenbrainz/tests/integration/test_api.py b/listenbrainz/tests/integration/test_api.py index 0c589cff79..df9b644e30 100644 --- a/listenbrainz/tests/integration/test_api.py +++ b/listenbrainz/tests/integration/test_api.py @@ -666,7 +666,7 @@ def test_similar_users(self): conn = db.engine.raw_connection() with conn.cursor() as curs: - data = {self.user2['id']: .123} + data = {self.user2['id']: 0.123} curs.execute("""INSERT INTO recommendation.similar_user VALUES (%s, %s)""", (self.user['id'], json.dumps(data))) conn.commit() @@ -676,7 +676,7 @@ def test_similar_users(self): self.assert200(response) data = json.loads(response.data)['payload'] self.assertEqual(data[0]['user_name'], self.user2['musicbrainz_id']) - self.assertEqual(data[0]['similarity'], .123) + self.assertEqual(data[0]['similarity'], 0.123) response = self.client.get(url_for( 'api_v1.get_similar_to_user', user_name=self.user['musicbrainz_id'], other_user_name="muppet")) diff --git a/listenbrainz/tests/integration/test_feed_api.py b/listenbrainz/tests/integration/test_feed_api.py index 4eb2d6b60c..2d050b758d 100644 --- a/listenbrainz/tests/integration/test_feed_api.py +++ b/listenbrainz/tests/integration/test_feed_api.py @@ -169,7 +169,7 @@ def test_it_sends_all_listens_for_users_that_are_similar(self): self.assertEqual(response.json['status'], 'ok') from datetime import timedelta - listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW/timedelta(seconds=1)) + listenWindowMillisec = int(DEFAULT_LISTEN_EVENT_WINDOW_NEW / timedelta(seconds=1)) # Sending a listen with time difference slightly lesser than DEFAULT_LISTEN_EVENT_WINDOW_NEW payload['payload'][0]['listened_at'] = ts - listenWindowMillisec + 1000 diff --git a/listenbrainz_spark/similarity/user.py b/listenbrainz_spark/similarity/user.py index 5d3dbe11ab..55abbf83aa 100644 --- a/listenbrainz_spark/similarity/user.py +++ b/listenbrainz_spark/similarity/user.py @@ -120,11 +120,11 @@ def get_similar_users_df(max_num_users: int): similar_users_df = listenbrainz_spark.session.createDataFrame( similar_users, - ['spark_user_id', 'other_spark_user_id', 'similarity', 'global_similarity'] + ['spark_user_id', 'other_spark_user_id', 'similarity'] )\ .join(users_df, 'spark_user_id', 'inner')\ .join(other_users_df, 'other_spark_user_id', 'inner')\ - .select('user_id', struct('other_user_id', 'similarity', 'global_similarity').alias('similar_user'))\ + .select('user_id', struct('other_user_id', 'similarity').alias('similar_user'))\ .groupBy('user_id')\ .agg(collect_list('similar_user').alias('similar_users'))