From 543d353b0c5bc7667409927e00027b70630c1ff6 Mon Sep 17 00:00:00 2001
From: Cullen Watson <cullen@cullenwatson.com>
Date: Sun, 20 Oct 2024 01:53:22 -0500
Subject: [PATCH 1/3] enh: fetch comments on a post

---
 README.md                     | 16 ++++++++
 pyproject.toml                |  2 +-
 staffspy/__init__.py          | 15 +++++++
 staffspy/linkedin/comments.py | 74 +++++++++++++++++++++++++++++++++++
 staffspy/utils/models.py      | 20 ++++++++++
 5 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 staffspy/linkedin/comments.py

diff --git a/README.md b/README.md
index 36ed4e3..ef757a2 100644
--- a/README.md
+++ b/README.md
@@ -54,8 +54,14 @@ staff = account.scrape_staff(
 users = account.scrape_users(
     user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
 )
+
+# fetch all comments on two of Bill Gates' posts
+comments = account.scrape_comments(
+    ['7252421958540091394','7253083989547048961']
+)
 staff.to_csv("staff.csv", index=False)
 users.to_csv("users.csv", index=False)
+comments.to_csv("comments.csv", index=False)
 ```
 
 #### Browser login
@@ -139,6 +145,16 @@ Optional
 |     e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon
 ```
 
+
+### Parameters for `scrape_comments()`
+
+```plaintext
+├── post_ids (list):
+|    ids of the posts to scrape comments from
+|     e.g. 7252381444906364929 from https://www.linkedin.com/posts/williamhgates_technology-transformtheeveryday-activity-7252381444906364929-Bkls
+```
+
+
 ### LinkedIn notes
 
     - only 1000 max results per search
diff --git a/pyproject.toml b/pyproject.toml
index 3243392..242b28b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.2.16"
+version = "0.2.17"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson <cullen@bunsly.com>"]
 readme = "README.md"
diff --git a/staffspy/__init__.py b/staffspy/__init__.py
index 4dd20ce..94fd40b 100644
--- a/staffspy/__init__.py
+++ b/staffspy/__init__.py
@@ -1,5 +1,6 @@
 import pandas as pd
 
+from staffspy.linkedin.comments import CommentFetcher
 from staffspy.linkedin.linkedin import LinkedInScraper
 from staffspy.utils.models import Staff
 from staffspy.solvers.capsolver import CapSolver
@@ -124,3 +125,17 @@ def scrape_users(self, user_ids: list[str]) -> pd.DataFrame:
         users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
         logger.info(f"Scraped {len(users_df)} users")
         return users_df
+
+    def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame:
+        """Return a DataFrame with one row per comment found on the given LinkedIn post IDs."""
+        comment_fetcher = CommentFetcher(self.session)
+        all_comments = []
+        for post_id in post_ids:
+            # fetch_comments pages through a single post; merge the results of every post
+            comments = comment_fetcher.fetch_comments(post_id)
+            all_comments.extend(comments)
+
+        comment_dict = [comment.to_dict() for comment in all_comments]
+        comment_df = pd.DataFrame(comment_dict)
+
+        return comment_df
diff --git a/staffspy/linkedin/comments.py b/staffspy/linkedin/comments.py
new file mode 100644
index 0000000..a792799
--- /dev/null
+++ b/staffspy/linkedin/comments.py
@@ -0,0 +1,74 @@
+import json
+import re
+from datetime import datetime as dt
+
+from staffspy.utils.exceptions import TooManyRequests
+from staffspy.utils.models import Comment
+
+from staffspy.utils.utils import logger
+
+
+class CommentFetcher:
+
+    def __init__(self, session):
+        self.session = session
+        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
+        self.post_id = None
+        self.num_commments = 100
+
+    def fetch_comments(self, post_id: str):
+        all_comments = []
+        self.post_id = post_id
+
+        for i in range(0, 100_000, self.num_commments):
+            logger.info(f"Fetching comments for post {post_id}, start {i}")
+
+            ep = self.endpoint.format(post_id=post_id, start=i)
+            res = self.session.get(ep)
+            logger.debug(f"comments info, status code - {res.status_code}")
+            # A 429 must be raised, not returned: the caller extend()s whatever comes back.
+            if res.status_code == 429:
+                raise TooManyRequests("429 Too Many Requests")
+            if not res.ok:
+                logger.debug(res.text[:200])
+                break  # stop paginating but keep the pages already collected
+            try:
+                comments_json = res.json()
+            except json.decoder.JSONDecodeError:
+                logger.debug(res.text[:200])
+                break  # unreadable page: return what we have instead of False
+
+            comments, num_results = self.parse_comments(comments_json)
+            all_comments.extend(comments)
+            if not num_results:
+                break
+
+        return all_comments
+
+    def parse_comments(self, comments_json: dict):
+        """Build Comment models from one comments API response; return them with the element count."""
+        comments = []
+        for element in (
+            results := comments_json.get("data", {})
+            .get("socialDashCommentsBySocialDetail", {})
+            .get("elements", [])
+        ):
+            internal_profile_id = (commenter := element["commenter"])[
+                "commenterProfileId"
+            ]
+            name = commenter["title"]["text"]
+            linkedin_id_match = re.search("/in/(.+)", commenter["navigationUrl"])
+            linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None
+
+            commentary = element.get("commentary", {}).get("text", "")
+            comment = Comment(
+                post_id=self.post_id,
+                internal_profile_id=internal_profile_id,
+                public_profile_id=linkedin_id,
+                name=name,
+                text=commentary,
+                created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
+            )
+            comments.append(comment)
+        # the element count lets fetch_comments stop paginating once a page comes back empty
+        return comments, len(results)
diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py
index 8482ea8..25061df 100644
--- a/staffspy/utils/models.py
+++ b/staffspy/utils/models.py
@@ -1,10 +1,30 @@
 from datetime import datetime, date
 
 from pydantic import BaseModel
+from datetime import datetime as dt
 
 from staffspy.utils.utils import extract_emails_from_text
 
 
+class Comment(BaseModel):
+    post_id: str
+    internal_profile_id: str | None = None
+    public_profile_id: str | None = None
+    name: str | None = None
+    text: str | None = None
+    created_at: dt | None = None
+
+    def to_dict(self):
+        return {
+            "post_id": self.post_id,
+            "internal_profile_id": self.internal_profile_id,
+            "public_profile_id": self.public_profile_id,
+            "name": self.name,
+            "text": self.text,
+            "created_at": self.created_at,
+        }
+
+
 class School(BaseModel):
     start_date: date | None = None
     end_date: date | None = None

From 5bab2407b6ea31f5001e1c280043ef16c2b230bc Mon Sep 17 00:00:00 2001
From: Cullen Watson <cullen@cullenwatson.com>
Date: Sun, 20 Oct 2024 02:15:13 -0500
Subject: [PATCH 2/3] fix: pagination

---
 staffspy/__init__.py          | 2 +-
 staffspy/linkedin/comments.py | 8 ++++++--
 staffspy/utils/models.py      | 4 ++++
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/staffspy/__init__.py b/staffspy/__init__.py
index 94fd40b..a3d4620 100644
--- a/staffspy/__init__.py
+++ b/staffspy/__init__.py
@@ -138,4 +138,4 @@ def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame:
         comment_dict = [comment.to_dict() for comment in all_comments]
         comment_df = pd.DataFrame(comment_dict)
 
-        return comment_df
+        return comment_df if comment_df.empty else comment_df.sort_values(by="created_at", ascending=False)
diff --git a/staffspy/linkedin/comments.py b/staffspy/linkedin/comments.py
index a792799..c09bb0e 100644
--- a/staffspy/linkedin/comments.py
+++ b/staffspy/linkedin/comments.py
@@ -12,7 +12,7 @@ class CommentFetcher:
 
     def __init__(self, session):
         self.session = session
-        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
+        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:RELEVANCE,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
         self.post_id = None
         self.num_commments = 100
 
@@ -20,7 +20,7 @@ def fetch_comments(self, post_id: str):
         all_comments = []
         self.post_id = post_id
 
-        for i in range(0, 100_000, self.num_commments):
+        for i in range(0, 200_000, self.num_commments):
             logger.info(f"Fetching comments for post {post_id}, start {i}")
 
             ep = self.endpoint.format(post_id=post_id, start=i)
@@ -61,12 +61,16 @@ def parse_comments(self, comments_json: dict):
             linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None
 
             commentary = element.get("commentary", {}).get("text", "")
+            comment_id = element["urn"].split(",")[-1].rstrip(")")
+            num_likes = element["socialDetail"]["totalSocialActivityCounts"]["numLikes"]
             comment = Comment(
                 post_id=self.post_id,
+                comment_id=comment_id,
                 internal_profile_id=internal_profile_id,
                 public_profile_id=linkedin_id,
                 name=name,
                 text=commentary,
+                num_likes=num_likes,
                 created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
             )
             comments.append(comment)
diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py
index 25061df..a682d3e 100644
--- a/staffspy/utils/models.py
+++ b/staffspy/utils/models.py
@@ -8,19 +8,23 @@
 
 class Comment(BaseModel):
     post_id: str
+    comment_id: str | None = None
     internal_profile_id: str | None = None
     public_profile_id: str | None = None
     name: str | None = None
     text: str | None = None
+    num_likes: int | None = None
     created_at: dt | None = None
 
     def to_dict(self):
         return {
             "post_id": self.post_id,
+            "comment_id": self.comment_id,
             "internal_profile_id": self.internal_profile_id,
             "public_profile_id": self.public_profile_id,
             "name": self.name,
             "text": self.text,
+            "num_likes": self.num_likes,
             "created_at": self.created_at,
         }
 

From 9fac3f23ce185caf5b2c109e7ae4eeee295e9ecd Mon Sep 17 00:00:00 2001
From: Cullen Watson <cullen@cullenwatson.com>
Date: Sun, 20 Oct 2024 02:15:43 -0500
Subject: [PATCH 3/3] fix: pagination

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 242b28b..f6afd13 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.2.17"
+version = "0.2.18"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson <cullen@bunsly.com>"]
 readme = "README.md"