feat: fetch comments (#48)

cullenwatson · Oct 20, 2024 · d89bed5
1 parent 5802188
commit d89bed5
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 4 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "staffspy"
-version = "0.2.17"
+version = "0.2.18"
 description = "Staff scraper library for LinkedIn"
 authors = ["Cullen Watson <[email protected]>"]
 readme = "README.md"

diff --git a/staffspy/__init__.py b/staffspy/__init__.py
@@ -138,4 +138,4 @@ def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame:
         comment_dict = [comment.to_dict() for comment in all_comments]
         comment_df = pd.DataFrame(comment_dict)
 
-        return comment_df
+        return comment_df.sort_values(by="created_at", ascending=False)
diff --git a/staffspy/linkedin/comments.py b/staffspy/linkedin/comments.py
@@ -12,15 +12,15 @@ class CommentFetcher:
 
     def __init__(self, session):
         self.session = session
-        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
+        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:RELEVANCE,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
         self.post_id = None
         self.num_commments = 100
 
     def fetch_comments(self, post_id: str):
         all_comments = []
         self.post_id = post_id
 
-        for i in range(0, 100_000, self.num_commments):
+        for i in range(0, 200_000, self.num_commments):
             logger.info(f"Fetching comments for post {post_id}, start {i}")
 
             ep = self.endpoint.format(post_id=post_id, start=i)
@@ -61,12 +61,16 @@ def parse_comments(self, comments_json: dict):
             linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None
 
             commentary = element.get("commentary", {}).get("text", "")
+            comment_id = element["urn"].split(",")[-1].rstrip(")")
+            num_likes = element["socialDetail"]["totalSocialActivityCounts"]["numLikes"]
             comment = Comment(
                 post_id=self.post_id,
+                comment_id=comment_id,
                 internal_profile_id=internal_profile_id,
                 public_profile_id=linkedin_id,
                 name=name,
                 text=commentary,
+                num_likes=num_likes,
                 created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
             )
             comments.append(comment)

diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py
@@ -8,19 +8,23 @@
 
 class Comment(BaseModel):
     post_id: str
+    comment_id: str | None = None
     internal_profile_id: str | None = None
     public_profile_id: str | None = None
     name: str | None = None
     text: str | None = None
+    num_likes: int | None = None
     created_at: dt | None = None
 
     def to_dict(self):
         return {
             "post_id": self.post_id,
+            "comment_id": self.comment_id,
             "internal_profile_id": self.internal_profile_id,
             "public_profile_id": self.public_profile_id,
             "name": self.name,
             "text": self.text,
+            "num_likes": self.num_likes,
             "created_at": self.created_at,
         }