Skip to content

Commit

Permalink
enh: fetch comments on a post (#47)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson authored Oct 20, 2024
1 parent ebc88fa commit 5802188
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 1 deletion.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,14 @@ staff = account.scrape_staff(
users = account.scrape_users(
user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
)

# fetch all comments on two of Bill Gates' posts
comments = account.scrape_comments(
['7252421958540091394','7253083989547048961']
)
staff.to_csv("staff.csv", index=False)
users.to_csv("users.csv", index=False)
comments.to_csv("comments.csv", index=False)
```

#### Browser login
Expand Down Expand Up @@ -139,6 +145,16 @@ Optional
| e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon
```


### Parameters for `scrape_comments()`

```plaintext
├── post_ids (list):
| post ids of the posts whose comments will be scraped
| e.g. 7252381444906364929 from https://www.linkedin.com/posts/williamhgates_technology-transformtheeveryday-activity-7252381444906364929-Bkls
```


### LinkedIn notes

- only 1000 max results per search
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.2.16"
version = "0.2.17"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"
Expand Down
15 changes: 15 additions & 0 deletions staffspy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd

from staffspy.linkedin.comments import CommentFetcher
from staffspy.linkedin.linkedin import LinkedInScraper
from staffspy.utils.models import Staff
from staffspy.solvers.capsolver import CapSolver
Expand Down Expand Up @@ -124,3 +125,17 @@ def scrape_users(self, user_ids: list[str]) -> pd.DataFrame:
users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(users_df)} users")
return users_df

def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame:
    """Scrape comments from LinkedIn posts by their post IDs.

    Args:
        post_ids: activity IDs of the posts to fetch comments for, e.g.
            '7252421958540091394'.

    Returns:
        A DataFrame with one row per comment across all requested posts
        (empty if no comments were found).
    """
    comment_fetcher = CommentFetcher(self.session)
    all_comments = []
    # fix: original used enumerate() but never used the counter variable
    for post_id in post_ids:
        all_comments.extend(comment_fetcher.fetch_comments(post_id))

    comment_dicts = [comment.to_dict() for comment in all_comments]
    return pd.DataFrame(comment_dicts)
74 changes: 74 additions & 0 deletions staffspy/linkedin/comments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
import re
from datetime import datetime as dt

from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Comment

from staffspy.utils.utils import logger


class CommentFetcher:
    """Fetch and parse all comments on a LinkedIn post via the voyager GraphQL API."""

    def __init__(self, session):
        # Authenticated requests session supplied by the caller.
        self.session = session
        # GraphQL endpoint template; paginated via `start`, newest comments first.
        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
        # Post currently being fetched; set by fetch_comments().
        self.post_id = None
        # Page size per request (attribute name keeps the original typo so
        # any external references to it keep working).
        self.num_commments = 100

    def fetch_comments(self, post_id: str):
        """Return a list of Comment models for every comment on `post_id`.

        Paginates until a page comes back with no elements. Returns the
        comments gathered so far on a non-OK or unparsable response;
        raises TooManyRequests on HTTP 429.
        """
        all_comments = []
        self.post_id = post_id

        # 100_000 is a hard safety cap on pagination, not an expected count.
        for start in range(0, 100_000, self.num_commments):
            logger.info(f"Fetching comments for post {post_id}, start {start}")

            ep = self.endpoint.format(post_id=post_id, start=start)
            res = self.session.get(ep)
            logger.debug(f"comments info, status code - {res.status_code}")

            if res.status_code == 429:
                # fix: original *returned* the exception instance instead of
                # raising it, so callers never saw the error
                raise TooManyRequests("429 Too Many Requests")
            if not res.ok:
                logger.debug(res.text[:200])
                # fix: original returned False here, which made the caller's
                # list.extend() crash with a TypeError; a list is still falsy
                # when empty, so truthiness checks keep working
                return all_comments
            try:
                comments_json = res.json()
            except json.decoder.JSONDecodeError:
                logger.debug(res.text[:200])
                return all_comments

            comments, num_results = self.parse_comments(comments_json)
            all_comments.extend(comments)
            if not num_results:
                # empty page -> no more comments to fetch
                break

        return all_comments

    def parse_comments(self, comments_json: dict):
        """Parse one page of the comments GraphQL response.

        Returns (comments, num_results) where num_results is the raw element
        count for the page; 0 tells the caller this was the last page.
        """
        elements = (
            comments_json.get("data", {})
            .get("socialDashCommentsBySocialDetail", {})
            .get("elements", [])
        )
        comments = []
        for element in elements:
            commenter = element["commenter"]
            internal_profile_id = commenter["commenterProfileId"]
            name = commenter["title"]["text"]
            # public id is the slug after /in/ in the profile URL, if present
            linkedin_id_match = re.search("/in/(.+)", commenter["navigationUrl"])
            linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None

            commentary = element.get("commentary", {}).get("text", "")
            comments.append(
                Comment(
                    post_id=self.post_id,
                    internal_profile_id=internal_profile_id,
                    public_profile_id=linkedin_id,
                    name=name,
                    text=commentary,
                    # NOTE(review): createdAt is epoch milliseconds;
                    # dt.utcfromtimestamp is deprecated since Python 3.12 but
                    # kept here to preserve the naive-UTC datetime the model
                    # currently stores — migrate to fromtimestamp(..., tz=UTC)
                    # once downstream consumers accept aware datetimes
                    created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
                )
            )

        return comments, len(elements)
20 changes: 20 additions & 0 deletions staffspy/utils/models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
from datetime import datetime, date

from pydantic import BaseModel
from datetime import datetime as dt

from staffspy.utils.utils import extract_emails_from_text


class Comment(BaseModel):
    """A single comment scraped from a LinkedIn post."""

    post_id: str
    internal_profile_id: str | None = None
    public_profile_id: str | None = None
    name: str | None = None
    text: str | None = None
    created_at: dt | None = None

    def to_dict(self):
        """Return the comment as a plain dict with one key per model field."""
        field_names = (
            "post_id",
            "internal_profile_id",
            "public_profile_id",
            "name",
            "text",
            "created_at",
        )
        return {field: getattr(self, field) for field in field_names}


class School(BaseModel):
start_date: date | None = None
end_date: date | None = None
Expand Down

0 comments on commit 5802188

Please sign in to comment.