From 543d353b0c5bc7667409927e00027b70630c1ff6 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 20 Oct 2024 01:53:22 -0500 Subject: [PATCH 1/3] enh: fetch comments on a post --- README.md | 16 ++++++++ pyproject.toml | 2 +- staffspy/__init__.py | 15 +++++++ staffspy/linkedin/comments.py | 74 +++++++++++++++++++++++++++++++++++ staffspy/utils/models.py | 20 ++++++++++ 5 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 staffspy/linkedin/comments.py diff --git a/README.md b/README.md index 36ed4e3..ef757a2 100644 --- a/README.md +++ b/README.md @@ -54,8 +54,14 @@ staff = account.scrape_staff( users = account.scrape_users( user_ids=['williamhgates', 'rbranson', 'jeffweiner08'] ) + +# fetch all comments on two of Bill Gates' posts +comments = account.scrape_comments( + ['7252421958540091394','7253083989547048961'] +) staff.to_csv("staff.csv", index=False) users.to_csv("users.csv", index=False) +comments.to_csv("comments.csv", index=False) ``` #### Browser login @@ -139,6 +145,16 @@ Optional | e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon ``` + +### Parameters for `scrape_comments()` + +```plaintext +├── post_ids (list): +| post ids to scrape from +| e.g. 7252381444906364929 from https://www.linkedin.com/posts/williamhgates_technology-transformtheeveryday-activity-7252381444906364929-Bkls +``` + + ### LinkedIn notes - only 1000 max results per search diff --git a/pyproject.toml b/pyproject.toml index 3243392..242b28b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "staffspy" -version = "0.2.16" +version = "0.2.17" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md" diff --git a/staffspy/__init__.py b/staffspy/__init__.py index 4dd20ce..94fd40b 100644 --- a/staffspy/__init__.py +++ b/staffspy/__init__.py @@ -1,5 +1,6 @@ import pandas as pd +from staffspy.linkedin.comments import CommentFetcher from staffspy.linkedin.linkedin import LinkedInScraper from staffspy.utils.models import Staff from staffspy.solvers.capsolver import CapSolver @@ -124,3 +125,17 @@ def scrape_users(self, user_ids: list[str]) -> pd.DataFrame: users_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) logger.info(f"Scraped {len(users_df)} users") return users_df + + def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame: + """Scrape comments from Linkedin by post IDs""" + comment_fetcher = CommentFetcher(self.session) + all_comments = [] + for i, post_id in enumerate(post_ids, start=1): + + comments = comment_fetcher.fetch_comments(post_id) + all_comments.extend(comments) + + comment_dict = [comment.to_dict() for comment in all_comments] + comment_df = pd.DataFrame(comment_dict) + + return comment_df diff --git a/staffspy/linkedin/comments.py b/staffspy/linkedin/comments.py new file mode 100644 index 0000000..a792799 --- /dev/null +++ b/staffspy/linkedin/comments.py @@ -0,0 +1,74 @@ +import json +import re +from datetime import datetime as dt + +from staffspy.utils.exceptions import TooManyRequests +from staffspy.utils.models import Comment + +from staffspy.utils.utils import logger + + +class CommentFetcher: + + def __init__(self, session): + self.session = session + self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})" + self.post_id = None + self.num_commments = 100 + + def fetch_comments(self, post_id: str): + all_comments = [] + self.post_id = post_id + + for i in range(0, 100_000, self.num_commments): + logger.info(f"Fetching comments for post {post_id}, start {i}") + + ep = self.endpoint.format(post_id=post_id, start=i) + res = self.session.get(ep) + logger.debug(f"comments info, status code - {res.status_code}") + + if res.status_code == 429: + return TooManyRequests("429 Too Many Requests") + if not res.ok: + logger.debug(res.text[:200]) + return False + try: + comments_json = res.json() + except json.decoder.JSONDecodeError: + logger.debug(res.text[:200]) + return False + + comments, num_results = self.parse_comments(comments_json) + all_comments.extend(comments) + if not num_results: + break + + return all_comments + + def parse_comments(self, comments_json: dict): + """Parse the comment data from the employee profile.""" + comments = [] + for element in ( + results := comments_json.get("data", {}) + .get("socialDashCommentsBySocialDetail", {}) + .get("elements", []) + ): + internal_profile_id = (commenter := element["commenter"])[ + "commenterProfileId" + ] + name = commenter["title"]["text"] + linkedin_id_match = re.search("/in/(.+)", commenter["navigationUrl"]) + linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None + + commentary = element.get("commentary", {}).get("text", "") + comment = Comment( + post_id=self.post_id, + internal_profile_id=internal_profile_id, + public_profile_id=linkedin_id, + name=name, + text=commentary, + created_at=dt.utcfromtimestamp(element["createdAt"] / 1000), + ) + comments.append(comment) + + return comments, len(results) diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py index 8482ea8..25061df 100644 --- a/staffspy/utils/models.py +++ b/staffspy/utils/models.py @@ -1,10 +1,30 @@ from datetime import datetime, date from pydantic import BaseModel +from datetime import datetime as dt from staffspy.utils.utils import extract_emails_from_text +class Comment(BaseModel): + post_id: str + internal_profile_id: str | None = None + public_profile_id: str | None = None + name: str | None = None + text: str | None = None + created_at: dt | None = None + + def to_dict(self): + return { + "post_id": self.post_id, + "internal_profile_id": self.internal_profile_id, + "public_profile_id": self.public_profile_id, + "name": self.name, + "text": self.text, + "created_at": self.created_at, + } + + class School(BaseModel): start_date: date | None = None end_date: date | None = None From 5bab2407b6ea31f5001e1c280043ef16c2b230bc Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 20 Oct 2024 02:15:13 -0500 Subject: [PATCH 2/3] fix: pagination --- staffspy/__init__.py | 2 +- staffspy/linkedin/comments.py | 8 ++++++-- staffspy/utils/models.py | 4 ++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/staffspy/__init__.py b/staffspy/__init__.py index 94fd40b..a3d4620 100644 --- a/staffspy/__init__.py +++ b/staffspy/__init__.py @@ -138,4 +138,4 @@ def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame: comment_dict = [comment.to_dict() for comment in all_comments] comment_df = pd.DataFrame(comment_dict) - return comment_df + return comment_df.sort_values(by="created_at", ascending=False) diff --git a/staffspy/linkedin/comments.py b/staffspy/linkedin/comments.py index a792799..c09bb0e 100644 --- a/staffspy/linkedin/comments.py +++ b/staffspy/linkedin/comments.py @@ -12,7 +12,7 @@ class CommentFetcher: def __init__(self, session): self.session = session - self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})" + self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:RELEVANCE,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})" self.post_id = None self.num_commments = 100 @@ -20,7 +20,7 @@ def fetch_comments(self, post_id: str): all_comments = [] self.post_id = post_id - for i in range(0, 100_000, self.num_commments): + for i in range(0, 200_000, self.num_commments): logger.info(f"Fetching comments for post {post_id}, start {i}") ep = self.endpoint.format(post_id=post_id, start=i) @@ -61,12 +61,16 @@ def parse_comments(self, comments_json: dict): linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None commentary = element.get("commentary", {}).get("text", "") + comment_id = element["urn"].split(",")[-1].rstrip(")") + num_likes = element["socialDetail"]["totalSocialActivityCounts"]["numLikes"] comment = Comment( post_id=self.post_id, + comment_id=comment_id, internal_profile_id=internal_profile_id, public_profile_id=linkedin_id, name=name, text=commentary, + num_likes=num_likes, created_at=dt.utcfromtimestamp(element["createdAt"] / 1000), ) comments.append(comment) diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py index 25061df..a682d3e 100644 --- a/staffspy/utils/models.py +++ b/staffspy/utils/models.py @@ -8,19 +8,23 @@ class Comment(BaseModel): post_id: str + comment_id: str | None = None internal_profile_id: str | None = None public_profile_id: str | None = None name: str | None = None text: str | None = None + num_likes: int | None = None created_at: dt | None = None def to_dict(self): return { "post_id": self.post_id, + "comment_id": self.comment_id, "internal_profile_id": self.internal_profile_id, "public_profile_id": self.public_profile_id, "name": self.name, "text": self.text, + "num_likes": self.num_likes, "created_at": self.created_at, } From 9fac3f23ce185caf5b2c109e7ae4eeee295e9ecd Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Sun, 20 Oct 2024 02:15:43 -0500 Subject: [PATCH 3/3] fix: pagination --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 242b28b..f6afd13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "staffspy" -version = "0.2.17" +version = "0.2.18" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md"