# Patch summary (staffspy 0.2.17 -> 0.2.18)
#
#   * pyproject.toml: bumps the package version from 0.2.17 to 0.2.18.
#   * staffspy/__init__.py: scrape_comments() now returns the comment DataFrame
#     sorted by "created_at" with ascending=False (latest timestamps first)
#     instead of returning it unsorted.
#   * staffspy/linkedin/comments.py:
#       - the comments endpoint now requests sortOrder:RELEVANCE instead of
#         sortOrder:REVERSE_CHRONOLOGICAL;
#       - fetch_comments() raises the upper bound of its paging range from
#         100_000 to 200_000 (the step is still self.num_commments);
#       - parse_comments() additionally extracts comment_id (the last
#         comma-separated piece of element["urn"], with any trailing ")"
#         stripped) and num_likes
#         (element["socialDetail"]["totalSocialActivityCounts"]["numLikes"])
#         and passes both to Comment(...).
#   * staffspy/utils/models.py: Comment gains the optional fields comment_id
#     and num_likes (both default to None), and to_dict() emits them.
#
# NOTE(review): sort_values(by="created_at") raises KeyError when the frame has
#   no such column; presumably that is the case when no comments were collected
#   (a DataFrame built from an empty list has no columns) -- confirm and guard.
# NOTE(review): the new element["urn"] and element["socialDetail"][...] lookups
#   use direct indexing, while the neighbouring commentary lookup uses .get()
#   with defaults; an element missing those keys would raise KeyError --
#   confirm whether the payload always carries them.
# NOTE(review): this patch text appears whitespace-collapsed (headers and hunk
#   lines are not on separate lines), so it presumably cannot be applied as-is.
diff --git a/pyproject.toml b/pyproject.toml index 242b28b..f6afd13 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "staffspy" -version = "0.2.17" +version = "0.2.18" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md" diff --git a/staffspy/__init__.py b/staffspy/__init__.py index 94fd40b..a3d4620 100644 --- a/staffspy/__init__.py +++ b/staffspy/__init__.py @@ -138,4 +138,4 @@ def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame: comment_dict = [comment.to_dict() for comment in all_comments] comment_df = pd.DataFrame(comment_dict) - return comment_df + return comment_df.sort_values(by="created_at", ascending=False) diff --git a/staffspy/linkedin/comments.py b/staffspy/linkedin/comments.py index a792799..c09bb0e 100644 --- a/staffspy/linkedin/comments.py +++ b/staffspy/linkedin/comments.py @@ -12,7 +12,7 @@ class CommentFetcher: def __init__(self, session): self.session = session - self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})" + self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:RELEVANCE,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})" self.post_id = None self.num_commments = 100 @@ -20,7 +20,7 @@ def fetch_comments(self, post_id: str): all_comments = [] self.post_id = post_id - for i in range(0, 100_000, self.num_commments): + for i 
in range(0, 200_000, self.num_commments): logger.info(f"Fetching comments for post {post_id}, start {i}") ep = self.endpoint.format(post_id=post_id, start=i) @@ -61,12 +61,16 @@ def parse_comments(self, comments_json: dict): linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None commentary = element.get("commentary", {}).get("text", "") + comment_id = element["urn"].split(",")[-1].rstrip(")") + num_likes = element["socialDetail"]["totalSocialActivityCounts"]["numLikes"] comment = Comment( post_id=self.post_id, + comment_id=comment_id, internal_profile_id=internal_profile_id, public_profile_id=linkedin_id, name=name, text=commentary, + num_likes=num_likes, created_at=dt.utcfromtimestamp(element["createdAt"] / 1000), ) comments.append(comment) diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py index 25061df..a682d3e 100644 --- a/staffspy/utils/models.py +++ b/staffspy/utils/models.py @@ -8,19 +8,23 @@ class Comment(BaseModel): post_id: str + comment_id: str | None = None internal_profile_id: str | None = None public_profile_id: str | None = None name: str | None = None text: str | None = None + num_likes: int | None = None created_at: dt | None = None def to_dict(self): return { "post_id": self.post_id, + "comment_id": self.comment_id, "internal_profile_id": self.internal_profile_id, "public_profile_id": self.public_profile_id, "name": self.name, "text": self.text, + "num_likes": self.num_likes, "created_at": self.created_at, }