Skip to content

Commit

Permalink
enh: fetch comments on a post (#47)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson authored Oct 20, 2024
1 parent ebc88fa commit 5802188
Show file tree
Hide file tree
Showing 5 changed files with 126 additions and 1 deletion.
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,14 @@ staff = account.scrape_staff(
users = account.scrape_users(
user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
)

# fetch all comments on two of Bill Gates' posts
comments = account.scrape_comments(
['7252421958540091394','7253083989547048961']
)
staff.to_csv("staff.csv", index=False)
users.to_csv("users.csv", index=False)
comments.to_csv("comments.csv", index=False)
```

#### Browser login
Expand Down Expand Up @@ -139,6 +145,16 @@ Optional
| e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon
```


### Parameters for `scrape_comments()`

```plaintext
├── post_ids (list):
| post ids of the posts whose comments will be scraped
| e.g. 7252381444906364929 from https://www.linkedin.com/posts/williamhgates_technology-transformtheeveryday-activity-7252381444906364929-Bkls
```


### LinkedIn notes

- only 1000 max results per search
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.2.16"
version = "0.2.17"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"
Expand Down
15 changes: 15 additions & 0 deletions staffspy/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd

from staffspy.linkedin.comments import CommentFetcher
from staffspy.linkedin.linkedin import LinkedInScraper
from staffspy.utils.models import Staff
from staffspy.solvers.capsolver import CapSolver
Expand Down Expand Up @@ -124,3 +125,17 @@ def scrape_users(self, user_ids: list[str]) -> pd.DataFrame:
users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
logger.info(f"Scraped {len(users_df)} users")
return users_df

def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame:
    """Scrape comments from LinkedIn posts by their post IDs.

    Args:
        post_ids: activity IDs of the posts to fetch comments for, e.g.
            '7252421958540091394'.

    Returns:
        A DataFrame with one row per comment across all requested posts
        (empty if no comments were found).
    """
    comment_fetcher = CommentFetcher(self.session)
    all_comments = []
    # fix: original used enumerate() but never used the counter variable
    for post_id in post_ids:
        all_comments.extend(comment_fetcher.fetch_comments(post_id))

    comment_dicts = [comment.to_dict() for comment in all_comments]
    return pd.DataFrame(comment_dicts)
74 changes: 74 additions & 0 deletions staffspy/linkedin/comments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
import re
from datetime import datetime as dt

from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Comment

from staffspy.utils.utils import logger


class CommentFetcher:
    """Fetch and parse all comments on a LinkedIn post via the voyager GraphQL API."""

    def __init__(self, session):
        # Authenticated requests session supplied by the caller.
        self.session = session
        # GraphQL endpoint template; paginated via `start`, newest comments first.
        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
        # Post currently being fetched; set by fetch_comments().
        self.post_id = None
        # Page size per request (attribute name keeps the original typo so
        # any external references to it keep working).
        self.num_commments = 100

    def fetch_comments(self, post_id: str):
        """Return a list of Comment models for every comment on `post_id`.

        Paginates until a page comes back with no elements. Returns the
        comments gathered so far on a non-OK or unparsable response;
        raises TooManyRequests on HTTP 429.
        """
        all_comments = []
        self.post_id = post_id

        # 100_000 is a hard safety cap on pagination, not an expected count.
        for start in range(0, 100_000, self.num_commments):
            logger.info(f"Fetching comments for post {post_id}, start {start}")

            ep = self.endpoint.format(post_id=post_id, start=start)
            res = self.session.get(ep)
            logger.debug(f"comments info, status code - {res.status_code}")

            if res.status_code == 429:
                # fix: original *returned* the exception instance instead of
                # raising it, so callers never saw the error
                raise TooManyRequests("429 Too Many Requests")
            if not res.ok:
                logger.debug(res.text[:200])
                # fix: original returned False here, which made the caller's
                # list.extend() crash with a TypeError; a list is still falsy
                # when empty, so truthiness checks keep working
                return all_comments
            try:
                comments_json = res.json()
            except json.decoder.JSONDecodeError:
                logger.debug(res.text[:200])
                return all_comments

            comments, num_results = self.parse_comments(comments_json)
            all_comments.extend(comments)
            if not num_results:
                # empty page -> no more comments to fetch
                break

        return all_comments

    def parse_comments(self, comments_json: dict):
        """Parse one page of the comments GraphQL response.

        Returns (comments, num_results) where num_results is the raw element
        count for the page; 0 tells the caller this was the last page.
        """
        elements = (
            comments_json.get("data", {})
            .get("socialDashCommentsBySocialDetail", {})
            .get("elements", [])
        )
        comments = []
        for element in elements:
            commenter = element["commenter"]
            internal_profile_id = commenter["commenterProfileId"]
            name = commenter["title"]["text"]
            # public id is the slug after /in/ in the profile URL, if present
            linkedin_id_match = re.search("/in/(.+)", commenter["navigationUrl"])
            linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None

            commentary = element.get("commentary", {}).get("text", "")
            comments.append(
                Comment(
                    post_id=self.post_id,
                    internal_profile_id=internal_profile_id,
                    public_profile_id=linkedin_id,
                    name=name,
                    text=commentary,
                    # NOTE(review): createdAt is epoch milliseconds;
                    # dt.utcfromtimestamp is deprecated since Python 3.12 but
                    # kept here to preserve the naive-UTC datetime the model
                    # currently stores — migrate to fromtimestamp(..., tz=UTC)
                    # once downstream consumers accept aware datetimes
                    created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
                )
            )

        return comments, len(elements)
20 changes: 20 additions & 0 deletions staffspy/utils/models.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
from datetime import datetime, date

from pydantic import BaseModel
from datetime import datetime as dt

from staffspy.utils.utils import extract_emails_from_text


class Comment(BaseModel):
    """A single comment scraped from a LinkedIn post."""

    post_id: str
    internal_profile_id: str | None = None
    public_profile_id: str | None = None
    name: str | None = None
    text: str | None = None
    created_at: dt | None = None

    def to_dict(self):
        """Return the comment as a plain dict with one key per model field."""
        field_names = (
            "post_id",
            "internal_profile_id",
            "public_profile_id",
            "name",
            "text",
            "created_at",
        )
        return {field: getattr(self, field) for field in field_names}


class School(BaseModel):
start_date: date | None = None
end_date: date | None = None
Expand Down

0 comments on commit 5802188

Please sign in to comment.