enh: fetch comments on a post #47

Merged · 1 commit · Oct 20, 2024
16 changes: 16 additions & 0 deletions README.md
@@ -54,8 +54,14 @@ staff = account.scrape_staff(
users = account.scrape_users(
    user_ids=['williamhgates', 'rbranson', 'jeffweiner08']
)

# fetch all comments on two of Bill Gates' posts
comments = account.scrape_comments(
    ['7252421958540091394', '7253083989547048961']
)
staff.to_csv("staff.csv", index=False)
users.to_csv("users.csv", index=False)
comments.to_csv("comments.csv", index=False)
```

#### Browser login
@@ -139,6 +145,16 @@ Optional
| e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon
```


### Parameters for `scrape_comments()`

```plaintext
├── post_ids (list):
| post ids to scrape comments from
| e.g. 7252381444906364929 from https://www.linkedin.com/posts/williamhgates_technology-transformtheeveryday-activity-7252381444906364929-Bkls
```
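
The post ID is the long number after `activity-` in the post URL. A minimal sketch for pulling it out (not part of the library; the regex is an assumption based on the URL format shown above):

```python
import re

def extract_post_id(post_url: str) -> str | None:
    """Return the numeric activity ID embedded in a LinkedIn post URL."""
    # assumption: post URLs contain ...-activity-<digits>-<suffix>
    match = re.search(r"activity-(\d+)", post_url)
    return match.group(1) if match else None

url = (
    "https://www.linkedin.com/posts/williamhgates_technology-"
    "transformtheeveryday-activity-7252381444906364929-Bkls"
)
print(extract_post_id(url))  # 7252381444906364929
```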


### LinkedIn notes

- only 1000 max results per search
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.2.16"
version = "0.2.17"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"
15 changes: 15 additions & 0 deletions staffspy/__init__.py
@@ -1,5 +1,6 @@
import pandas as pd

from staffspy.linkedin.comments import CommentFetcher
from staffspy.linkedin.linkedin import LinkedInScraper
from staffspy.utils.models import Staff
from staffspy.solvers.capsolver import CapSolver
@@ -124,3 +125,17 @@ def scrape_users(self, user_ids: list[str]) -> pd.DataFrame:
        users_df = pd.concat([non_linkedin_member_df, linkedin_member_df])
        logger.info(f"Scraped {len(users_df)} users")
        return users_df

    def scrape_comments(self, post_ids: list[str]) -> pd.DataFrame:
        """Scrape comments from LinkedIn posts by their post IDs."""
        comment_fetcher = CommentFetcher(self.session)
        all_comments = []
        for post_id in post_ids:
            comments = comment_fetcher.fetch_comments(post_id)
            all_comments.extend(comments)

        comment_dict = [comment.to_dict() for comment in all_comments]
        comment_df = pd.DataFrame(comment_dict)
        return comment_df
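
A usage sketch for the new method; `account` is assumed to be an already logged-in StaffSpy account object, as constructed in the README example, and the post IDs are the same illustrative ones used there:

```python
# 'account' is assumed to be a logged-in StaffSpy account object (see README).
comments = account.scrape_comments(
    ["7252421958540091394", "7253083989547048961"]
)

# One row per comment; columns mirror Comment.to_dict():
# post_id, internal_profile_id, public_profile_id, name, text, created_at
print(comments.groupby("post_id").size())  # comment count per post
comments.to_csv("comments.csv", index=False)
```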
74 changes: 74 additions & 0 deletions staffspy/linkedin/comments.py
@@ -0,0 +1,74 @@
import json
import re
from datetime import datetime as dt

from staffspy.utils.exceptions import TooManyRequests
from staffspy.utils.models import Comment

from staffspy.utils.utils import logger


class CommentFetcher:

    def __init__(self, session):
        self.session = session
        self.endpoint = "https://www.linkedin.com/voyager/api/graphql?queryId=voyagerSocialDashComments.200c8ad7e1ad32ba4e5cc827ab5c3193&queryName=SocialDashCommentsBySocialDetail&variables=(origins:List(),sortOrder:REVERSE_CHRONOLOGICAL,count:100,socialDetailUrn:urn%3Ali%3Afsd_socialDetail%3A%28urn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3Aactivity%3A{post_id}%2Curn%3Ali%3AhighlightedReply%3A-%29,start:{start})"
        self.post_id = None
        self.num_comments = 100  # comments requested per page

    def fetch_comments(self, post_id: str):
        """Page through the comments on a post, 100 at a time."""
        all_comments = []
        self.post_id = post_id

        for start in range(0, 100_000, self.num_comments):
            logger.info(f"Fetching comments for post {post_id}, start {start}")

            ep = self.endpoint.format(post_id=post_id, start=start)
            res = self.session.get(ep)
            logger.debug(f"comments info, status code - {res.status_code}")

            if res.status_code == 429:
                raise TooManyRequests("429 Too Many Requests")
            if not res.ok:
                logger.debug(res.text[:200])
                break
            try:
                comments_json = res.json()
            except json.decoder.JSONDecodeError:
                logger.debug(res.text[:200])
                break

            comments, num_results = self.parse_comments(comments_json)
            all_comments.extend(comments)
            if not num_results:
                break

        return all_comments

    def parse_comments(self, comments_json: dict):
        """Parse the comments out of one page of the Voyager response."""
        comments = []
        elements = (
            comments_json.get("data", {})
            .get("socialDashCommentsBySocialDetail", {})
            .get("elements", [])
        )
        for element in elements:
            commenter = element["commenter"]
            internal_profile_id = commenter["commenterProfileId"]
            name = commenter["title"]["text"]
            linkedin_id_match = re.search("/in/(.+)", commenter["navigationUrl"])
            linkedin_id = linkedin_id_match.group(1) if linkedin_id_match else None

            commentary = element.get("commentary", {}).get("text", "")
            comment = Comment(
                post_id=self.post_id,
                internal_profile_id=internal_profile_id,
                public_profile_id=linkedin_id,
                name=name,
                text=commentary,
                created_at=dt.utcfromtimestamp(element["createdAt"] / 1000),
            )
            comments.append(comment)

        return comments, len(elements)
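
For reference, `parse_comments` only touches a handful of keys in the Voyager response. A sketch of the expected shape, with invented values and everything else omitted (the real payload carries far more fields):

```python
# Invented example; only the keys parse_comments reads are shown.
example_page = {
    "data": {
        "socialDashCommentsBySocialDetail": {
            "elements": [
                {
                    "commenter": {
                        "commenterProfileId": "ACoAAAExample",  # internal id
                        "title": {"text": "Jane Doe"},          # display name
                        "navigationUrl": "https://www.linkedin.com/in/janedoe",
                    },
                    "commentary": {"text": "Great post!"},
                    "createdAt": 1729382400000,                 # epoch millis
                }
            ]
        }
    }
}
# Feeding this to parse_comments() yields one Comment whose
# public_profile_id is "janedoe" and text is "Great post!".
```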
20 changes: 20 additions & 0 deletions staffspy/utils/models.py
@@ -1,10 +1,30 @@
from datetime import datetime, date

from pydantic import BaseModel
from datetime import datetime as dt

from staffspy.utils.utils import extract_emails_from_text


class Comment(BaseModel):
    post_id: str
    internal_profile_id: str | None = None
    public_profile_id: str | None = None
    name: str | None = None
    text: str | None = None
    created_at: dt | None = None

    def to_dict(self):
        return {
            "post_id": self.post_id,
            "internal_profile_id": self.internal_profile_id,
            "public_profile_id": self.public_profile_id,
            "name": self.name,
            "text": self.text,
            "created_at": self.created_at,
        }
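
A minimal sketch of the model in isolation, with invented values; pydantic validates the fields, and `to_dict()` produces the flat row that `scrape_comments` turns into a DataFrame:

```python
from datetime import datetime as dt

comment = Comment(
    post_id="7252421958540091394",       # required field
    name="Jane Doe",                     # invented example values
    text="Great post!",
    created_at=dt(2024, 10, 20, 12, 0),
)
row = comment.to_dict()
# {'post_id': '7252421958540091394', 'internal_profile_id': None,
#  'public_profile_id': None, 'name': 'Jane Doe', 'text': 'Great post!',
#  'created_at': datetime.datetime(2024, 10, 20, 12, 0)}
```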


class School(BaseModel):
    start_date: date | None = None
    end_date: date | None = None