enh: login with api (#16)
cullenwatson authored Jul 26, 2024
1 parent 4b99511 commit 345ed6b
Showing 9 changed files with 1,198 additions and 402 deletions.
31 changes: 29 additions & 2 deletions README.md
@@ -30,15 +30,33 @@ staff = scrape_staff(
    search_term="software engineer", # optional
    location="london", # optional
    extra_profile_data=True, # fetch all past experiences, schools, & skills

    username="[email protected]",
    password="mypassword",
    capsolver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC",


    max_results=50, # can go up to 1000
    session_file=str(session_file), # save browser cookies
    session_file=str(session_file), # save login cookies to only log in once
    log_level=1,
)
filename = "staff.csv"
staff.to_csv(filename, index=False)
```
A browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping.

### Two login methods

#### Requests login
If you pass in a ```username``` & ```password```, StaffSpy will sign in via the LinkedIn API. If hit with a captcha, you also need to pass a ```capsolver_api_key```.
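
A minimal sketch of the requests-based login; the company name, credentials, and capsolver key below are placeholders:

```python
from staffspy import scrape_staff

# placeholder credentials and capsolver key; the key is only used
# if LinkedIn serves a captcha during the API login
staff = scrape_staff(
    company_name="openai",
    username="[email protected]",
    password="mypassword",
    capsolver_api_key="CAP-XXXXXXXXXXXXXXXX",
    max_results=50,
)
```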


#### Browser login

If that fails or you'd rather use a browser, install the browser add-on for StaffSpy.

```pip install staffspy[browser]```

If you do not pass the ```username``` & ```password``` params, a browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping.
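
A minimal sketch of the browser-based login; the company name and session file path are placeholders:

```python
from staffspy import scrape_staff

# no username/password: a browser window opens for a one-time manual login,
# and the cookies are cached in session_file for later runs
staff = scrape_staff(
    company_name="openai",
    session_file="session.pkl",
    max_results=50,
)
```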

### Partial Output
| name | position | profile_id | first_name | last_name | potential_email | company | school | location | followers | connections | premium |
@@ -74,6 +92,15 @@ Optional
| file path to save session cookies, so only one manual login is needed.
| can use multiple profiles this way
├── username (str):
| linkedin account email
├── password (str):
| linkedin account password
|
├── capsolver_api_key (str):
| solves the captcha using capsolver.com if hit with a captcha on login
├── log_level (int):
| Controls the verbosity of the runtime printouts
| (0 prints only errors, 1 is info, 2 is all logs. Default is 0.)
793 changes: 428 additions & 365 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions pyproject.toml
@@ -1,19 +1,23 @@
[tool.poetry]
name = "staffspy"
version = "0.1.17"
version = "0.2.0"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
selenium = "^4.21.0"
pydantic = "^2.7.2"
pandas = "^2.2.2"
requests = "^2.32.3"
tldextract = "^5.1.2"
selenium = { version = "^4.3.0", optional = true }
tenacity = "^8.5.0"
python-dateutil = "^2.9.0.post0"
beautifulsoup4 = "^4.12.3"

[tool.poetry.extras]
browser = ["selenium"]

[tool.poetry.group.dev.dependencies]
pre-commit = "^3.7.1"
6 changes: 5 additions & 1 deletion staffspy/__init__.py
@@ -14,9 +14,13 @@ def scrape_staff(
    extra_profile_data: bool = False,
    max_results: int = 1000,
    log_level: int = 0,
    username: str = None,
    password: str = None,
    capsolver_api_key: str = None
) -> pd.DataFrame:
    set_logger_level(log_level)
    li = LinkedInScraper(session_file)

    li = LinkedInScraper(session_file, username, password, capsolver_api_key)

    staff = li.scrape_staff(
        company_name=company_name,
45 changes: 45 additions & 0 deletions staffspy/capsolver.py
@@ -0,0 +1,45 @@
import requests
import json
import time
from tenacity import retry, stop_after_attempt, retry_if_result

# Arkose Labs (FunCaptcha) public key and hosting page used for the solve task
public_key = "3117BF26-4762-4F5A-8ED9-A85E69209A46"
page_url = "https://iframe.arkoselabs.com"


def is_none(value):
    return value is None


# retry the whole solve up to 10 times whenever it returns None
@retry(stop=stop_after_attempt(10), retry=retry_if_result(is_none))
def capsolver(blob_data: str, api_key: str):
    from staffspy.utils import logger

    payload = {
        "clientKey": api_key,
        "task": {
            "type": 'FunCaptchaTaskProxyLess',
            "websitePublicKey": public_key,
            "websiteURL": page_url,
            "data": json.dumps({"blob": blob_data}) if blob_data else ''
        }
    }
    res = requests.post("https://api.capsolver.com/createTask", json=payload)
    resp = res.json()
    task_id = resp.get("taskId")
    if not task_id:
        logger.info(f"Failed to create task: {res.text}")
        return None
    logger.info(f"Got captcha solver taskId: {task_id} / Getting result...")

    while True:
        time.sleep(1)  # poll capsolver once per second
        payload = {"clientKey": api_key, "taskId": task_id}
        res = requests.post("https://api.capsolver.com/getTaskResult", json=payload)
        resp = res.json()
        status = resp.get("status")
        if status == "ready":
            return resp.get("solution", {}).get('token')
        if status == "failed" or resp.get("errorId"):
            logger.info(f"Captcha solve failed! response: {res.text}")
            return None
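
A hypothetical usage sketch of the new helper; the blob value below is a placeholder, and in StaffSpy it would come from LinkedIn's login challenge response:

```python
from tenacity import RetryError

from staffspy.capsolver import capsolver

try:
    # poll capsolver.com for a FunCaptcha token; the decorator retries
    # the whole solve up to 10 times if it keeps returning None
    token = capsolver(blob_data="<challenge blob>", api_key="CAP-XXXXXXXXXXXXXXXX")
except RetryError:
    token = None  # every attempt failed

if token:
    print("captcha token ready to attach to the login request")
```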
