-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4b99511
commit 345ed6b
Showing
9 changed files
with
1,198 additions
and
402 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,15 +30,33 @@ staff = scrape_staff( | |
search_term="software engineer", # optional | ||
location="london", # optional | ||
extra_profile_data=True, # fetch all past experiences, schools, & skills | ||
|
||
username="[email protected]", | ||
password="mypassword", | ||
capsolver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", | ||
|
||
|
||
max_results=50, # can go up to 1000 | ||
session_file=str(session_file), # save browser cookies | ||
session_file=str(session_file), # save log in cookies to only log in once | ||
log_level=1, | ||
) | ||
filename = f"staff.csv" | ||
staff.to_csv(filename, index=False) | ||
``` | ||
A browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping. | ||
|
||
### Two login methods | ||
|
||
#### Requests login | ||
If you pass in a ```username``` & ```password```, it will sign in via the LinkedIn API. If you are hit with a captcha, you also need to pass ```capsolver_api_key```. | ||
|
||
|
||
#### Browser login | ||
|
||
If that fails or you would rather use a browser, install the browser add-on for StaffSpy. | ||
|
||
```pip install staffspy[browser]``` | ||
|
||
If you do not pass the ```username``` & ```password``` params, a browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping. | ||
|
||
### Partial Output | ||
| name | position | profile_id | first_name | last_name | potential_email | company | school | location | followers | connections | premium | | ||
|
@@ -74,6 +92,15 @@ Optional | |
| file path to save session cookies, so only one manual login is needed. | ||
| can use multiple profiles this way | ||
│ | ||
├── username (str): | ||
| linkedin account email | ||
│ | ||
├── password (str): | ||
| linkedin account password | ||
| | ||
├── capsolver_api_key (str): | ||
| solves the captcha using capsolver.com if hit with captcha on login | ||
│ | ||
├── log_level (int): | ||
| Controls the verbosity of the runtime printouts | ||
| (0 prints only errors, 1 is info, 2 is all logs. Default is 0.) | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,23 @@ | ||
[tool.poetry] | ||
name = "staffspy" | ||
version = "0.1.17" | ||
version = "0.2.0" | ||
description = "Staff scraper library for LinkedIn" | ||
authors = ["Cullen Watson <[email protected]>"] | ||
readme = "README.md" | ||
|
||
[tool.poetry.dependencies] | ||
python = "^3.10" | ||
selenium = "^4.21.0" | ||
pydantic = "^2.7.2" | ||
pandas = "^2.2.2" | ||
requests = "^2.32.3" | ||
tldextract = "^5.1.2" | ||
selenium = { version = "^4.3.0", optional = true } | ||
tenacity = "^8.5.0" | ||
python-dateutil = "^2.9.0.post0" | ||
beautifulsoup4 = "^4.12.3" | ||
|
||
[tool.poetry.extras] | ||
browser = ["selenium"] | ||
|
||
[tool.poetry.group.dev.dependencies] | ||
pre-commit = "^3.7.1" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
import requests | ||
import json | ||
import time | ||
from tenacity import retry, stop_after_attempt, retry_if_result | ||
|
||
public_key = "3117BF26-4762-4F5A-8ED9-A85E69209A46" | ||
page_url = "https://iframe.arkoselabs.com" | ||
|
||
|
||
def is_none(value):
    """Return True only when ``value`` is the ``None`` singleton.

    Used as the predicate for the tenacity ``retry_if_result`` condition, so
    a ``None`` result from the decorated function triggers another attempt.
    """
    if value is None:
        return True
    return False
|
||
|
||
@retry(stop=stop_after_attempt(10), retry=retry_if_result(is_none))
def capsolver(blob_data: str, api_key: str):
    """Solve a FunCaptcha challenge through the capsolver.com HTTP API.

    Creates a ``FunCaptchaTaskProxyLess`` task for the module-level
    ``public_key`` / ``page_url`` and then polls once per second until the
    task is reported as ready or failed.

    Args:
        blob_data: Challenge blob forwarded to the solver as ``{"blob": ...}``;
            when falsy, an empty ``data`` string is sent instead.
        api_key: capsolver client key.

    Returns:
        The solution token on success, or ``None`` when the task could not be
        created or the solver reported a failure. Because of the ``@retry``
        decorator a ``None`` result triggers another attempt (up to 10
        attempts in total).
    """
    # Imported lazily, as in the original code (presumably to avoid a
    # circular import with staffspy.utils -- TODO confirm).
    from staffspy.utils import logger

    # Never let a stalled HTTP connection block the scraper forever.
    request_timeout = 30

    payload = {
        "clientKey": api_key,
        "task": {
            "type": "FunCaptchaTaskProxyLess",
            "websitePublicKey": public_key,
            "websiteURL": page_url,
            "data": json.dumps({"blob": blob_data}) if blob_data else "",
        },
    }
    res = requests.post(
        "https://api.capsolver.com/createTask",
        json=payload,
        timeout=request_timeout,
    )
    resp = res.json()
    task_id = resp.get("taskId")
    if not task_id:
        # Use a %s placeholder: passing the text as a bare extra argument
        # makes the logging module fail to format the record.
        logger.info("Failed to create task: %s", res.text)
        return None
    logger.info(f"Got captcha solver taskId: {task_id} / Getting result...")

    while True:
        time.sleep(1)  # delay between polls
        payload = {"clientKey": api_key, "taskId": task_id}
        res = requests.post(
            "https://api.capsolver.com/getTaskResult",
            json=payload,
            timeout=request_timeout,
        )
        resp = res.json()
        status = resp.get("status")
        if status == "ready":
            # Guard against an explicit ``"solution": null`` in the response.
            return (resp.get("solution") or {}).get("token")
        if status == "failed" or resp.get("errorId"):
            logger.info("Captcha solve failed! response: %s", res.text)
            return None
Oops, something went wrong.