enh: login with api (#16)
cullenwatson authored Jul 26, 2024
1 parent 4b99511 commit 345ed6b
Showing 9 changed files with 1,198 additions and 402 deletions.
31 changes: 29 additions & 2 deletions README.md
@@ -30,15 +30,33 @@ staff = scrape_staff(
    search_term="software engineer", # optional
    location="london", # optional
    extra_profile_data=True, # fetch all past experiences, schools, & skills

    username="[email protected]",
    password="mypassword",
    capsolver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC",


    max_results=50, # can go up to 1000
    session_file=str(session_file), # save browser cookies
    session_file=str(session_file), # save login cookies to only log in once
    log_level=1,
)
filename = "staff.csv"
staff.to_csv(filename, index=False)
```
A browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping.

### Two login methods

#### Requests login
If you pass in a ```username``` & ```password```, StaffSpy will sign in via the LinkedIn API. If hit with a captcha, you also need to pass a ```capsolver_api_key```.
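
A minimal sketch of the requests-based login; the company name, credentials, and capsolver key below are placeholders:

```python
from staffspy import scrape_staff

# placeholder credentials and capsolver key; the key is only used
# if LinkedIn serves a captcha during the API login
staff = scrape_staff(
    company_name="openai",
    username="[email protected]",
    password="mypassword",
    capsolver_api_key="CAP-XXXXXXXXXXXXXXXX",
    max_results=50,
)
```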


#### Browser login

If that fails or you'd rather use a browser, install the browser add-on for StaffSpy.

```pip install staffspy[browser]```

If you do not pass the ```username``` & ```password``` params, a browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping.
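
A minimal sketch of the browser-based login; the company name and session file path are placeholders:

```python
from staffspy import scrape_staff

# no username/password: a browser window opens for a one-time manual login,
# and the cookies are cached in session_file for later runs
staff = scrape_staff(
    company_name="openai",
    session_file="session.pkl",
    max_results=50,
)
```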

### Partial Output
| name | position | profile_id | first_name | last_name | potential_email | company | school | location | followers | connections | premium |
@@ -74,6 +92,15 @@ Optional
| file path to save session cookies, so only one manual login is needed.
| can use multiple profiles this way
├── username (str):
| linkedin account email
├── password (str):
| linkedin account password
|
├── capsolver_api_key (str):
| solves the captcha using capsolver.com if hit with a captcha on login
├── log_level (int):
| Controls the verbosity of the runtime printouts
| (0 prints only errors, 1 is info, 2 is all logs. Default is 0.)
793 changes: 428 additions & 365 deletions poetry.lock

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions pyproject.toml
@@ -1,19 +1,23 @@
[tool.poetry]
name = "staffspy"
version = "0.1.17"
version = "0.2.0"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <[email protected]>"]
readme = "README.md"

[tool.poetry.dependencies]
python = "^3.10"
selenium = "^4.21.0"
pydantic = "^2.7.2"
pandas = "^2.2.2"
requests = "^2.32.3"
tldextract = "^5.1.2"
selenium = { version = "^4.3.0", optional = true }
tenacity = "^8.5.0"
python-dateutil = "^2.9.0.post0"
beautifulsoup4 = "^4.12.3"

[tool.poetry.extras]
browser = ["selenium"]

[tool.poetry.group.dev.dependencies]
pre-commit = "^3.7.1"
6 changes: 5 additions & 1 deletion staffspy/__init__.py
@@ -14,9 +14,13 @@ def scrape_staff(
    extra_profile_data: bool = False,
    max_results: int = 1000,
    log_level: int = 0,
    username: str = None,
    password: str = None,
    capsolver_api_key: str = None
) -> pd.DataFrame:
    set_logger_level(log_level)
    li = LinkedInScraper(session_file)

    li = LinkedInScraper(session_file, username, password, capsolver_api_key)

    staff = li.scrape_staff(
        company_name=company_name,
45 changes: 45 additions & 0 deletions staffspy/capsolver.py
@@ -0,0 +1,45 @@
import requests
import json
import time
from tenacity import retry, stop_after_attempt, retry_if_result

# Arkose Labs (FunCaptcha) public key and hosting page used for the solve task
public_key = "3117BF26-4762-4F5A-8ED9-A85E69209A46"
page_url = "https://iframe.arkoselabs.com"


def is_none(value):
    return value is None


# retry the whole solve up to 10 times whenever it returns None
@retry(stop=stop_after_attempt(10), retry=retry_if_result(is_none))
def capsolver(blob_data: str, api_key: str):
    from staffspy.utils import logger

    payload = {
        "clientKey": api_key,
        "task": {
            "type": 'FunCaptchaTaskProxyLess',
            "websitePublicKey": public_key,
            "websiteURL": page_url,
            "data": json.dumps({"blob": blob_data}) if blob_data else ''
        }
    }
    res = requests.post("https://api.capsolver.com/createTask", json=payload)
    resp = res.json()
    task_id = resp.get("taskId")
    if not task_id:
        logger.info(f"Failed to create task: {res.text}")
        return None
    logger.info(f"Got captcha solver taskId: {task_id} / Getting result...")

    while True:
        time.sleep(1)  # poll capsolver once per second
        payload = {"clientKey": api_key, "taskId": task_id}
        res = requests.post("https://api.capsolver.com/getTaskResult", json=payload)
        resp = res.json()
        status = resp.get("status")
        if status == "ready":
            return resp.get("solution", {}).get('token')
        if status == "failed" or resp.get("errorId"):
            logger.info(f"Captcha solve failed! response: {res.text}")
            return None
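
A hypothetical usage sketch of the new helper; the blob value below is a placeholder, and in StaffSpy it would come from LinkedIn's login challenge response:

```python
from tenacity import RetryError

from staffspy.capsolver import capsolver

try:
    # poll capsolver.com for a FunCaptcha token; the decorator retries
    # the whole solve up to 10 times if it keeps returning None
    token = capsolver(blob_data="<challenge blob>", api_key="CAP-XXXXXXXXXXXXXXXX")
except RetryError:
    token = None  # every attempt failed

if token:
    print("captcha token ready to attach to the login request")
```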
