From 7728194b7b23a8a4074704302cb03b50fb371681 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 2 Aug 2024 11:56:10 -0500 Subject: [PATCH 1/5] docs:readme --- README.md | 97 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index dc4bc6d..2862584 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ account = LinkedInAccount( password="mypassword", solver_api_key="CAP-6D6A8CE981803A309A0D531F8B4790BC", # optional but needed if hit with captcha solver_service=SolverType.CAPSOLVER, - + session_file=str(session_file), # save login cookies to only log in once (lasts a week or so) log_level=1, # 0 for no logs ) @@ -56,68 +56,89 @@ users.to_csv("users.csv", index=False) If you rather use a browser to log in, install the browser add-on to StaffSpy . -```pip install staffspy[browser]``` +`pip install staffspy[browser]` -Do not pass the ```username``` & ```password``` params, then a browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping. +Do not pass the `username` & `password` params, then a browser will open to sign in to LinkedIn on the first sign-in. Press enter after signing in to begin scraping. 
### Output -| profile_id | name | first_name | last_name | location | age | position | followers | connections | premium | company | past_company1 | past_company2 | school | extra_school | skill1 | skill2 | skill3 | is_connection | premium | creator | potential_email | profile_link | profile_photo | -|----------------|---------------|------------|-----------|------------------------------------------|-----|--------------------------------------------|-----------|-------------|---------|---------|---------------|---------------|-----------------------------------------------|-------------------------------|-----------|-------------|------------|---------------|----------|---------|----------------------------------------------|---------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------| -| javiersierra2102 | Javier Sierra | Javier | Sierra | London, England, United Kingdom | 39 | Software Engineer | 735 | 725 | FALSE | OpenAI | Meta | Oculus VR | Hult International Business School | Universidad Simón Bolívar | Java | JavaScript | C++ | FALSE | FALSE | FALSE | javier.sierra@openai.com, jsierra@openai.com | https://www.linkedin.com/in/javiersierra2102 | https://media.licdn.com/dms/image/C4D03AQHEyUg1kGT08Q/profile-displayphoto-shrink_800_800/0/1516504680512?e=1727913600&v=beta&t=3enCmNDBtJ7LxfbW6j1hDD8qNtHjO2jb2XTONECxUXw | -| dougli | Douglas Li | Douglas | Li | London, England, United Kingdom | 37 | @ OpenAI UK, previously at Meta | 583 | 401 | FALSE | OpenAI | Shift Lab | Facebook | Washington University in St. 
Louis | | Java | Python | JavaScript | FALSE | TRUE | FALSE | douglas.li@openai.com, dli@openai.com | https://www.linkedin.com/in/dougli | https://media.licdn.com/dms/image/D4E03AQETmRyb3_GB8A/profile-displayphoto-shrink_800_800/0/1687996628597?e=1727913600&v=beta&t=HRYGJ4RxsTMcPF1YcSikXlbz99hx353csho3PWT6fOQ | -| nkartashov | Nick Kartashov| Nick | Kartashov | London, England, United Kingdom | 33 | Software Engineer | 2186 | 2182 | TRUE | OpenAI | Google | DeepMind | St. Petersburg Academic University | Bioinformatics Institute | Teamwork | Java | Haskell | FALSE | FALSE | FALSE | nick.kartashov@openai.com, nkartashov@openai.com | https://www.linkedin.com/in/nkartashov | https://media.licdn.com/dms/image/D4E03AQEjOKxC5UgwWw/profile-displayphoto-shrink_800_800/0/1680706122689?e=1727913600&v=beta&t=m-JnG9nm0zxp1Z7njnInwbCoXyqa3AN-vJZntLfbzQ4 | +| profile_id | name | first_name | last_name | location | age | position | followers | connections | premium | company | past_company1 | past_company2 | school | extra_school | skill1 | skill2 | skill3 | is_connection | premium | creator | potential_email | profile_link | profile_photo | +| ---------------- | -------------- | ---------- | --------- | ------------------------------- | --- | ------------------------------- | --------- | ----------- | ------- | ------- | ------------- | ------------- | ---------------------------------- | ------------------------- | -------- | ---------- | ---------- | ------------- | ------- | ------- | ------------------------------------------------ | -------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| javiersierra2102 | Javier Sierra | Javier | Sierra | London, England, United Kingdom | 39 | Software Engineer | 735 | 725 | FALSE | OpenAI | Meta | Oculus VR | Hult International Business School | Universidad Simón Bolívar | 
Java | JavaScript | C++ | FALSE | FALSE | FALSE | javier.sierra@openai.com, jsierra@openai.com | https://www.linkedin.com/in/javiersierra2102 | https://media.licdn.com/dms/image/C4D03AQHEyUg1kGT08Q/profile-displayphoto-shrink_800_800/0/1516504680512?e=1727913600&v=beta&t=3enCmNDBtJ7LxfbW6j1hDD8qNtHjO2jb2XTONECxUXw | +| dougli | Douglas Li | Douglas | Li | London, England, United Kingdom | 37 | @ OpenAI UK, previously at Meta | 583 | 401 | FALSE | OpenAI | Shift Lab | Facebook | Washington University in St. Louis | | Java | Python | JavaScript | FALSE | TRUE | FALSE | douglas.li@openai.com, dli@openai.com | https://www.linkedin.com/in/dougli | https://media.licdn.com/dms/image/D4E03AQETmRyb3_GB8A/profile-displayphoto-shrink_800_800/0/1687996628597?e=1727913600&v=beta&t=HRYGJ4RxsTMcPF1YcSikXlbz99hx353csho3PWT6fOQ | +| nkartashov | Nick Kartashov | Nick | Kartashov | London, England, United Kingdom | 33 | Software Engineer | 2186 | 2182 | TRUE | OpenAI | Google | DeepMind | St. Petersburg Academic University | Bioinformatics Institute | Teamwork | Java | Haskell | FALSE | FALSE | FALSE | nick.kartashov@openai.com, nkartashov@openai.com | https://www.linkedin.com/in/nkartashov | https://media.licdn.com/dms/image/D4E03AQEjOKxC5UgwWw/profile-displayphoto-shrink_800_800/0/1680706122689?e=1727913600&v=beta&t=m-JnG9nm0zxp1Z7njnInwbCoXyqa3AN-vJZntLfbzQ4 | +### Parameters for `LinkedInAccount()` +```plaintext +Optional +├── session_file (str): +| file path to save session cookies, so only one manual login is needed. 
+| can use multiple profiles this way +| +| For automated login +├── username (str): +| linkedin account email +│ +├── password (str): +| linkedin account password +| +├── solver_service (SolverType): +| solves the captcha using the desired service - either CapSolver, or 2Captcha (worse of the two) +| +├── solver_api_key (str): +| api key for the solver provider +│ +├── log_level (int): +| Controls the verbosity of the runtime printouts +| (0 prints only errors, 1 is info, 2 is all logs. Default is 0.) +``` ### Parameters for `scrape_staff()` ```plaintext -Optional -├── company_name (str): +Optional +├── company_name (str): | company identifier on linkedin, will search for that company if that company id does not exist | e.g. openai from https://www.linkedin.com/company/openai | -├── user_id (str): -| alternative to company_name, provide user identifier on linkedin, will find this user's company and then proceed +├── user_id (str): +| alternative to company_name, provide user identifier on linkedin, will scrape this user's company | e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon | -├── search_term (str): +├── search_term (str): | staff title to search for | e.g. software engineer | -├── location (str): +├── location (str): | location the staff resides | e.g. london │ ├── extra_profile_data (bool) | fetches educations, experiences, skills, certifications (Default false) │ -├── max_results (int): +├── max_results (int): | number of staff to fetch, default/max is 1000 for a search imposed by LinkedIn -│ -├── session_file (str): -| file path to save session cookies, so only one manual login is needed. 
-| can use mult profiles this way -│ -├── username (str): -| linkedin account email -│ -├── password (str): -| linkedin account password -| -├── solver_service (SolverType): -| solves the captcha using the desired service - either CapSolver, or 2Captcha (worse of the two) -| -├── solver_api_key (str): -| api key for the solver provider -│ -├── log_level (int): -| Controls the verbosity of the runtime printouts -| (0 prints only errors, 1 is info, 2 is all logs. Default is 0.) ``` +### Parameters for `scrape_users()` + +```plaintext +Optional +├── user_ids (list): +| user ids to scrape from +| e.g. dougmcmillon from https://www.linkedin.com/in/dougmcmillon +``` + +### LinkedIn notes + + - only 1000 max results per search + - extra_profile_data increases runtime by O(n) + - if rate limited, the program will exit + + ### Staff Schema + ```plaintext Staff ├── Personal Information @@ -175,9 +196,7 @@ Staff ├── school └── degree ``` -### LinkedIn notes - - only 1000 max results per search - - extra_profile_data increases runtime by O(n) +--- ## Frequently Asked Questions @@ -196,5 +215,3 @@ Staff **Q: Encountering issues with your queries?** **A:** If problems persist, [submit an issue](https://github.com/cullenwatson/StaffSpy/issues). - ---- From b2049f3a0a2f6160c80022d1e26c6c7f401f885f Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 2 Aug 2024 11:57:05 -0500 Subject: [PATCH 2/5] docs:readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 2862584..7ec84ac 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,6 @@ Optional ### Parameters for `scrape_users()` ```plaintext -Optional ├── user_ids (list): | user ids to scrape from | e.g. 
dougmcmillon from https://www.linkedin.com/in/dougmcmillon From 5a4e3fda1d08de0d76b7428597d830f8abe0270f Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 2 Aug 2024 12:06:09 -0500 Subject: [PATCH 3/5] docs:readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7ec84ac..70d6424 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ Optional - only 1000 max results per search - extra_profile_data increases runtime by O(n) - - if rate limited, the program will exit + - if rate limited, the program will stop scraping ### Staff Schema From 8ffa44808944629de2c05507263694408e362756 Mon Sep 17 00:00:00 2001 From: Cullen Watson Date: Fri, 2 Aug 2024 13:53:38 -0500 Subject: [PATCH 4/5] docs:readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 70d6424..dc4c436 100644 --- a/README.md +++ b/README.md @@ -134,6 +134,7 @@ Optional - only 1000 max results per search - extra_profile_data increases runtime by O(n) - if rate limited, the program will stop scraping + - if using non-browser sign in, turn off 2fa ### Staff Schema From 86c5849541b407a28676232251732bb0c6724202 Mon Sep 17 00:00:00 2001 From: GrimsGreen <156634727+GrimsGreen@users.noreply.github.com> Date: Sun, 4 Aug 2024 06:27:31 +0330 Subject: [PATCH 5/5] Fix company_name parser (#35) --- staffspy/linkedin/linkedin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/staffspy/linkedin/linkedin.py b/staffspy/linkedin/linkedin.py index c5249b3..d282f4f 100644 --- a/staffspy/linkedin/linkedin.py +++ b/staffspy/linkedin/linkedin.py @@ -117,7 +117,7 @@ def get_company_id_and_staff_count(self, company_name: str): company_id = company["trackingInfo"]["objectUrn"].split(":")[-1] try: - company_name = re.search(r'/company/([^/]+)', company['url']).group(1) + company_name = company["universalName"] except: pass