diff --git a/pyproject.toml b/pyproject.toml index 82ba55b..53b2fbf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "staffspy" -version = "0.2.9" +version = "0.2.10" description = "Staff scraper library for LinkedIn" authors = ["Cullen Watson "] readme = "README.md" diff --git a/staffspy/__init__.py b/staffspy/__init__.py index 06f31f0..3fabaa8 100644 --- a/staffspy/__init__.py +++ b/staffspy/__init__.py @@ -76,7 +76,7 @@ def scrape_staff( linkedin_member_df = staff_df[staff_df["name"] == "LinkedIn Member"] non_linkedin_member_df = staff_df[staff_df["name"] != "LinkedIn Member"] staff_df = pd.concat([non_linkedin_member_df, linkedin_member_df]) - logger.info(f"Scraped {len(staff_df)} staff members from {company_name}") + logger.info(f"Scraped {len(staff_df)} staff members from {company_name}, with {len(linkedin_member_df)} hidden LinkedIn users") return staff_df def scrape_users( diff --git a/staffspy/linkedin/employee.py b/staffspy/linkedin/employee.py index 405b7ad..364fcf2 100644 --- a/staffspy/linkedin/employee.py +++ b/staffspy/linkedin/employee.py @@ -43,16 +43,18 @@ def fetch_employee(self, base_staff, domain): def parse_emp(self, emp: Staff, emp_dict: dict): """Parse the employee data from the employee profile.""" - try: - photo_data = emp_dict["profilePicture"]["displayImageReference"][ - "vectorImage" - ] - photo_base_url = photo_data["rootUrl"] - photo_ext_url = photo_data["artifacts"][-1]["fileIdentifyingUrlPathSegment"] - profile_photo = f"{photo_base_url}{photo_ext_url}" - except (KeyError, TypeError, IndexError, ValueError) as e: - profile_photo = None + def get_photo_url(emp_dict: dict, key: str): + try: + photo_data = emp_dict[key]["displayImageReference"]["vectorImage"] + photo_base_url = photo_data["rootUrl"] + photo_ext_url = photo_data["artifacts"][-1]["fileIdentifyingUrlPathSegment"] + return f"{photo_base_url}{photo_ext_url}" + except (KeyError, TypeError, IndexError, ValueError): + return None + + emp.profile_photo = get_photo_url(emp_dict, "profilePicture") + emp.banner_photo = get_photo_url(emp_dict, "backgroundPicture") emp.profile_id = emp_dict["publicIdentifier"] try: emp.headline = emp_dict.get('headline') @@ -62,10 +64,10 @@ def parse_emp(self, emp: Staff, emp_dict: dict): pass emp.is_connection = next(iter(emp_dict['memberRelationship']['memberRelationshipUnion'])) == 'connection' emp.open_to_work = emp_dict['profilePicture'].get('frameType')=='OPEN_TO_WORK' + emp.is_hiring = emp_dict['profilePicture'].get('frameType')=='HIRING' emp.profile_link = f'https://www.linkedin.com/in/{emp_dict["publicIdentifier"]}' - emp.profile_photo = profile_photo emp.first_name = emp_dict["firstName"] emp.last_name = emp_dict["lastName"].split(',')[0] emp.potential_emails = utils.create_emails( diff --git a/staffspy/linkedin/skills.py b/staffspy/linkedin/skills.py index 38a530a..0daaf54 100644 --- a/staffspy/linkedin/skills.py +++ b/staffspy/linkedin/skills.py @@ -43,18 +43,23 @@ def parse_skills(self, sections): "components" ]["elements"] for elem in elems: + passed_assessment,endorsements = None,0 entity = elem["components"]["entityComponent"] name = entity["titleV2"]["text"]["text"] if name in names: continue names.add(name) - try: - endorsements = int( - entity["subComponents"]["components"][0]["components"][ - "insightComponent" - ]["text"]["text"]["text"].replace(" endorsements", "") - ) - except: - endorsements = 0 - skills.append(Skill(name=name, endorsements=endorsements)) + components = entity["subComponents"]["components"] + for component in components: + + try: + candidate = component["components"]["insightComponent"]["text"]["text"]["text"] + if " endorsements" in candidate: + endorsements = int(candidate.replace(" endorsements", "")) + if "Passed LinkedIn Skill Assessment" in candidate: + passed_assessment = True + except: + pass + + skills.append(Skill(name=name, endorsements=endorsements, passed_assessment=passed_assessment)) return skills diff --git a/staffspy/utils/models.py b/staffspy/utils/models.py index e9d006c..4d14722 100644 --- a/staffspy/utils/models.py +++ b/staffspy/utils/models.py @@ -23,11 +23,13 @@ def to_dict(self): class Skill(BaseModel): name: str | None = None endorsements: int | None = None + passed_assessment: bool | None = None def to_dict(self): return { "name": self.name, "endorsements": self.endorsements if self.endorsements else 0, + "passed_assessment": self.passed_assessment } @@ -94,7 +96,9 @@ class Staff(BaseModel): creator: bool | None = None premium: bool | None = None open_to_work: bool | None = None + is_hiring: bool | None = None profile_photo: str | None = None + banner_photo: str | None = None skills: list[Skill] | None = None experiences: list[Experience] | None = None certifications: list[Certification] | None = None @@ -156,6 +160,7 @@ def to_dict(self): "creator": self.creator, "influencer": self.influencer, "open_to_work": self.open_to_work, + "is_hiring": self.is_hiring, "current_position":self.current_position, "current_company": top_three_companies[0], "past_company_1": top_three_companies[1], @@ -186,6 +191,7 @@ def to_dict(self): "potential_emails": ', '.join(self.potential_emails) if self.potential_emails else None, "profile_link": self.profile_link, "profile_photo": self.profile_photo, + "banner_photo": self.banner_photo, } def estimate_age_based_on_education(self):