Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
323 changes: 322 additions & 1 deletion linkedin_mcp_server/scraping/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from dataclasses import dataclass
import logging
import re
from typing import Any, Literal
from typing import Any, Callable, Awaitable, Literal
from urllib.parse import quote_plus

from patchright.async_api import Page, TimeoutError as PlaywrightTimeoutError
Expand All @@ -17,6 +17,7 @@
from linkedin_mcp_server.core.exceptions import (
AuthenticationError,
LinkedInScraperException,
RateLimitError,
)
from linkedin_mcp_server.debug_trace import record_page_trace
from linkedin_mcp_server.debug_utils import stabilize_navigation
Expand Down Expand Up @@ -155,6 +156,87 @@ def _truncate_linkedin_noise(text: str) -> str:
return text[:earliest].strip()


def _parse_contact_record(
profile_text: str, contact_text: str
) -> dict[str, str | None]:
"""Parse raw innerText blobs into structured contact fields.

Profile text layout (first lines):
Name\\n\\n· 1st\\n\\nHeadline\\n\\nLocation\\n\\n·\\n\\nContact info\\n\\nCompany

Contact info overlay layout:
Email\\n\\nuser@example.com\\n\\nPhone\\n\\n+123...\\n\\n...
"""
result: dict[str, str | None] = {
"first_name": None,
"last_name": None,
"headline": None,
"location": None,
"company": None,
"email": None,
"phone": None,
"website": None,
"birthday": None,
}

# --- Parse profile text ---
if profile_text:
lines = [ln.strip() for ln in profile_text.split("\n")]
non_empty = [ln for ln in lines if ln]

if non_empty:
# Line 1 → full name
full_name = non_empty[0]
parts = full_name.split(None, 1)
result["first_name"] = parts[0] if parts else full_name
result["last_name"] = parts[1] if len(parts) > 1 else None

# Find connection degree marker (· 1st, · 2nd, · 3rd, · 3rd+)
degree_idx: int | None = None
for i, ln in enumerate(non_empty):
if re.match(r"^·\s*\d+(st|nd|rd|th)\+?$", ln):
degree_idx = i
break

if degree_idx is not None and degree_idx + 1 < len(non_empty):
result["headline"] = non_empty[degree_idx + 1]

# Location is the next non-empty line after headline
if degree_idx + 2 < len(non_empty):
candidate = non_empty[degree_idx + 2]
# Skip if it's just the "·" separator or "Contact info"
if candidate not in ("·", "Contact info"):
result["location"] = candidate

# Company: line after "Contact info"
for i, ln in enumerate(non_empty):
if ln == "Contact info" and i + 1 < len(non_empty):
result["company"] = non_empty[i + 1]
break

# --- Parse contact info overlay ---
if contact_text:
# Extract labeled fields: "Label\n\nvalue"
for field, label in [
("email", "Email"),
("phone", "Phone"),
("birthday", "Birthday"),
]:
match = re.search(
rf"(?:^|\n){re.escape(label)}\s*\n\s*\n\s*(.+)",
contact_text,
)
if match:
result[field] = match.group(1).strip()

# Website may include a type annotation like "(Blog)" or "(Portfolio)"
match = re.search(r"(?:^|\n)Website\s*\n\s*\n\s*(.+)", contact_text)
if match:
result["website"] = match.group(1).strip()

return result


class LinkedInExtractor:
"""Extracts LinkedIn page content via navigate-scroll-innerText pattern."""

Expand Down Expand Up @@ -1015,15 +1097,25 @@ async def search_people(
self,
keywords: str,
location: str | None = None,
network: str | None = None,
) -> dict[str, Any]:
"""Search for people and extract the results page.

Args:
keywords: Search keywords.
location: Optional location filter.
network: Optional connection degree filter.
"F" = 1st degree, "S" = 2nd degree, "O" = 3rd+.

Returns:
{url, sections: {name: text}}
"""
params = f"keywords={quote_plus(keywords)}"
if location:
params += f"&location={quote_plus(location)}"
if network:
# LinkedIn expects network=%5B%22F%22%5D (URL-encoded ["F"])
params += f"&network=%5B%22{quote_plus(network)}%22%5D"

url = f"https://www.linkedin.com/search/results/people/?{params}"
extracted = await self.extract_page(url, section_name="search_results")
Expand Down Expand Up @@ -1156,3 +1248,232 @@ async def _extract_root_content(
{"selectors": selectors},
)
return result

# ------------------------------------------------------------------
# Connections bulk export
# ------------------------------------------------------------------

async def scrape_connections_list(
self,
limit: int = 0,
max_scrolls: int = 50,
) -> dict[str, Any]:
"""Scrape the authenticated user's connections list via infinite scroll.

Args:
limit: Maximum connections to return (0 = unlimited).
max_scrolls: Maximum scroll iterations (~1s pause each).

Returns:
{connections: [{username, name, headline}, ...], total, url, pages_visited}
"""
url = "https://www.linkedin.com/mynetwork/invite-connect/connections/"

# Navigate — handle ERR_ABORTED (page already loaded / redirect race)
try:
await self._page.goto(url, wait_until="domcontentloaded", timeout=30000)
except Exception as nav_err:
if "ERR_ABORTED" in str(nav_err):
logger.info("Navigation aborted (page may already be loaded), retrying")
await asyncio.sleep(2.0)
await self._page.goto(url, wait_until="domcontentloaded", timeout=30000)
else:
raise

await detect_rate_limit(self._page)

try:
Comment on lines +1270 to +1285
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

chunk_size=0 causes an unhandled ValueError

range(0, total, chunk_size) raises ValueError: range() arg 3 must not be zero when chunk_size is 0. This exception propagates to the tool handler in connections.py and is returned via handle_tool_error, but the error message ("range() arg 3 must not be zero") is opaque to the caller.

A simple guard at the top of the method would produce a much clearer error:

async def scrape_contact_batch(
    self,
    usernames: list[str],
    chunk_size: int = 5,
    chunk_delay: float = 30.0,
    progress_cb: Callable[[int, int], Awaitable[None]] | None = None,
) -> dict[str, Any]:
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
Prompt To Fix With AI
This is a comment left during a code review.
Path: linkedin_mcp_server/scraping/extractor.py
Line: 514-519

Comment:
**`chunk_size=0` causes an unhandled `ValueError`**

`range(0, total, chunk_size)` raises `ValueError: range() arg 3 must not be zero` when `chunk_size` is `0`. This exception propagates to the tool handler in `connections.py` and is returned via `handle_tool_error`, but the error message ("range() arg 3 must not be zero") is opaque to the caller.

A simple guard at the top of the method would produce a much clearer error:

```python
async def scrape_contact_batch(
    self,
    usernames: list[str],
    chunk_size: int = 5,
    chunk_delay: float = 30.0,
    progress_cb: Callable[[int, int], Awaitable[None]] | None = None,
) -> dict[str, Any]:
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
```

How can I resolve this? If you propose a fix, please make it concise.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in beebf5e — added if chunk_size <= 0: raise ValueError(...) guard at the top of scrape_contact_batch.

await self._page.wait_for_selector("main", timeout=10000)
except PlaywrightTimeoutError:
logger.debug("No <main> element on connections page")

await handle_modal_close(self._page)

# Deep scroll to load all connections (infinite scroll)
await scroll_to_bottom(self._page, pause_time=1.0, max_scrolls=max_scrolls)

# Stabilize — LinkedIn may trigger lazy navigations during scroll
await asyncio.sleep(1.0)

# Ensure we're still on the connections page; re-navigate if needed
current_url = self._page.url
if "/connections" not in current_url:
logger.warning(
"Page navigated away to %s during scroll, re-navigating",
current_url,
)
await self._page.goto(url, wait_until="domcontentloaded", timeout=30000)
await asyncio.sleep(2.0)

# Extract connection data from profile link elements
raw_connections: list[dict[str, str]] = await self._page.evaluate(
"""() => {
const results = [];
const seen = new Set();
const links = document.querySelectorAll('main a[href*="/in/"]');
for (const a of links) {
const href = a.getAttribute('href') || '';
const match = href.match(/\\/in\\/([^/?#]+)/);
if (!match) continue;
const username = match[1];
if (seen.has(username)) continue;
seen.add(username);

// Walk up to the connection card container
const card = a.closest('li') || a.parentElement;

// Name: try known selectors, then the link's own visible text
let name = '';
if (card) {
const nameEl = card.querySelector(
'.mn-connection-card__name, .entity-result__title-text, span[dir="ltr"], span.t-bold'
);
if (nameEl) name = nameEl.innerText.trim();
}
if (!name) {
// The profile link itself often contains the person's name
const linkText = a.innerText.trim();
if (linkText && linkText.length < 80) name = linkText;
}

// Headline: try known selectors, then parse card text
let headline = '';
if (card) {
const headlineEl = card.querySelector(
'.mn-connection-card__occupation, .entity-result__primary-subtitle, span.t-normal'
);
if (headlineEl) headline = headlineEl.innerText.trim();
}
if (!headline && card) {
// Fallback: split card text by newlines, second non-empty line is usually headline
const lines = card.innerText.split('\\n').map(l => l.trim()).filter(Boolean);
Comment on lines +1339 to +1349
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Soft rate-limit sentinel silently corrupts contact records

extract_page returns the module-level _RATE_LIMITED_MSG string sentinel ("[Rate limited] LinkedIn blocked this section…") when a soft rate limit persists after one retry, instead of raising RateLimitError. scrape_contact_batch never checks for this sentinel before calling _parse_contact_record, so the sentinel is treated as valid profile text.

The result is a silently corrupted record:

  • first_name → "[Rate"
  • last_name → "limited] LinkedIn blocked this section. Try again later or request fewer sections."
  • headline, location, company → None
  • The contact overlay is still scraped unnecessarily
  • The record is added to contacts with no error indication (only profile_raw would reveal the problem)

The same risk applies if _extract_overlay returns the sentinel for contact_text.

A guard should be added before calling _parse_contact_record:

# Scrape main profile page
profile_text = await self.extract_page(profile_url)
pages_visited.append(profile_url)

if profile_text == _RATE_LIMITED_MSG:
    logger.warning("Soft rate limit on profile %s, skipping", username)
    failed.append(username)
    await asyncio.sleep(_NAV_DELAY)
    continue

# Scrape contact info overlay
contact_text = await self._extract_overlay(contact_url)
pages_visited.append(contact_url)

if contact_text == _RATE_LIMITED_MSG:
    contact_text = ""  # fall back to empty; parsed fields will be None
Prompt To Fix With AI
This is a comment left during a code review.
Path: linkedin_mcp_server/scraping/extractor.py
Line: 560-570

Comment:
**Soft rate-limit sentinel silently corrupts contact records**

`extract_page` returns the module-level `_RATE_LIMITED_MSG` string sentinel (`"[Rate limited] LinkedIn blocked this section…"`) when a soft rate limit persists after one retry, instead of raising `RateLimitError`. `scrape_contact_batch` never checks for this sentinel before calling `_parse_contact_record`, so the sentinel is treated as valid profile text.

The result is a silently corrupted record:
- `first_name` → `"[Rate"`
- `last_name` → `"limited] LinkedIn blocked this section. Try again later or request fewer sections."`
- `headline`, `location`, `company` → `None`
- The contact overlay is still scraped unnecessarily
- The record is added to `contacts` with no error indication (only `profile_raw` would reveal the problem)

The same risk applies if `_extract_overlay` returns the sentinel for `contact_text`.

A guard should be added before calling `_parse_contact_record`:

```python
# Scrape main profile page
profile_text = await self.extract_page(profile_url)
pages_visited.append(profile_url)

if profile_text == _RATE_LIMITED_MSG:
    logger.warning("Soft rate limit on profile %s, skipping", username)
    failed.append(username)
    await asyncio.sleep(_NAV_DELAY)
    continue

# Scrape contact info overlay
contact_text = await self._extract_overlay(contact_url)
pages_visited.append(contact_url)

if contact_text == _RATE_LIMITED_MSG:
    contact_text = ""  # fall back to empty; parsed fields will be None
```

How can I resolve this? If you propose a fix, please make it concise.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in beebf5e — added guards for both _RATE_LIMITED_MSG sentinels. Profile sentinel skips the username (added to failed), contact overlay sentinel falls back to empty string.

if (lines.length >= 2) headline = lines[1];
}

results.push({ username, name, headline });
}
return results;
}"""
)

# Apply limit
if limit > 0:
raw_connections = raw_connections[:limit]
Comment on lines +1293 to +1361
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inefficient when limit is small - scrolls through all connections before truncating.

If limit=10 but user has 500 connections, this scrolls through all 500 (~8 minutes with 1s pauses), then discards 490. Consider checking len(results) >= limit inside the JavaScript loop and breaking early.

Prompt To Fix With AI
This is a comment left during a code review.
Path: linkedin_mcp_server/scraping/extractor.py
Line: 527-582

Comment:
Inefficient when `limit` is small - scrolls through all connections before truncating.

If `limit=10` but user has 500 connections, this scrolls through all 500 (~8 minutes with 1s pauses), then discards 490. Consider checking `len(results) >= limit` inside the JavaScript loop and breaking early.

How can I resolve this? If you propose a fix, please make it concise.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't fix — the suggestion to break early in the JS loop wouldn't help because the expensive part is scroll_to_bottom(), which runs before the JS extraction. By the time the DOM query executes, all scrolling is already done. Users already control scroll depth via the max_scrolls parameter (e.g. max_scrolls=3 for quick results). A proper fix would require refactoring the generic scroll_to_bottom utility to accept an early-exit predicate, which is out of scope for this PR.


return {
"connections": raw_connections,
"total": len(raw_connections),
"url": url,
"pages_visited": [url],
}

async def scrape_contact_batch(
self,
usernames: list[str],
chunk_size: int = 5,
chunk_delay: float = 30.0,
progress_cb: Callable[[int, int], Awaitable[None]] | None = None,
) -> dict[str, Any]:
"""Enrich a list of profiles with contact details in chunked batches.

For each username: scrapes main profile + contact_info overlay.

Comment on lines +1375 to +1380
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Rate-limited username is not added to failed

When RateLimitError is caught, the current username is not appended to failed before breaking out of the loop. The return value only signals rate_limited: True but doesn't record which username triggered the stop, making it difficult for callers to resume from where processing halted.

except RateLimitError:
    logger.warning("Rate limited during contact batch at %s", username)
    failed.append(username)  # record the username that triggered the stop
    rate_limited = True
    break
Prompt To Fix With AI
This is a comment left during a code review.
Path: linkedin_mcp_server/scraping/extractor.py
Line: 596-601

Comment:
**Rate-limited username is not added to `failed`**

When `RateLimitError` is caught, the current username is not appended to `failed` before breaking out of the loop. The return value only signals `rate_limited: True` but doesn't record which username triggered the stop, making it difficult for callers to resume from where processing halted.

```python
except RateLimitError:
    logger.warning("Rate limited during contact batch at %s", username)
    failed.append(username)  # record the username that triggered the stop
    rate_limited = True
    break
```

How can I resolve this? If you propose a fix, please make it concise.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed in beebf5e — added failed.append(username) before the break.

Args:
usernames: List of LinkedIn usernames to enrich.
chunk_size: Profiles per chunk before a long pause.
chunk_delay: Seconds to pause between chunks.
progress_cb: Optional async callback(completed, total) for progress.

Returns:
{contacts: [{username, first_name, last_name, email, phone,
headline, location, company, website, birthday,
profile_raw, contact_info_raw}],
total, failed, rate_limited, pages_visited}
"""
if chunk_size <= 0:
raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

contacts: list[dict[str, Any]] = []
failed: list[str] = []
pages_visited: list[str] = []
total = len(usernames)
rate_limited = False

for chunk_idx in range(0, total, chunk_size):
chunk = usernames[chunk_idx : chunk_idx + chunk_size]

for username in chunk:
profile_url = f"https://www.linkedin.com/in/{username}/"
contact_url = (
f"https://www.linkedin.com/in/{username}/overlay/contact-info/"
)

try:
# Scrape main profile page
profile_text = await self.extract_page(profile_url)
pages_visited.append(profile_url)

if profile_text == _RATE_LIMITED_MSG:
logger.warning(
"Soft rate limit on profile %s, skipping", username
)
failed.append(username)
await asyncio.sleep(_NAV_DELAY)
continue

# Scrape contact info overlay
contact_text = await self._extract_overlay(contact_url)
pages_visited.append(contact_url)

if contact_text == _RATE_LIMITED_MSG:
contact_text = (
"" # fall back to empty; parsed fields will be None
)

parsed = _parse_contact_record(profile_text, contact_text)
Comment on lines +1413 to +1433
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P0 extract_page / _extract_overlay called with wrong signature — every profile fails

Both extract_page and _extract_overlay have a required section_name: str second parameter (see lines 440–443 and 553–556 respectively). Calling them without it raises TypeError: extract_page() missing 1 required positional argument: 'section_name' / _extract_overlay() missing 1 required positional argument: 'section_name' for every iteration.

That TypeError is silently swallowed by the except Exception block, so every username ends up in failed and contacts is always empty — making extract_contact_details functionally broken.

Even after adding the missing argument, extract_page and _extract_overlay return ExtractedSection objects, not raw strings. The current comparisons to _RATE_LIMITED_MSG (e.g. if profile_text == _RATE_LIMITED_MSG) will always be False (comparing dataclass to str), and passing the objects directly to _parse_contact_record(profile_text, contact_text) would raise AttributeError: 'ExtractedSection' object has no attribute 'split'. The rest of the codebase consistently accesses .text (e.g. line 1126: if extracted.text and extracted.text != _RATE_LIMITED_MSG).

The fix requires both changes together:

                    # Scrape main profile page
                    extracted_profile = await self.extract_page(profile_url, section_name="profile")
                    pages_visited.append(profile_url)
                    profile_text = extracted_profile.text

                    if profile_text == _RATE_LIMITED_MSG:
                        logger.warning(
                            "Soft rate limit on profile %s, skipping", username
                        )
                        failed.append(username)
                        await asyncio.sleep(_NAV_DELAY)
                        continue

                    # Scrape contact info overlay
                    extracted_contact = await self._extract_overlay(contact_url, section_name="contact_info")
                    pages_visited.append(contact_url)
                    contact_text = extracted_contact.text

                    if contact_text == _RATE_LIMITED_MSG:
                        contact_text = ""  # fall back to empty; parsed fields will be None
Prompt To Fix With AI
This is a comment left during a code review.
Path: linkedin_mcp_server/scraping/extractor.py
Line: 1413-1433

Comment:
**`extract_page` / `_extract_overlay` called with wrong signature — every profile fails**

Both `extract_page` and `_extract_overlay` have a required `section_name: str` second parameter (see lines 440–443 and 553–556 respectively). Calling them without it raises `TypeError: extract_page() missing 1 required positional argument: 'section_name'` / `_extract_overlay() missing 1 required positional argument: 'section_name'` for every iteration.

That `TypeError` is silently swallowed by the `except Exception` block, so every username ends up in `failed` and `contacts` is always empty — making `extract_contact_details` functionally broken.

Even after adding the missing argument, `extract_page` and `_extract_overlay` return `ExtractedSection` objects, not raw strings. The current comparisons to `_RATE_LIMITED_MSG` (e.g. `if profile_text == _RATE_LIMITED_MSG`) will always be `False` (comparing dataclass to `str`), and passing the objects directly to `_parse_contact_record(profile_text, contact_text)` would raise `AttributeError: 'ExtractedSection' object has no attribute 'split'`. The rest of the codebase consistently accesses `.text` (e.g. line 1126: `if extracted.text and extracted.text != _RATE_LIMITED_MSG`).

The fix requires both changes together:

```python
                    # Scrape main profile page
                    extracted_profile = await self.extract_page(profile_url, section_name="profile")
                    pages_visited.append(profile_url)
                    profile_text = extracted_profile.text

                    if profile_text == _RATE_LIMITED_MSG:
                        logger.warning(
                            "Soft rate limit on profile %s, skipping", username
                        )
                        failed.append(username)
                        await asyncio.sleep(_NAV_DELAY)
                        continue

                    # Scrape contact info overlay
                    extracted_contact = await self._extract_overlay(contact_url, section_name="contact_info")
                    pages_visited.append(contact_url)
                    contact_text = extracted_contact.text

                    if contact_text == _RATE_LIMITED_MSG:
                        contact_text = ""  # fall back to empty; parsed fields will be None
```

How can I resolve this? If you propose a fix, please make it concise.

contacts.append(
{
"username": username,
**parsed,
"profile_raw": profile_text,
"contact_info_raw": contact_text,
}
)

except RateLimitError:
logger.warning("Rate limited during contact batch at %s", username)
failed.append(username)
rate_limited = True
break
except Exception as e:
logger.warning("Failed to scrape %s: %s", username, e)
failed.append(username)

# Brief delay between individual profiles
await asyncio.sleep(_NAV_DELAY)

if rate_limited:
break

# Report progress after each chunk
completed = min(chunk_idx + len(chunk), total)
if progress_cb:
await progress_cb(completed, total)

# Pause between chunks (skip after last chunk)
if chunk_idx + chunk_size < total:
logger.info(
"Chunk complete (%d/%d). Pausing %.0fs...",
completed,
total,
chunk_delay,
)
await asyncio.sleep(chunk_delay)

return {
"contacts": contacts,
"total": len(contacts),
"failed": failed,
"rate_limited": rate_limited,
"pages_visited": pages_visited,
}
2 changes: 2 additions & 0 deletions linkedin_mcp_server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
SequentialToolExecutionMiddleware,
)
from linkedin_mcp_server.tools.company import register_company_tools
from linkedin_mcp_server.tools.connections import register_connections_tools
from linkedin_mcp_server.tools.job import register_job_tools
from linkedin_mcp_server.tools.person import register_person_tools

Expand Down Expand Up @@ -58,6 +59,7 @@ def create_mcp_server() -> FastMCP:
register_person_tools(mcp)
register_company_tools(mcp)
register_job_tools(mcp)
register_connections_tools(mcp)

# Register session management tool
@mcp.tool(
Expand Down
Loading
Loading