Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 37 additions & 5 deletions linkedin_mcp_server/scraping/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,6 +649,7 @@ async def extract_page(
self,
url: str,
section_name: str,
max_scrolls: int | None = None,
) -> ExtractedSection:
"""Navigate to a URL, scroll to load lazy content, and extract innerText.

Expand All @@ -661,14 +662,14 @@ async def extract_page(
Returns empty string for unexpected non-domain failures (error isolation).
"""
try:
result = await self._extract_page_once(url, section_name)
result = await self._extract_page_once(url, section_name, max_scrolls)
if result.text != _RATE_LIMITED_MSG:
return result

# Retry once after backoff
logger.info("Retrying %s after %.0fs backoff", url, _RATE_LIMIT_RETRY_DELAY)
await asyncio.sleep(_RATE_LIMIT_RETRY_DELAY)
return await self._extract_page_once(url, section_name)
return await self._extract_page_once(url, section_name, max_scrolls)

except LinkedInScraperException:
raise
Expand All @@ -689,6 +690,7 @@ async def _extract_page_once(
self,
url: str,
section_name: str,
max_scrolls: int | None = None,
) -> ExtractedSection:
"""Single attempt to navigate, scroll, and extract innerText."""
await self._navigate_to_page(url)
Expand Down Expand Up @@ -755,11 +757,38 @@ async def _extract_page_once(
except PlaywrightTimeoutError:
logger.debug("Detail section content did not appear on %s", url)

# Detail pages paginate with a "Show more" / "Show all" button inside <main>
# rather than by scrolling. Click it until it disappears, is no longer visible,
# or the click budget (max_scrolls, default 5) runs out.
if is_details:
max_clicks = max_scrolls if max_scrolls is not None else 5
for i in range(max_clicks):
button = self._page.locator("main button").filter(
has_text=re.compile(r"^Show (more|all)\b", re.IGNORECASE)
)
try:
if await button.count() == 0:
logger.debug("No 'Show more' button after %d clicks", i)
break
target = button.first
if not await target.is_visible():
break
await target.scroll_into_view_if_needed(timeout=2000)
await target.click(timeout=2000)
await asyncio.sleep(1.0)
except PlaywrightTimeoutError:
logger.debug("Show more click timed out after %d clicks", i)
break
except Exception as e:
logger.debug("Show more click failed: %s", e)
break

# Scroll to trigger lazy loading
if is_activity:
await scroll_to_bottom(self._page, pause_time=1.0, max_scrolls=10)
scrolls = max_scrolls if max_scrolls is not None else 10
await scroll_to_bottom(self._page, pause_time=1.0, max_scrolls=scrolls)
else:
await scroll_to_bottom(self._page, pause_time=0.5, max_scrolls=5)
scrolls = max_scrolls if max_scrolls is not None else 5
await scroll_to_bottom(self._page, pause_time=0.5, max_scrolls=scrolls)

# Extract text from main content area
raw_result = await self._extract_root_content(["main"])
Expand Down Expand Up @@ -864,6 +893,7 @@ async def scrape_person(
username: str,
requested: set[str],
callbacks: ProgressCallback | None = None,
max_scrolls: int | None = None,
) -> dict[str, Any]:
"""Scrape a person profile with configurable sections.

Expand Down Expand Up @@ -900,7 +930,9 @@ async def scrape_person(
)
else:
extracted = await self.extract_page(
url, section_name=section_name
url,
section_name=section_name,
max_scrolls=max_scrolls,
)

if extracted.text and extracted.text != _RATE_LIMITED_MSG:
Expand Down
17 changes: 15 additions & 2 deletions linkedin_mcp_server/tools/person.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
"""

import logging
from typing import Any
from typing import Annotated, Any

from fastmcp import Context, FastMCP
from pydantic import Field

from linkedin_mcp_server.callbacks import MCPContextProgressCallback
from linkedin_mcp_server.constants import TOOL_TIMEOUT_SECONDS
Expand All @@ -34,6 +35,7 @@ async def get_person_profile(
linkedin_username: str,
ctx: Context,
sections: str | None = None,
max_scrolls: Annotated[int, Field(ge=1, le=50)] | None = None,
extractor: Any | None = None,
) -> dict[str, Any]:
"""
Expand All @@ -47,6 +49,14 @@ async def get_person_profile(
Available sections: experience, education, interests, honors, languages, certifications, skills, projects, contact_info, posts
Examples: "experience,education", "contact_info", "skills,projects", "honors,languages", "posts"
Default (None) scrapes only the main profile page.
max_scrolls: Maximum pagination attempts per section to load more content
(integer from 1 to 50). On detail sections (experience, certifications,
skills, etc.) this is the max number of "Show more" button clicks, and
it also caps the scroll-to-bottom pass that follows the clicks. On
activity/posts it is the max number of scroll-to-bottom iterations.
Applies to all sections in this call. Default (None) uses 5 for detail
sections and 10 for posts. Increase it when a profile has many items in
a section (e.g., max_scrolls=20 for 30+ certifications). To avoid
slowing down other sections, request heavy sections in a separate call.

Returns:
Dict with url, sections (name -> raw text), and optional references.
Expand All @@ -68,7 +78,10 @@ async def get_person_profile(

cb = MCPContextProgressCallback(ctx)
result = await extractor.scrape_person(
linkedin_username, requested, callbacks=cb
linkedin_username,
requested,
callbacks=cb,
max_scrolls=max_scrolls,
)

if unknown:
Expand Down
Loading
Loading