From 5e13063540309068e2068314f301f884c411b57d Mon Sep 17 00:00:00 2001
From: IfThingsThenStuff <105675059+IfThingsThenStuff@users.noreply.github.com>
Date: Wed, 25 Feb 2026 22:13:14 -0500
Subject: [PATCH 1/4] feat(saved-jobs): fix pagination, embed job IDs, and add progress reporting

- Fix wait_for_function positional arg bug (the arg= keyword is required)
- Switch pagination from the broken "Next" button to numbered page buttons (button[aria-label="Page N"]), which reliably trigger content updates
- Replace arbitrary asyncio.sleep() calls with DOM-based waiting via wait_for_function to detect new job links
- Embed a job ID summary in the section text so LLMs always surface the IDs
- Add on_progress callback for per-page progress reporting

Co-Authored-By: Claude Opus 4.6
---
 linkedin_mcp_server/scraping/extractor.py | 126 ++++++++++++++++++++++
 linkedin_mcp_server/tools/job.py          |  42 ++++++++
 tests/test_scraping.py                    | 101 +++++++++++++++++
 tests/test_tools.py                       |  25 +++++
 4 files changed, 294 insertions(+)

diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py
index 2a34a397..01bcc654 100644
--- a/linkedin_mcp_server/scraping/extractor.py
+++ b/linkedin_mcp_server/scraping/extractor.py
@@ -3,6 +3,7 @@
 import asyncio
 import logging
 import re
+from collections.abc import Awaitable, Callable
 from typing import Any
 from urllib.parse import quote_plus
 
@@ -384,6 +385,131 @@ async def search_jobs(
             "sections_requested": ["search_results"],
         }
 
+    _EXTRACT_JOB_IDS_JS = """() => {
+        const ids = [];
+        document.querySelectorAll('a[href*="/jobs/view/"]').forEach(a => {
+            const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/);
+            if (match && !ids.includes(match[1])) ids.push(match[1]);
+        });
+        return ids;
+    }"""
+
+    _EXTRACT_MAIN_TEXT_JS = """() => {
+        const main = document.querySelector('main');
+        return main ? main.innerText : document.body.innerText;
+    }"""
+
+    async def scrape_saved_jobs(
+        self,
+        max_pages: int = 10,
+        on_progress: Callable[[int, int, str], Awaitable[None]] | None = None,
+    ) -> dict[str, Any]:
+        """Scrape the user's saved/bookmarked jobs from the jobs tracker page.
+
+        Automatically paginates through all pages by clicking the "Next" button.
+        Extracts job IDs from link hrefs (``/jobs/view/<job_id>/``) since they are
+        not present in the page's innerText.
+
+        Args:
+            max_pages: Safety cap on pages to scrape (default 10).
+
+        Returns:
+            {url, sections: {name: text}, pages_visited, sections_requested,
+            job_ids: list[str]}
+        """
+        url = "https://www.linkedin.com/jobs-tracker/"
+        text = await self.extract_page(url)
+
+        all_text_parts: list[str] = []
+        all_job_ids: list[str] = []
+
+        if text:
+            all_text_parts.append(text)
+
+        # Collect job IDs from page 1.
+        page_ids: list[str] = await self._page.evaluate(self._EXTRACT_JOB_IDS_JS)
+        all_job_ids.extend(page_ids)
+        logger.info("Page 1: found %d job IDs", len(page_ids))
+
+        if on_progress:
+            await on_progress(1, max_pages, "Fetched saved jobs page 1")
+
+        # Paginate through remaining pages using numbered page buttons.
+        for page_num in range(2, max_pages + 1):
+            page_btn = self._page.locator(f'button[aria-label="Page {page_num}"]')
+            if not await page_btn.count():
+                logger.info(
+                    "No page %d button found — stopping at page %d",
+                    page_num,
+                    page_num - 1,
+                )
+                break
+
+            logger.info("Navigating to saved jobs page %d", page_num)
+            prev_ids = set(all_job_ids)
+            await page_btn.scroll_into_view_if_needed()
+            await page_btn.click()
+
+            # Wait for the DOM to reflect new job links (no sleep needed).
+ try: + await self._page.wait_for_function( + """(prevIds) => { + const links = document.querySelectorAll('a[href*="/jobs/view/"]'); + for (const a of links) { + const m = a.href.match(/\\/jobs\\/view\\/(\\d+)/); + if (m && !prevIds.includes(m[1])) return true; + } + return false; + }""", + arg=list(prev_ids), + timeout=15000, + ) + except PlaywrightTimeoutError: + logger.info("No new job IDs appeared on page %d — stopping", page_num) + break + + await scroll_to_bottom(self._page, pause_time=0.5, max_scrolls=3) + + raw = await self._page.evaluate(self._EXTRACT_MAIN_TEXT_JS) + if raw: + cleaned = strip_linkedin_noise(raw) + if cleaned: + all_text_parts.append(cleaned) + + page_ids = await self._page.evaluate(self._EXTRACT_JOB_IDS_JS) + new_ids = [jid for jid in page_ids if jid not in prev_ids] + logger.info("Page %d: found %d new job IDs", page_num, len(new_ids)) + if not new_ids: + break + all_job_ids.extend(new_ids) + + if on_progress: + await on_progress( + page_num, max_pages, f"Fetched saved jobs page {page_num}" + ) + + # Append a summary of job IDs so they are always visible in the text. + id_summary = "\n".join( + f"- Job ID: {jid} (https://www.linkedin.com/jobs/view/{jid}/)" + for jid in all_job_ids + ) + if id_summary: + all_text_parts.append(f"--- Saved Job IDs ---\n{id_summary}") + + sections: dict[str, str] = {} + if all_text_parts: + sections["saved_jobs"] = "\n\n".join(all_text_parts) + + logger.info("Total saved jobs found: %d across all pages", len(all_job_ids)) + + return { + "url": url, + "sections": sections, + "pages_visited": [url], + "sections_requested": ["saved_jobs"], + "job_ids": all_job_ids, + } + async def search_people( self, keywords: str, diff --git a/linkedin_mcp_server/tools/job.py b/linkedin_mcp_server/tools/job.py index 3eadf552..faf6ccb5 100644 --- a/linkedin_mcp_server/tools/job.py +++ b/linkedin_mcp_server/tools/job.py @@ -64,6 +64,48 @@ async def get_job_details(job_id: str, ctx: Context) -> dict[str, Any]: except Exception as e: return handle_tool_error(e, "get_job_details") + @mcp.tool( + annotations=ToolAnnotations( + title="Get Saved Jobs", + readOnlyHint=True, + destructiveHint=False, + openWorldHint=True, + ) + ) + async def get_saved_jobs(ctx: Context) -> dict[str, Any]: + """ + Get the user's saved/bookmarked jobs from LinkedIn's job tracker. + + Returns: + Dict with url, sections (name -> raw text), pages_visited, sections_requested, + and job_ids (list of LinkedIn job ID strings). + The LLM should parse the raw text to extract saved job listings. 
+ """ + try: + await ensure_authenticated() + + logger.info("Scraping saved jobs") + + browser = await get_or_create_browser() + extractor = LinkedInExtractor(browser.page) + + await ctx.report_progress( + progress=0, total=100, message="Fetching saved jobs" + ) + + async def _report(page: int, total: int, msg: str) -> None: + pct = min(int(page / total * 100), 99) + await ctx.report_progress(progress=pct, total=100, message=msg) + + result = await extractor.scrape_saved_jobs(on_progress=_report) + + await ctx.report_progress(progress=100, total=100, message="Complete") + + return result + + except Exception as e: + return handle_tool_error(e, "get_saved_jobs") + @mcp.tool( annotations=ToolAnnotations( title="Search Jobs", diff --git a/tests/test_scraping.py b/tests/test_scraping.py index 7493e153..f6c84a17 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -416,6 +416,107 @@ async def test_search_jobs(self, mock_page): assert result["sections_requested"] == ["search_results"] +class TestScrapeSavedJobs: + async def test_scrape_saved_jobs_single_page(self, mock_page): + """Single page of results — no Next button.""" + mock_page.evaluate = AsyncMock(return_value=["111", "222"]) + mock_next = MagicMock() + mock_next.count = AsyncMock(return_value=0) + mock_page.locator = MagicMock(return_value=mock_next) + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Saved Job 1\nSaved Job 2", + ): + result = await extractor.scrape_saved_jobs() + + assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + assert "saved_jobs" in result["sections"] + assert result["sections_requested"] == ["saved_jobs"] + assert result["job_ids"] == ["111", "222"] + assert "Job ID: 111" in result["sections"]["saved_jobs"] + assert "Job ID: 222" in result["sections"]["saved_jobs"] + + async def test_scrape_saved_jobs_paginates(self, mock_page): + """Clicks Next and collects job IDs from page 2.""" + # Page 1 returns IDs 111, 222; page 2 returns 111, 222, 333, 444 + call_count = 0 + + async def evaluate_side_effect(js, *args): + nonlocal call_count + call_count += 1 + if "jobs/view" in js: + # First call: page 1 IDs; second call: page 2 IDs + if call_count <= 2: + return ["111", "222"] + return ["333", "444"] + if "innerText" in js: + return "Page 2 jobs" + return None + + mock_page.evaluate = AsyncMock(side_effect=evaluate_side_effect) + + # Page button exists for page 2, not for page 3 + page_btn_click_count = 0 + mock_page_btn = MagicMock() + + async def page_btn_count(): + return 1 if page_btn_click_count == 0 else 0 + + mock_page_btn.count = AsyncMock(side_effect=page_btn_count) + mock_page_btn.scroll_into_view_if_needed = AsyncMock() + + async def page_btn_click(): + nonlocal page_btn_click_count + page_btn_click_count += 1 + + mock_page_btn.click = AsyncMock(side_effect=page_btn_click) + mock_page.locator = MagicMock(return_value=mock_page_btn) + mock_page.wait_for_function = AsyncMock() + + extractor = LinkedInExtractor(mock_page) + with ( + patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Page 1 jobs", + ), + patch( + "linkedin_mcp_server.scraping.extractor.scroll_to_bottom", + new_callable=AsyncMock, + ), + ): + result = await extractor.scrape_saved_jobs() + + assert result["job_ids"] == ["111", "222", "333", "444"] + assert "Page 1 jobs" in result["sections"]["saved_jobs"] + assert "Page 2 jobs" in result["sections"]["saved_jobs"] + for jid in ["111", "222", "333", 
"444"]: + assert f"Job ID: {jid}" in result["sections"]["saved_jobs"] + + async def test_scrape_saved_jobs_empty(self, mock_page): + mock_page.evaluate = AsyncMock(return_value=[]) + mock_next = MagicMock() + mock_next.count = AsyncMock(return_value=0) + mock_page.locator = MagicMock(return_value=mock_next) + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="", + ): + result = await extractor.scrape_saved_jobs() + + assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + assert result["sections"] == {} + assert result["sections_requested"] == ["saved_jobs"] + assert result["job_ids"] == [] + + class TestStripLinkedInNoise: def test_strips_footer(self): text = "Bill Gates\nChair, Gates Foundation\n\nAbout\nAccessibility\nTalent Solutions\nCareers" diff --git a/tests/test_tools.py b/tests/test_tools.py index 9f0f1b7f..b2277d41 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -41,6 +41,7 @@ def _make_mock_extractor(scrape_result: dict) -> MagicMock: mock.scrape_job = AsyncMock(return_value=scrape_result) mock.search_jobs = AsyncMock(return_value=scrape_result) mock.search_people = AsyncMock(return_value=scrape_result) + mock.scrape_saved_jobs = AsyncMock(return_value=scrape_result) mock.extract_page = AsyncMock(return_value="some text") return mock @@ -223,6 +224,30 @@ async def test_get_job_details(self, mock_context, patch_tool_deps, monkeypatch) result = await tool_fn("12345", mock_context) assert "job_posting" in result["sections"] + async def test_get_saved_jobs(self, mock_context, patch_tool_deps, monkeypatch): + expected = { + "url": "https://www.linkedin.com/jobs-tracker/", + "sections": {"saved_jobs": "Saved Job 1\nSaved Job 2"}, + "pages_visited": ["https://www.linkedin.com/jobs-tracker/"], + "sections_requested": ["saved_jobs"], + "job_ids": ["111", "222"], + } + mock_extractor = _make_mock_extractor(expected) + monkeypatch.setattr( + "linkedin_mcp_server.tools.job.LinkedInExtractor", + lambda *a, **kw: mock_extractor, + ) + + from linkedin_mcp_server.tools.job import register_job_tools + + mcp = FastMCP("test") + register_job_tools(mcp) + + tool_fn = await get_tool_fn(mcp, "get_saved_jobs") + result = await tool_fn(mock_context) + assert "saved_jobs" in result["sections"] + assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + async def test_search_jobs(self, mock_context, patch_tool_deps, monkeypatch): expected = { "url": "https://www.linkedin.com/jobs/search/?keywords=python", From e7217f0d5cddea5ccf524f7647a703ae4a6bd275 Mon Sep 17 00:00:00 2001 From: IfThingsThenStuff <105675059+IfThingsThenStuff@users.noreply.github.com> Date: Wed, 25 Feb 2026 22:19:35 -0500 Subject: [PATCH 2/4] fix(saved-jobs): use actual page count for progress reporting Detect total pages from pagination buttons on the page instead of using max_pages (10), so progress reports reflect reality (1/2, 2/2 instead of 1/10, 2/10). Co-Authored-By: Claude Opus 4.6 --- linkedin_mcp_server/scraping/extractor.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py index 01bcc654..435a9569 100644 --- a/linkedin_mcp_server/scraping/extractor.py +++ b/linkedin_mcp_server/scraping/extractor.py @@ -431,8 +431,13 @@ async def scrape_saved_jobs( all_job_ids.extend(page_ids) logger.info("Page 1: found %d job IDs", len(page_ids)) + # Determine total pages from pagination buttons (10 jobs per page). 
+        page_buttons = self._page.locator('button[aria-label^="Page "]')
+        total_pages = max(await page_buttons.count(), 1)
+        logger.info("Total pages detected: %d", total_pages)
+
         if on_progress:
-            await on_progress(1, max_pages, "Fetched saved jobs page 1")
+            await on_progress(1, total_pages, "Fetched saved jobs page 1")
 
         # Paginate through remaining pages using numbered page buttons.
         for page_num in range(2, max_pages + 1):
@@ -485,7 +490,7 @@ async def scrape_saved_jobs(
 
         if on_progress:
             await on_progress(
-                page_num, max_pages, f"Fetched saved jobs page {page_num}"
+                page_num, total_pages, f"Fetched saved jobs page {page_num}"
             )
 
         # Append a summary of job IDs so they are always visible in the text.

From 63f455fd33d030a7fe72067186b1e22c630ec898 Mon Sep 17 00:00:00 2001
From: IfThingsThenStuff <105675059+IfThingsThenStuff@users.noreply.github.com>
Date: Wed, 25 Feb 2026 22:47:45 -0500
Subject: [PATCH 3/4] fix(saved-jobs): cap total_pages, add nav delay, use Set for O(1) lookups, and add tests

Address review findings: cap total_pages with max_pages to fix
misleading progress percentages, add _NAV_DELAY between page clicks
for rate-limit safety, convert JS prevIds.includes() to Set.has() for
O(1) lookups, guard against division by zero in _report, fix docstring
inaccuracies, and add 5 targeted tests covering progress callbacks,
graceful stop on timeout, the max_pages cap, and session-expired error
handling.

Co-Authored-By: Claude Opus 4.6
---
 linkedin_mcp_server/scraping/extractor.py | 14 +++--
 linkedin_mcp_server/tools/job.py          |  2 +-
 tests/test_scraping.py                    | 77 +++++++++++++++++++++--
 tests/test_tools.py                       | 17 +++++
 4 files changed, 99 insertions(+), 11 deletions(-)

diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py
index 435a9569..cb3fd09a 100644
--- a/linkedin_mcp_server/scraping/extractor.py
+++ b/linkedin_mcp_server/scraping/extractor.py
@@ -406,12 +406,14 @@ async def scrape_saved_jobs(
     ) -> dict[str, Any]:
         """Scrape the user's saved/bookmarked jobs from the jobs tracker page.
 
-        Automatically paginates through all pages by clicking the "Next" button.
+        Automatically paginates through all pages using numbered page buttons.
         Extracts job IDs from link hrefs (``/jobs/view/<job_id>/``) since they are
         not present in the page's innerText.
 
         Args:
             max_pages: Safety cap on pages to scrape (default 10).
+            on_progress: Optional async callback ``(page, total, message)``
+                invoked after each page is scraped.
 
         Returns:
             {url, sections: {name: text}, pages_visited, sections_requested,
@@ -433,7 +435,7 @@ async def scrape_saved_jobs(
 
         # Determine total pages from pagination buttons (10 jobs per page).
         page_buttons = self._page.locator('button[aria-label^="Page "]')
-        total_pages = max(await page_buttons.count(), 1)
+        total_pages = min(max(await page_buttons.count(), 1), max_pages)
         logger.info("Total pages detected: %d", total_pages)
 
         if on_progress:
@@ -454,15 +456,17 @@ async def scrape_saved_jobs(
             prev_ids = set(all_job_ids)
             await page_btn.scroll_into_view_if_needed()
             await page_btn.click()
+            await asyncio.sleep(_NAV_DELAY)
 
-            # Wait for the DOM to reflect new job links (no sleep needed).
+            # Wait for the DOM to reflect new job links.
try: await self._page.wait_for_function( """(prevIds) => { + const prev = new Set(prevIds); const links = document.querySelectorAll('a[href*="/jobs/view/"]'); for (const a of links) { - const m = a.href.match(/\\/jobs\\/view\\/(\\d+)/); - if (m && !prevIds.includes(m[1])) return true; + const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/); + if (match && !prev.has(match[1])) return true; } return false; }""", diff --git a/linkedin_mcp_server/tools/job.py b/linkedin_mcp_server/tools/job.py index faf6ccb5..a00ad5b0 100644 --- a/linkedin_mcp_server/tools/job.py +++ b/linkedin_mcp_server/tools/job.py @@ -94,7 +94,7 @@ async def get_saved_jobs(ctx: Context) -> dict[str, Any]: ) async def _report(page: int, total: int, msg: str) -> None: - pct = min(int(page / total * 100), 99) + pct = min(int(page / max(total, 1) * 100), 99) await ctx.report_progress(progress=pct, total=100, message=msg) result = await extractor.scrape_saved_jobs(on_progress=_report) diff --git a/tests/test_scraping.py b/tests/test_scraping.py index f6c84a17..32a8cc5d 100644 --- a/tests/test_scraping.py +++ b/tests/test_scraping.py @@ -418,11 +418,12 @@ async def test_search_jobs(self, mock_page): class TestScrapeSavedJobs: async def test_scrape_saved_jobs_single_page(self, mock_page): - """Single page of results — no Next button.""" + """Single page of results — no Next button. Progress callback fires.""" mock_page.evaluate = AsyncMock(return_value=["111", "222"]) mock_next = MagicMock() mock_next.count = AsyncMock(return_value=0) mock_page.locator = MagicMock(return_value=mock_next) + on_progress = AsyncMock() extractor = LinkedInExtractor(mock_page) with patch.object( extractor, @@ -430,7 +431,7 @@ async def test_scrape_saved_jobs_single_page(self, mock_page): new_callable=AsyncMock, return_value="Saved Job 1\nSaved Job 2", ): - result = await extractor.scrape_saved_jobs() + result = await extractor.scrape_saved_jobs(on_progress=on_progress) assert result["url"] == "https://www.linkedin.com/jobs-tracker/" assert "saved_jobs" in result["sections"] @@ -438,10 +439,11 @@ async def test_scrape_saved_jobs_single_page(self, mock_page): assert result["job_ids"] == ["111", "222"] assert "Job ID: 111" in result["sections"]["saved_jobs"] assert "Job ID: 222" in result["sections"]["saved_jobs"] + on_progress.assert_awaited_once_with(1, 1, "Fetched saved jobs page 1") async def test_scrape_saved_jobs_paginates(self, mock_page): - """Clicks Next and collects job IDs from page 2.""" - # Page 1 returns IDs 111, 222; page 2 returns 111, 222, 333, 444 + """Clicks page buttons, collects IDs, fires progress, caps total_pages.""" + # Page 1 returns IDs 111, 222; page 2 returns 333, 444 call_count = 0 async def evaluate_side_effect(js, *args): @@ -475,6 +477,7 @@ async def page_btn_click(): mock_page_btn.click = AsyncMock(side_effect=page_btn_click) mock_page.locator = MagicMock(return_value=mock_page_btn) mock_page.wait_for_function = AsyncMock() + on_progress = AsyncMock() extractor = LinkedInExtractor(mock_page) with ( @@ -488,14 +491,78 @@ async def page_btn_click(): "linkedin_mcp_server.scraping.extractor.scroll_to_bottom", new_callable=AsyncMock, ), + patch( + "linkedin_mcp_server.scraping.extractor.asyncio.sleep", + new_callable=AsyncMock, + ), ): - result = await extractor.scrape_saved_jobs() + result = await extractor.scrape_saved_jobs(on_progress=on_progress) assert result["job_ids"] == ["111", "222", "333", "444"] assert "Page 1 jobs" in result["sections"]["saved_jobs"] assert "Page 2 jobs" in result["sections"]["saved_jobs"] for jid 
in ["111", "222", "333", "444"]: assert f"Job ID: {jid}" in result["sections"]["saved_jobs"] + # Progress was reported for both pages + assert on_progress.await_count == 2 + + async def test_scrape_saved_jobs_timeout_stops_gracefully(self, mock_page): + """PlaywrightTimeoutError on page 2 returns page 1 results only.""" + from patchright.async_api import TimeoutError as PlaywrightTimeoutError + + mock_page.evaluate = AsyncMock(return_value=["111", "222"]) + + mock_page_btn = MagicMock() + mock_page_btn.count = AsyncMock(return_value=1) + mock_page_btn.scroll_into_view_if_needed = AsyncMock() + mock_page_btn.click = AsyncMock() + mock_page.locator = MagicMock(return_value=mock_page_btn) + mock_page.wait_for_function = AsyncMock( + side_effect=PlaywrightTimeoutError("Timeout") + ) + + extractor = LinkedInExtractor(mock_page) + with ( + patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Page 1 jobs", + ), + patch( + "linkedin_mcp_server.scraping.extractor.asyncio.sleep", + new_callable=AsyncMock, + ), + ): + result = await extractor.scrape_saved_jobs() + + assert result["job_ids"] == ["111", "222"] + assert "Job ID: 111" in result["sections"]["saved_jobs"] + assert "Job ID: 222" in result["sections"]["saved_jobs"] + + async def test_scrape_saved_jobs_stops_at_max_pages_despite_more_buttons( + self, mock_page + ): + """max_pages=1 stops after page 1 even if more buttons exist.""" + mock_page.evaluate = AsyncMock(return_value=["111", "222"]) + + # Simulate page buttons existing (count=3) but max_pages=1 + mock_page_btn = MagicMock() + mock_page_btn.count = AsyncMock(return_value=3) + mock_page.locator = MagicMock(return_value=mock_page_btn) + + extractor = LinkedInExtractor(mock_page) + with patch.object( + extractor, + "extract_page", + new_callable=AsyncMock, + return_value="Page 1 jobs", + ): + result = await extractor.scrape_saved_jobs(max_pages=1) + + assert result["job_ids"] == ["111", "222"] + # click should never have been called (loop range(2, 2) is empty) + mock_page_btn.click.assert_not_called() async def test_scrape_saved_jobs_empty(self, mock_page): mock_page.evaluate = AsyncMock(return_value=[]) diff --git a/tests/test_tools.py b/tests/test_tools.py index b2277d41..26a33e88 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -248,6 +248,23 @@ async def test_get_saved_jobs(self, mock_context, patch_tool_deps, monkeypatch): assert "saved_jobs" in result["sections"] assert result["url"] == "https://www.linkedin.com/jobs-tracker/" + async def test_get_saved_jobs_error(self, mock_context, monkeypatch): + from linkedin_mcp_server.exceptions import SessionExpiredError + + monkeypatch.setattr( + "linkedin_mcp_server.tools.job.ensure_authenticated", + AsyncMock(side_effect=SessionExpiredError()), + ) + + from linkedin_mcp_server.tools.job import register_job_tools + + mcp = FastMCP("test") + register_job_tools(mcp) + + tool_fn = await get_tool_fn(mcp, "get_saved_jobs") + result = await tool_fn(mock_context) + assert result["error"] == "session_expired" + async def test_search_jobs(self, mock_context, patch_tool_deps, monkeypatch): expected = { "url": "https://www.linkedin.com/jobs/search/?keywords=python", From 5e6871791a75fde5d5f407536d8858df16c0f102 Mon Sep 17 00:00:00 2001 From: IfThingsThenStuff <105675059+IfThingsThenStuff@users.noreply.github.com> Date: Wed, 25 Feb 2026 23:12:39 -0500 Subject: [PATCH 4/4] docs(saved-jobs): add docs, expose max_pages, use Set in JS dedup Address Greptile review: use Set for O(1) dedup in 
_EXTRACT_JOB_IDS_JS, expose max_pages parameter on get_saved_jobs MCP tool, and document the new tool in AGENTS.md, README.md, and docs/docker-hub.md. Co-Authored-By: Claude Opus 4.6 --- AGENTS.md | 1 + README.md | 1 + docs/docker-hub.md | 1 + linkedin_mcp_server/scraping/extractor.py | 3 ++- linkedin_mcp_server/tools/job.py | 11 ++++++++--- 5 files changed, 13 insertions(+), 4 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 94d20690..1317f588 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -60,6 +60,7 @@ This is a **LinkedIn MCP (Model Context Protocol) Server** that enables AI assis | `get_company_posts` | Get recent posts from company feed | | `get_job_details` | Get job posting details | | `search_jobs` | Search jobs by keywords and location | +| `get_saved_jobs` | Get saved/bookmarked jobs from the job tracker (paginated, optional `max_pages`) | | `close_session` | Close browser session and clean up resources | | `search_people` | Search for people by keywords and location | diff --git a/README.md b/README.md index 6d082455..6c00e116 100644 --- a/README.md +++ b/README.md @@ -46,6 +46,7 @@ What has Anthropic been posting about recently? https://www.linkedin.com/company | `search_jobs` | Search for jobs with keywords and location filters | Working | | `search_people` | Search for people by keywords and location | Working | | `get_job_details` | Get detailed information about a specific job posting | Working | +| `get_saved_jobs` | Get saved/bookmarked jobs from your LinkedIn job tracker | Working | | `close_session` | Close browser session and clean up resources | Working | > [!IMPORTANT] diff --git a/docs/docker-hub.md b/docs/docker-hub.md index e122abc5..a4c21a94 100644 --- a/docs/docker-hub.md +++ b/docs/docker-hub.md @@ -8,6 +8,7 @@ A Model Context Protocol (MCP) server that connects AI assistants to LinkedIn. A - **Company Profiles**: Extract comprehensive company data - **Job Details**: Retrieve job posting information - **Job Search**: Search for jobs with keywords and location filters +- **Saved Jobs**: Get saved/bookmarked jobs from your LinkedIn job tracker - **People Search**: Search for people by keywords and location - **Company Posts**: Get recent posts from a company's LinkedIn feed diff --git a/linkedin_mcp_server/scraping/extractor.py b/linkedin_mcp_server/scraping/extractor.py index cb3fd09a..92842144 100644 --- a/linkedin_mcp_server/scraping/extractor.py +++ b/linkedin_mcp_server/scraping/extractor.py @@ -386,10 +386,11 @@ async def search_jobs( } _EXTRACT_JOB_IDS_JS = """() => { + const seen = new Set(); const ids = []; document.querySelectorAll('a[href*="/jobs/view/"]').forEach(a => { const match = a.href.match(/\\/jobs\\/view\\/(\\d+)/); - if (match && !ids.includes(match[1])) ids.push(match[1]); + if (match && !seen.has(match[1])) { seen.add(match[1]); ids.push(match[1]); } }); return ids; }""" diff --git a/linkedin_mcp_server/tools/job.py b/linkedin_mcp_server/tools/job.py index a00ad5b0..b45e4a51 100644 --- a/linkedin_mcp_server/tools/job.py +++ b/linkedin_mcp_server/tools/job.py @@ -72,10 +72,13 @@ async def get_job_details(job_id: str, ctx: Context) -> dict[str, Any]: openWorldHint=True, ) ) - async def get_saved_jobs(ctx: Context) -> dict[str, Any]: + async def get_saved_jobs(ctx: Context, max_pages: int = 10) -> dict[str, Any]: """ Get the user's saved/bookmarked jobs from LinkedIn's job tracker. + Args: + max_pages: Maximum number of pages to scrape (default 10, ~10 jobs/page). 
+ Returns: Dict with url, sections (name -> raw text), pages_visited, sections_requested, and job_ids (list of LinkedIn job ID strings). @@ -84,7 +87,7 @@ async def get_saved_jobs(ctx: Context) -> dict[str, Any]: try: await ensure_authenticated() - logger.info("Scraping saved jobs") + logger.info("Scraping saved jobs (max_pages=%d)", max_pages) browser = await get_or_create_browser() extractor = LinkedInExtractor(browser.page) @@ -97,7 +100,9 @@ async def _report(page: int, total: int, msg: str) -> None: pct = min(int(page / max(total, 1) * 100), 99) await ctx.report_progress(progress=pct, total=100, message=msg) - result = await extractor.scrape_saved_jobs(on_progress=_report) + result = await extractor.scrape_saved_jobs( + max_pages=max_pages, on_progress=_report + ) await ctx.report_progress(progress=100, total=100, message="Complete")
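A minimal, self-contained sketch of the DOM-based pagination wait these patches converge on, assuming patchright's Playwright-compatible async API (the import mirrors the one used in tests/test_scraping.py above). The helper name wait_for_new_job_links and the module-level constant are illustrative, not part of the patches:

from patchright.async_api import TimeoutError as PlaywrightTimeoutError

# JS predicate: true once any /jobs/view/<id> link carries an ID that is
# not in prevIds (same expression as in PATCH 3/4, with Set-based lookup).
_NEW_JOB_LINK_JS = """(prevIds) => {
    const prev = new Set(prevIds);
    for (const a of document.querySelectorAll('a[href*="/jobs/view/"]')) {
        const m = a.href.match(/\\/jobs\\/view\\/(\\d+)/);
        if (m && !prev.has(m[1])) return true;
    }
    return false;
}"""

async def wait_for_new_job_links(page, prev_ids: set[str], timeout_ms: int = 15000) -> bool:
    """Return True once an unseen job ID appears in the DOM, False on timeout."""
    try:
        # The JS argument must go through the keyword-only arg= parameter;
        # passing it positionally is the wait_for_function bug PATCH 1/4 fixes.
        await page.wait_for_function(
            _NEW_JOB_LINK_JS, arg=list(prev_ids), timeout=timeout_ms
        )
        return True
    except PlaywrightTimeoutError:
        return False

A caller clicks a numbered page button, then stops paginating as soon as this returns False, which is exactly how scrape_saved_jobs treats a wait_for_function timeout. For the progress math in _report: with total_pages = 2, page 1 reports 50% and page 2 reports 99% (capped), with 100% sent only after scraping completes.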