Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Changelog

## [1.14.16] - 2025-01-08

### Added
- Added `include_urls` and `exclude_urls` parameters to search methods
- Available in `search()`, `search_and_contents()`, `find_similar()`, and `find_similar_and_contents()` methods
- Both sync and async versions support these new parameters
- Wildcards are supported at the beginning or end of a URL pattern (e.g., `*/contact-us/*`, `*.ai`)
- Use `include_urls` to filter results to only URLs matching specified patterns
- Use `exclude_urls` to filter out results with URLs matching specified patterns

### Important Constraints
- `include_urls`/`exclude_urls` cannot be used together with `include_domains`/`exclude_domains` in the same request
- `include_urls` and `exclude_urls` cannot be used together in the same request

### Testing
- Added comprehensive test coverage for URL filtering functionality
- Unit tests for parameter validation in `tests/test_url_filters_unit.py`
- Integration tests requiring API key in `tests/test_search_api.py`
- Tests cover both synchronous and asynchronous implementations

### Examples
- Added `examples/url_filtering_example.py` demonstrating various URL filtering patterns
- Added `examples/company_research_url_filtering.py` showing practical company research workflows

### Example Usage
```python
# Include only contact pages
results = exa.search("AI startup", include_urls=["*/contact-us/*", "*/about/*"])

# Exclude blog and news pages
results = exa.search("machine learning", exclude_urls=["*/blog/*", "*/news/*"])

# Filter LinkedIn profiles
results = exa.find_similar(
"https://www.linkedin.com/in/example/",
include_urls=["www.linkedin.com/in/*"]
)
```
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,15 @@ exa = Exa(api_key="your-api-key")
# search with domain filters
results = exa.search("This is a Exa query:", include_domains=["www.cnn.com", "www.nytimes.com"])

# search with URL pattern filters
results = exa.search("AI law startup", include_urls=["*/contact-us/*", "*/about/*"])

# exclude certain URL patterns
results = exa.search("machine learning", exclude_urls=["*/blog/*", "*/news/*"])

# Note: include_urls/exclude_urls cannot be used together with include_domains/exclude_domains
# Also, include_urls and exclude_urls cannot be used together in the same request

# search and get text contents
results = exa.search_and_contents("This is a Exa query:")

Expand Down
106 changes: 91 additions & 15 deletions exa_py/api.py

Large diffs are not rendered by default.

112 changes: 112 additions & 0 deletions examples/company_research_url_filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
#!/usr/bin/env python3
"""
Example: Using URL filtering for efficient company research.

This example demonstrates how to use include_urls and exclude_urls to gather
specific types of information about companies, such as contact information,
team pages, and official announcements while filtering out noise.
"""

import os
from exa_py import Exa

# Initialize the Exa client; the API key must be supplied via the
# EXA_API_KEY environment variable (fail fast with a clear error otherwise).
api_key = os.environ.get("EXA_API_KEY")
if not api_key:
    raise ValueError("Please set EXA_API_KEY environment variable")

# Module-level client shared by all example functions below.
exa = Exa(api_key=api_key)


def research_company(company_name: str):
    """Research a company using targeted URL filtering.

    Prints three sections for *company_name*: official about/company
    pages, recent press coverage with short text previews, and
    individual LinkedIn profiles. Uses the module-level ``exa`` client.
    """
    print(f"\n=== Researching {company_name} ===\n")

    # 1. Official company pages — restrict hits to about/company style URLs.
    print("1. Finding official company information:")
    print("-" * 50)
    about_pages = exa.search(
        f"{company_name} official website",
        num_results=5,
        include_urls=["*/about/*", "*/about-us/*", "*/company/*", "*/who-we-are/*"],
    )
    print(f"Found {len(about_pages.results)} official pages")
    for hit in about_pages.results[:3]:
        print(f" - {hit.title}")
        print(f" {hit.url}\n")

    # 2. Press releases and announcements, fetched together with page text.
    print("\n2. Finding recent press releases:")
    print("-" * 50)
    press = exa.search_and_contents(
        f"{company_name} announcement",
        num_results=3,
        include_urls=["*/press/*", "*/news/*", "*/blog/*", "*/newsroom/*"],
        text={"max_characters": 300},
    )
    print(f"Found {len(press.results)} press releases")
    for hit in press.results:
        print(f" • {hit.title}")
        print(f" {hit.url}")
        if hit.text:
            # Trim long bodies down to a 150-character preview.
            if len(hit.text) > 150:
                preview = hit.text[:150] + "..."
            else:
                preview = hit.text
            print(f" Preview: {preview}\n")

    # 3. People at the company — only individual LinkedIn profile URLs.
    print("\n3. Finding team members on LinkedIn:")
    profiles = exa.search(
        f"{company_name} employees team",
        num_results=5,
        include_urls=["www.linkedin.com/in/*", "linkedin.com/in/*"],
    )
    print(f"Found {len(profiles.results)} LinkedIn profiles")
    for hit in profiles.results[:3]:
        print(f" - {hit.title}: {hit.url}")


def find_competitors(company_name: str, industry: str):
    """Find competitor companies using URL filtering.

    Builds a guessed homepage URL for *company_name*, asks Exa for
    similar sites, and excludes blog/news/wiki/LinkedIn noise.
    The *industry* argument only appears in the printed heading.
    """
    print(f"\n=== Finding competitors of {company_name} in {industry} ===\n")

    # Naive homepage guess: lowercase the name, drop spaces, assume .com.
    slug = company_name.lower().replace(' ', '')
    sample_url = f"https://www.{slug}.com"

    rivals = exa.find_similar(
        sample_url,
        num_results=10,
        exclude_urls=["*/blog/*", "*/news/*", "*/wiki/*", "*/linkedin.com/*"],
    )

    print(f"Found {len(rivals.results)} similar company websites:")
    for rank, hit in enumerate(rivals.results[:5], 1):
        print(f"{rank}. {hit.title}")
        print(f" {hit.url}")
        print()


def main():
    """Run company research examples."""
    print("=== Company Research with URL Filtering ===")
    print("This example shows how to efficiently research companies")
    print("by filtering URLs to find specific types of information.\n")

    # Example companies to research
    for company in ("OpenAI", "Anthropic"):
        research_company(company)

    # Find competitors example
    find_competitors("OpenAI", "AI research")

    # Closing summary of what URL filtering buys you.
    print("\n=== Summary ===")
    for tip in (
        "URL filtering helps you:",
        "• Find official company information (not third-party coverage)",
        "• Locate specific page types (contact, team, careers)",
        "• Exclude irrelevant content (blogs, news, wikis)",
        "• Focus your research on high-value pages",
        "• Save time by getting targeted results",
    ):
        print(tip)


if __name__ == "__main__":
    main()
138 changes: 138 additions & 0 deletions examples/url_filtering_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
"""
Example demonstrating URL filtering with include_urls and exclude_urls parameters.

This example shows how to use the new URL filtering capabilities in Exa API
to refine search results based on URL patterns.

IMPORTANT: When using include_urls or exclude_urls, you cannot use
include_domains or exclude_domains in the same request.
Also, include_urls and exclude_urls cannot be used together in the same request.
"""

import os

from exa_py import Exa


def _preview(text: str, limit: int) -> str:
    """Return *text* truncated to *limit* characters, with a trailing ellipsis."""
    return text[:limit] + "..." if len(text) > limit else text


def main():
    """Run all URL-filtering examples in sequence.

    Requires the EXA_API_KEY environment variable; every example makes a
    live API call. Wrapped in main() (fix: previously the whole script ran
    at import time, unlike the sibling company-research example).
    """
    # Initialize the Exa client; fail fast if the key is missing.
    api_key = os.environ.get("EXA_API_KEY")
    if not api_key:
        raise ValueError("Please set EXA_API_KEY environment variable")

    exa = Exa(api_key=api_key)

    print("=== Exa URL Filtering Examples ===\n")

    # Example 1: Find contact pages for AI startups
    print("1. Finding contact pages for AI startups:")
    print("-" * 50)
    results = exa.search(
        "AI startup artificial intelligence",
        num_results=5,
        include_urls=["*/contact/*", "*/contact-us/*", "*/about/contact/*"],
    )
    print(f"Found {len(results.results)} contact pages:")
    for i, result in enumerate(results.results[:3], 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        print()

    # Example 2: Exclude blog and news articles
    print("2. Searching for technical content, excluding blogs and news:")
    print("-" * 50)
    results = exa.search(
        "machine learning algorithms",
        num_results=5,
        exclude_urls=["*/blog/*", "*/news/*", "*/press/*"],
    )
    print(f"Found {len(results.results)} non-blog/news results:")
    for i, result in enumerate(results.results[:3], 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        print()

    # Example 3: Find LinkedIn profiles
    print("3. Finding LinkedIn profiles similar to a tech leader:")
    print("-" * 50)
    results = exa.find_similar(
        "https://www.linkedin.com/in/satyanadella/",
        num_results=5,
        include_urls=["www.linkedin.com/in/*", "linkedin.com/in/*"],
    )
    print(f"Found {len(results.results)} LinkedIn profiles:")
    for i, result in enumerate(results.results[:3], 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        print()

    # Example 4: Filter by domain extension
    print("4. Finding only .edu educational resources:")
    print("-" * 50)
    results = exa.search(
        "quantum computing introduction",
        num_results=5,
        include_urls=["*.edu/*"],
    )
    print(f"Found {len(results.results)} educational resources:")
    for i, result in enumerate(results.results[:3], 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        print()

    # Example 5: Finding product pages (using include_urls only)
    print("5. Finding product pages:")
    print("-" * 50)
    results = exa.search(
        "best laptop 2024",
        num_results=5,
        include_urls=["*/products/*", "*/shop/*", "*/store/*", "*/item/*"],
    )
    print(f"Found {len(results.results)} product pages:")
    for i, result in enumerate(results.results[:3], 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        print()

    # Example 6: Using with search_and_contents
    print("6. Getting content from specific page types:")
    print("-" * 50)
    results = exa.search_and_contents(
        "company mission statement",
        num_results=3,
        include_urls=["*/about/*", "*/about-us/*", "*/mission/*"],
        text={"max_characters": 500},
    )
    print(f"Found {len(results.results)} about/mission pages with content:")
    for i, result in enumerate(results.results, 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        if result.text:
            print(f" Preview: {_preview(result.text, 200)}")
        print()

    # Example 7: Finding similar pages with URL constraints (using include_urls only)
    print("7. Finding similar documentation pages:")
    print("-" * 50)
    results = exa.find_similar_and_contents(
        "https://docs.python.org/3/tutorial/",
        num_results=3,
        include_urls=["*/docs/*", "*/documentation/*", "*/guide/*", "*/tutorial/*"],
        text=True,
    )
    print(f"Found {len(results.results)} similar documentation pages:")
    for i, result in enumerate(results.results, 1):
        print(f"{i}. {result.title}")
        print(f" URL: {result.url}")
        print()

    print("\n=== Advanced Usage Tips ===")
    print("1. Wildcards (*) can be used at the beginning or end of patterns")
    print("2. Multiple patterns can be specified in a list")
    print("3. include_urls and exclude_urls CANNOT be used together in the same request")
    print("4. Patterns are case-sensitive")
    print("5. IMPORTANT: Cannot use include_urls/exclude_urls with include_domains/exclude_domains")
    print("6. Use these filters to:")
    print(" - Find specific page types (contact, about, product pages)")
    print(" - Filter by domain or subdomain patterns")
    print(" - Exclude unwanted content types (use exclude_urls separately)")
    print(" - Focus on specific platforms (LinkedIn, GitHub, etc.)")


if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ in-project = true

[project]
name = "exa-py"
version = "1.14.15"
version = "1.14.16"
description = "Python SDK for Exa API."
readme = "README.md"
requires-python = ">=3.9"
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name="exa_py",
version="1.14.15",
version="1.14.16",
description="Python SDK for Exa API.",
long_description_content_type="text/markdown",
long_description=open("README.md").read(),
Expand Down
Loading
Loading