Skip to content

Commit

Permalink
add proxies_per_source_limit setting
Browse files Browse the repository at this point in the history
  • Loading branch information
monosans committed Jan 10, 2025
1 parent a6045e8 commit 1e56188
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 2 deletions.
6 changes: 6 additions & 0 deletions config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,12 @@ timeout = 5
# The number of seconds to wait for the proxies to be downloaded from the source.
source_timeout = 15

# Maximum number of proxies from a single source.
# If a source returns more proxies than this value, it is assumed to
# contain a huge number of low-quality proxies and is skipped.
# Set to 0 to disable this limit.
proxies_per_source_limit = 100000

# Maximum concurrent connections.
# Don't be in a hurry to set high values.
# Make sure you have enough RAM first, gradually increasing the default value.
Expand Down
9 changes: 7 additions & 2 deletions proxy_scraper_checker/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ async def scrape_one(
progress: Progress,
proto: ProxyType,
session: ClientSession,
settings: Settings,
source: str,
storage: ProxyStorage,
task: TaskID,
Expand Down Expand Up @@ -59,10 +60,13 @@ async def scrape_one(
)
else:
counter.incr()
proxies = PROXY_REGEX.findall(text)
proxies = tuple(PROXY_REGEX.finditer(text))
if not proxies:
_logger.warning("%s | No proxies found", source)
elif len(proxies) <= 100_000: # noqa: PLR2004
elif (
settings.proxies_per_source_limit
and len(proxies) > settings.proxies_per_source_limit
):
_logger.warning(
"%s has too many proxies (%d), skipping", source, len(proxies)
)
Expand Down Expand Up @@ -112,6 +116,7 @@ async def scrape_all(
progress=progress,
proto=proto,
session=session,
settings=settings,
source=source,
storage=storage,
task=progress_tasks[proto],
Expand Down
4 changes: 4 additions & 0 deletions proxy_scraper_checker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,9 @@ class Settings:
)
output_path: Path = attrs.field(converter=Path)
output_txt: bool = attrs.field(validator=attrs.validators.instance_of(bool))
proxies_per_source_limit: int = attrs.field(
validator=attrs.validators.ge(0)
)
real_ip: str | None = attrs.field(
validator=attrs.validators.optional(attrs.validators.instance_of(str))
)
Expand Down Expand Up @@ -321,6 +324,7 @@ async def from_mapping(
output_json=cfg["output"]["json"],
output_path=output_path,
output_txt=cfg["output"]["txt"],
proxies_per_source_limit=cfg["proxies_per_source_limit"],
real_ip=real_ip,
semaphore=cfg["max_connections"],
sort_by_speed=cfg["sort_by_speed"],
Expand Down

0 comments on commit 1e56188

Please sign in to comment.