diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 000000000..8a7f725b0 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,42 @@ +name: CI + +on: + pull_request: + push: + branches: [ master ] + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + - run: pip install ruff==0.6.9 + - run: ruff check . + + tests: + strategy: + matrix: + python-version: ['3.9','3.10','3.11','3.12'] + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + cache: 'pip' + - run: pip install -e .[dev] + - run: pytest -q -k "not online" + + diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 000000000..fe8a478ed --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,16 @@ +name: Docker Build + +on: + push: + branches: [ master, rc ] + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - run: docker build -t sherlock:test . 
+ + diff --git a/.github/workflows/nightly-online.yml b/.github/workflows/nightly-online.yml new file mode 100644 index 000000000..b5f8ef2f4 --- /dev/null +++ b/.github/workflows/nightly-online.yml @@ -0,0 +1,22 @@ +name: Nightly Online Tests + +on: + schedule: + - cron: '0 3 * * *' + workflow_dispatch: + +jobs: + online: + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + cache: 'pip' + - run: pip install -e .[dev] + - run: pytest -q -m online --maxfail=1 + + diff --git a/docs/README.md b/docs/README.md index af9011092..bc6e3413d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -48,6 +48,15 @@ sherlock user1 user2 user3 ``` Accounts found will be stored in an individual text file with the corresponding username (e.g ```user123.txt```). +Use `--no-txt` to skip creating the `.txt` file. + +### Output options + +- **Disable txt output**: `--no-txt` +- **Write to a specific file (single user)**: `--output FILE` +- **Write per-user files into a folder (multiple users)**: `--folderoutput DIR` +- **Export CSV**: `--csv` (creates `username.csv`) +- **Export Excel**: `--xlsx` (creates `username.xlsx`) ```console $ sherlock --help @@ -55,7 +64,7 @@ usage: sherlock [-h] [--version] [--verbose] [--folderoutput FOLDEROUTPUT] [--output OUTPUT] [--tor] [--unique-tor] [--csv] [--xlsx] [--site SITE_NAME] [--proxy PROXY_URL] [--json JSON_FILE] [--timeout TIMEOUT] [--print-all] [--print-found] [--no-color] - [--browse] [--local] [--nsfw] + [--browse] [--local] [--nsfw] [--no-txt] [--ignore-exclusions] USERNAMES [USERNAMES ...] Sherlock: Find Usernames Across Social Networks (Version 0.14.3) @@ -96,6 +105,8 @@ optional arguments: --browse, -b Browse to all results on default browser. --local, -l Force the use of the local data.json file. --nsfw Include checking of NSFW sites from default list. 
# Proxy URL schemes accepted by _is_valid_proxy_url.
_PROXY_SCHEMES = frozenset({"http", "https", "socks4", "socks5", "socks5h"})


def _mount_session_with_retries(session: "requests.Session") -> "requests.Session":
    """Configure retries, backoff, and connection pooling on a requests session.

    Args:
        session: Session to configure. It is modified in place.

    Returns:
        The same session object that was passed in.
    """
    retry = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
        # NOTE(review): POST is not idempotent, so urllib3 leaves it out of its
        # default retry set. It is kept here unchanged -- confirm that every
        # POST sent through this session is a read-only probe.
        allowed_methods=["HEAD", "GET", "OPTIONS", "POST", "PUT"],
        # Hand the last response back to the caller instead of raising once
        # the retries for a status in status_forcelist are exhausted.
        raise_on_status=False,
    )
    adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100, max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session


def _is_valid_proxy_url(proxy_url: str) -> bool:
    """Return True if *proxy_url* looks like a usable proxy URL.

    A URL is accepted when it has one of the schemes in ``_PROXY_SCHEMES``,
    a non-empty host, and either no port or a numeric port in range.
    Malformed input yields False rather than an exception.

    Args:
        proxy_url: Proxy URL as given by the user, e.g.
            ``socks5://127.0.0.1:1080``.

    Returns:
        True when the URL passes the checks above, False otherwise.
    """
    from urllib.parse import urlparse

    if not isinstance(proxy_url, str):
        return False

    try:
        parsed = urlparse(proxy_url)
        # ``netloc`` alone is not enough: ":8080" and "user@" are non-empty
        # netlocs without a host. Reading ``port`` raises ValueError for a
        # non-numeric or out-of-range port, and urlparse itself raises
        # ValueError for malformed IPv6 literals.
        host, _port = parsed.hostname, parsed.port
    except ValueError:
        return False

    return parsed.scheme in _PROXY_SCHEMES and bool(host)
# Marked ``online`` because it runs a real CLI sweep; the CI workflows split
# test runs on this marker (``-k "not online"`` / ``-m online``).
@pytest.mark.online
def test_no_txt_flag(tmp_path, monkeypatch):
    """Ensure that --no-txt prevents creating the txt output file."""
    username = "totallyfakeuserforunittest"

    # Run inside a temporary working directory so no output lands in the
    # repository; monkeypatch restores the previous directory afterwards.
    monkeypatch.chdir(tmp_path)

    # Errors are deliberately not swallowed: if the CLI rejected --no-txt or
    # crashed, no txt file would exist either and the assertion below would
    # pass without having tested anything.
    Interactives.run_cli(f"--no-txt {username}")

    assert not (tmp_path / f"{username}.txt").exists()
urghrtuight --egiotr',