#!/usr/bin/env python3
# encoding: UTF-8

"""
    This file is part of MailMiner
    Copyright (C) 2024 @VirajMadhu
    https://github.com/VirajMadhu/MailMiner

    MailMiner is a robust Python tool designed for efficiently extracting
    email addresses from websites. You can input a list of URLs, and
    MailMiner will dig through each site, uncovering unique email
    addresses quickly. Perfect for marketers, researchers, and anyone in
    need of targeted email collection!

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    For more details, see the 'LICENSE' file for copying permission.
"""
__author__ = "VirajMadhu"
__copyright__ = "Copyright (C) 2024 @VirajMadhu"
__credits__ = ["VirajMadhu"]
__license__ = "GPLv3"
__version__ = "1.0.0"
__maintainer__ = "VirajMadhu"

################################

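# Usage sketch (assumes this file is saved as mailminer.py; the urls.txt /
# emails.txt filenames are the ones read and written in main() below):
#   $ printf 'example.com\n' > urls.txt
#   $ python3 mailminer.py
# Extracted addresses are appended to emails.txt, one per line.
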
import re
import time
# urllib.error is imported explicitly so the HTTPError/URLError handlers
# below do not depend on urllib.request pulling it in as a side effect.
import urllib.error
import urllib.request

# Email regex pattern
emailRegex = re.compile(r'''
    [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
''', re.VERBOSE)

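# Quick sanity check (illustrative, not part of the original script):
#   emailRegex.findall("contact: a.b+c@example.co.uk or x@sub-1.example.org")
# returns ['a.b+c@example.co.uk', 'x@sub-1.example.org']; the trailing class
# [a-zA-Z0-9-.]+ accepts dots, which is what lets multi-level TLDs match.
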
# Extract emails from page text
def extract_emails_from_text(text, email_file):
    extracted_emails = set(emailRegex.findall(text))
    print(f"\tNumber of Emails Found: {len(extracted_emails)}")
    for email in extracted_emails:
        email_file.write(email + "\n")

# Read HTML page content
def fetch_html_content(url, email_file, index):
    start_time = time.time()
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request) as response:
            page_content = response.read().decode('utf-8', errors='ignore')
            print(f"{index}. {url}\tFetched in: {time.time() - start_time:.2f} seconds")
            extract_emails_from_text(page_content, email_file)
    except urllib.error.HTTPError as err:
        handle_http_error(url, err, email_file, index)
    except urllib.error.URLError as err:
        print(f"URLError for {url}: {err}")
    except Exception as e:
        print(f"An error occurred with {url}: {e}")

# Handle HTTP errors; on a 404, fall back to Google's cached copy once
def handle_http_error(url, error, email_file, index):
    print(f"HTTPError for {url}: {error}")
    # Only retry plain URLs: without this guard, a 404 on the cached URL
    # would recurse into building a cache-of-a-cache URL indefinitely.
    if error.code == 404 and 'webcache.googleusercontent.com' not in url:
        cached_url = f'http://webcache.googleusercontent.com/search?q=cache:{url}'
        print(f"Trying cached version for {url}")
        try:
            fetch_html_content(cached_url, email_file, index)
        except Exception as e:
            print(f"Failed to fetch cached version for {url}: {e}")

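# A standalone invocation, for illustration (filenames here are hypothetical;
# the script itself drives this from main() below):
#   with open("emails.txt", "a") as out:
#       fetch_html_content("http://example.com", out, 1)
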
# Main function
def main():
    start_time = time.time()
    url_found = False

    with open("urls.txt", 'r') as url_file, open("emails.txt", 'a') as email_file:
        for i, url_link in enumerate(url_file, start=1):
            url_link = url_link.strip().strip('\'"')

            # Skip empty lines and lines starting with "#"
            if not url_link or url_link.startswith("#"):
                continue

            # Add http prefix if missing
            if not url_link.startswith("http"):
                url_link = "http://" + url_link

            # Mark that at least one valid URL was processed; without this,
            # the "No Valid URLs" message would print on every run.
            url_found = True
            fetch_html_content(url_link, email_file, i)

    if not url_found:
        print("No Valid URLs found in the urls.txt file")
    else:
        print(f"Elapsed Time: {time.time() - start_time:.2f} seconds")

if __name__ == "__main__":
    main()