#!/usr/bin/env python3
# encoding: UTF-8

"""
    This file is part of MailMiner
    Copyright (C) 2024 @VirajMadhu
    https://github.com/VirajMadhu/MailMiner

    MailMiner is a robust Python tool designed for efficiently extracting
    email addresses from websites. You can input a list of URLs, and
    MailMiner will dig through each site, uncovering unique email
    addresses quickly. Perfect for marketers, researchers, and anyone in
    need of targeted email collection!

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.

    For more details, see the 'LICENSE' file for copying permission.
"""
__author__ = "VirajMadhu"
__copyright__ = "Copyright (C) 2024 @VirajMadhu"
__credits__ = ["VirajMadhu"]
__license__ = "GPLv3"
__version__ = "1.0.0"
__maintainer__ = "VirajMadhu"

################################

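# Usage sketch (assumes this file is saved as mailminer.py; the urls.txt /
# emails.txt filenames are the ones read and written in main() below):
#   $ printf 'example.com\n' > urls.txt
#   $ python3 mailminer.py
# Extracted addresses are appended to emails.txt, one per line.
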
import re
import time
# urllib.error is imported explicitly so the HTTPError/URLError handlers
# below do not depend on urllib.request pulling it in as a side effect.
import urllib.error
import urllib.request

# Email regex pattern
emailRegex = re.compile(r'''
    [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
''', re.VERBOSE)

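# Quick sanity check (illustrative, not part of the original script):
#   emailRegex.findall("contact: a.b+c@example.co.uk or x@sub-1.example.org")
# returns ['a.b+c@example.co.uk', 'x@sub-1.example.org']; the trailing class
# [a-zA-Z0-9-.]+ accepts dots, which is what lets multi-level TLDs match.
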
# Extract emails from page text
def extract_emails_from_text(text, email_file):
    extracted_emails = set(emailRegex.findall(text))
    print(f"\tNumber of Emails Found: {len(extracted_emails)}")
    for email in extracted_emails:
        email_file.write(email + "\n")

# Read HTML page content
def fetch_html_content(url, email_file, index):
    start_time = time.time()
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request) as response:
            page_content = response.read().decode('utf-8', errors='ignore')
            print(f"{index}. {url}\tFetched in: {time.time() - start_time:.2f} seconds")
            extract_emails_from_text(page_content, email_file)
    except urllib.error.HTTPError as err:
        handle_http_error(url, err, email_file, index)
    except urllib.error.URLError as err:
        print(f"URLError for {url}: {err}")
    except Exception as e:
        print(f"An error occurred with {url}: {e}")

# Handle HTTP errors; on a 404, fall back to Google's cached copy once
def handle_http_error(url, error, email_file, index):
    print(f"HTTPError for {url}: {error}")
    # Only retry plain URLs: without this guard, a 404 on the cached URL
    # would recurse into building a cache-of-a-cache URL indefinitely.
    if error.code == 404 and 'webcache.googleusercontent.com' not in url:
        cached_url = f'http://webcache.googleusercontent.com/search?q=cache:{url}'
        print(f"Trying cached version for {url}")
        try:
            fetch_html_content(cached_url, email_file, index)
        except Exception as e:
            print(f"Failed to fetch cached version for {url}: {e}")

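# A standalone invocation, for illustration (filenames here are hypothetical;
# the script itself drives this from main() below):
#   with open("emails.txt", "a") as out:
#       fetch_html_content("http://example.com", out, 1)
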
# Main function
def main():
    start_time = time.time()
    url_found = False

    with open("urls.txt", 'r') as url_file, open("emails.txt", 'a') as email_file:
        for i, url_link in enumerate(url_file, start=1):
            url_link = url_link.strip().strip('\'"')

            # Skip empty lines and lines starting with "#"
            if not url_link or url_link.startswith("#"):
                continue

            # Add http prefix if missing
            if not url_link.startswith("http"):
                url_link = "http://" + url_link

            # Mark that at least one valid URL was processed; without this,
            # the "No Valid URLs" message would print on every run.
            url_found = True
            fetch_html_content(url_link, email_file, i)

    if not url_found:
        print("No Valid URLs found in the urls.txt file")
    else:
        print(f"Elapsed Time: {time.time() - start_time:.2f} seconds")

if __name__ == "__main__":
    main()