
Commit 893d96d

Merge pull request #1 from VirajMadhu/development
v1.0.0
2 parents: c8e21e8 + 9884896

File tree

5 files changed: 269 additions, 0 deletions

.gitattributes

Lines changed: 2 additions & 0 deletions
```
# Auto detect text files and perform LF normalization
* text=auto
```

.gitignore

Lines changed: 100 additions & 0 deletions
```
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
emails.txt
```

MailMiner.py

Lines changed: 109 additions & 0 deletions
```
#!/usr/bin/env python3
# encoding: UTF-8

"""
This file is part of MailMiner
Copyright (C) 2024 @VirajMadhu
https://github.com/VirajMadhu/MailMiner

MailMiner is a robust Python tool designed for efficiently extracting
email addresses from websites. You can input a
list of URLs, and MailMiner will dig through each site, uncovering unique
email addresses quickly. Perfect for marketers, researchers, and anyone in
need of targeted email collection!

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

For more see the file 'LICENSE' for copying permission.
"""

__author__ = "VirajMadhu"
__copyright__ = "Copyright (C) 2024 @VirajMadhu"
__credits__ = ["VirajMadhu"]
__license__ = "GPLv3"
__version__ = "1.0.0"
__maintainer__ = "VirajMadhu"

################################

import re
import urllib.request
import urllib.error  # needed explicitly for the HTTPError/URLError handlers below
import time

# Email regex pattern
emailRegex = re.compile(r'''
    [a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+
''', re.VERBOSE)

# Extract emails from page text and append them to the output file
def extract_emails_from_text(text, email_file):
    extracted_emails = set(emailRegex.findall(text))
    print(f"\tNumber of Emails Found: {len(extracted_emails)}")
    for email in extracted_emails:
        email_file.write(email + "\n")

# Read HTML page content and hand it to the email extractor
def fetch_html_content(url, email_file, index):
    start_time = time.time()
    headers = {'User-Agent': 'Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)
    try:
        with urllib.request.urlopen(request) as response:
            page_content = response.read().decode('utf-8', errors='ignore')
            print(f"{index}. {url}\tFetched in: {time.time() - start_time:.2f} seconds")
            extract_emails_from_text(page_content, email_file)
    except urllib.error.HTTPError as err:
        handle_http_error(url, err, email_file, index)
    except urllib.error.URLError as err:
        print(f"URLError for {url}: {err}")
    except Exception as e:
        print(f"An error occurred with {url}: {e}")

# Handle HTTP errors; on a 404, retry via the Google web cache
def handle_http_error(url, error, email_file, index):
    print(f"HTTPError for {url}: {error}")
    if error.code == 404:
        cached_url = f'http://webcache.googleusercontent.com/search?q=cache:{url}'
        print(f"Trying cached version for {url}")
        try:
            fetch_html_content(cached_url, email_file, index)
        except Exception as e:
            print(f"Failed to fetch cached version for {url}: {e}")

# Main function
def main():
    start_time = time.time()
    url_found = False

    with open("urls.txt", 'r') as url_file, open("emails.txt", 'a') as email_file:
        for i, url_link in enumerate(url_file, start=1):
            url_link = url_link.strip().strip('\'"')

            # Skip empty lines and lines starting with "#"
            if not url_link or url_link.startswith("#"):
                continue

            # Add http prefix if missing
            if not url_link.startswith("http"):
                url_link = "http://" + url_link

            # Record that at least one valid URL was processed
            # (without this flag, the "No Valid URLs" branch always fired)
            url_found = True
            fetch_html_content(url_link, email_file, i)

    if not url_found:
        print("No Valid URLs found in the urls.txt file")
    else:
        print(f"Elapsed Time: {time.time() - start_time:.2f} seconds")

if __name__ == "__main__":
    main()
```
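For a quick sense of the core extraction step, here is a minimal, self-contained sketch using the same email pattern that MailMiner.py compiles; the sample text and addresses are invented for illustration:

```
import re

# The same email pattern MailMiner.py compiles, written on one line here
emailRegex = re.compile(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+')

# Hypothetical page text, for demonstration only
sample_html = "Contact support@example.com or sales@example.com for details."

# findall + set mirrors how the tool de-duplicates matches
unique_emails = set(emailRegex.findall(sample_html))
print(sorted(unique_emails))
# ['sales@example.com', 'support@example.com']
```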

README.md

Lines changed: 57 additions & 0 deletions
# 📧 MailMiner

🚀 This **MailMiner** tool lets you quickly scan **multiple websites** to collect **unique** email addresses with ease.

---

## 📋 Project Overview

Use this script to effortlessly retrieve email addresses from bulk website lists. Perfect for marketers, researchers, or anyone looking to save time on email collection!

## 📂 Getting Started

### 🎯 Prerequisites

Ensure you have the following installed before getting started:

- [**Python 3.x**](https://www.python.org/downloads/) - The programming language
- **urllib.request** — Python library for handling URLs (part of the Python 3 standard library, so nothing extra to install)

### ⚙️ Setup Instructions

1. **Clone or download the repository** 📥
2. **Prepare a `urls.txt` file**:
   - Place your target URLs in this file (one URL per line).
3. **Run the script** 🏃‍♀️:
   ```
   python3 MailMiner.py
   ```
   - Let it scrape email addresses from all URLs in `urls.txt`.
4. **Enjoy!** 🎉 All harvested email addresses will be saved in `emails.txt`. A sample run is shown below.
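A run prints per-URL progress in the format used by the script's status messages; the URL, timing, and count below are illustrative, not real output:

```
1. http://github.com	Fetched in: 0.42 seconds
	Number of Emails Found: 3
Elapsed Time: 0.45 seconds
```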
### 🔍 Correct URL or Domain Formats

Make sure your URLs follow these formats for best results (a sample `urls.txt` follows this list):

- 🌐 https://github.com
- 🌐 https://github.com/anyuser
- 🌐 github.com
- 🌐 https://support.github.com
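For example, a `urls.txt` built from the formats above might look like this (the `#` line is a comment and is skipped; bare domains are automatically prefixed with `http://`):

```
# Target sites to scan
https://github.com
https://github.com/anyuser
github.com
```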
---
## 🔧 Built With

- **Python 3.x** - [Learn More](https://www.python.org/)

## 🤝 Contributing

We welcome contributions! If you want to improve or expand this tool, feel free to submit a pull request. Let's make this tool better together! 🌟

## ✍️ Authors

- **Viraj Madhushan** - *Initial work* - [Viraj on GitHub](https://github.com/VirajMadhu)

---

Give this a go and save time on email collection! ✨

urls.txt

Lines changed: 1 addition & 0 deletions
```
# ADD URLS HERE - ONE URL PER LINE - ALSO DO NOT REMOVE THE # ON THIS LINE
```
