-
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathWattpad2epub.py
179 lines (150 loc) · 7.01 KB
/
Wattpad2epub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#! /usr/bin/python3
"""
# Python Script to Scrape Wattpad Story and convert to Epub and html file.
# By Architrixs, Created Nov 5, 2020.
# This program will create:
# 1. A html file of the entire Wattpad Book AND (You can directly Use this one to read, Images are preserved in this format.)
# 2. A Epub file of the entire Wattpad Book.(The Epub will have separate marked Chapters instead of all chapters as one whole.)
HOW TO USE :
1. Just copy the URL of any Chapter of the Wattpad Book. I repeat copy URL of any "Chapter"... you got it!
2. Either Directly Run
>>Wattpad2epub.py
or >>Wattpad2epub.py Story_url
Yes it can take 1 Commandline Argument as the copied url of the Story.
3. You got html and epub saved in the same location.
"""
import argparse
import bs4
import requests
import pyperclip
import re
import pypandoc
import os
base_apiV2_url = "https://www.wattpad.com/apiv2/"
base_apiV3_url = "https://www.wattpad.com/api/v3/"
dev_error_msg = "Please check the url again, for valid story id. Contact the developer if you think this is a bug."
"""
https://www.wattpad.com/api/v3/stories/{{story_id}}?drafts=0&mature=1&include_deleted=1&fields=id,title,createDate,modifyDate,voteCount,readCount,commentCount,description,url,firstPublishedPart,cover,language,isAdExempt,user(name,username,avatar,location,highlight_colour,backgroundUrl,numLists,numStoriesPublished,numFollowing,numFollowers,twitter),completed,isPaywalled,paidModel,numParts,lastPublishedPart,parts(id,title,length,url,deleted,draft,createDate),tags,categories,rating,rankings,tagRankings,language,storyLanguage,copyright,sourceLink,firstPartId,deleted,draft,hasBannedCover,length
"""
def get_chapter_id(url):
"""Extracts the chapter ID from the given URL."""
search_id = re.compile(r'\d{5,}')
id_match = search_id.search(url)
if id_match:
return id_match.group()
return None
def download_webpage(url):
"""Downloads the webpage content from the given URL."""
try:
res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
res.raise_for_status()
return res.text
except requests.exceptions.RequestException as exc:
print("There was a problem: %s" % (exc))
return None
def extract_useful_data(json_data):
"""Extracts useful data from the JSON response."""
summary = json_data.get('description', '')
tags = json_data.get('tags', '')
chapters = json_data.get('parts', '')
storyName = json_data.get('title', '')
author = json_data.get('user', '')
cover = json_data.get('cover', '')
return summary, tags, chapters, storyName, author, cover
def save_html_file(file_name, story_name, author, cover, tags, summary, chapters):
"""Saves the HTML file with the given data."""
file = open(file_name, 'w', encoding='utf-8')
file.write(f"""
<html>
<head>
<meta name='title' content='{story_name}'>
<meta name='author' content='{author["name"]}' >
</head>
<body>
<div style="text-align:center;">
<img src="{cover}" alt="cover_image">
</div>
<br>
<h5 align="center">{story_name}</h5>
<h6 align="center">By {author["name"]} : <a href="https://www.wattpad.com/user/{author["username"]}">{author["username"]}</a></h6>
<div align="center">Tags: {tags} </div>
<br><br>
<div align="center">{summary}</div>
<br><br>
<div align="left">
<h6>
* If chapter number or names are jumbled up, it's definitely the author's fault.
(Author-san, please number them correctly and in order.)<br>
* Converted using Wattpad2epub by Architrixs<br>
</h6>
</div>
""")
for i, chapter in enumerate(chapters):
print(f"Getting Chapter {i + 1}....")
chapter_url = base_apiV2_url + f"storytext?id={chapter['id']}"
chapter_content = download_webpage(chapter_url)
if chapter_content:
soup_res = bs4.BeautifulSoup(chapter_content, 'html.parser')
file.write(f"""
<br><br>
<h2>Chapter {i + 1}: '{chapter['title']}'</h2><br><br>
{soup_res.prettify()}
""")
file.write("</body></html>")
file.close()
print(f"Saved {file_name}")
def save_epub_file(html_file, story_name, cover):
"""Converts the HTML file to EPUB format and saves it."""
print("Generating EPUB...")
story_name = story_name.replace('/', ' ')
cover_image = f"{story_name}.jpg"
res_img = requests.get(cover, headers={'User-Agent': 'Mozilla/5.0'})
open(cover_image, 'wb').write(res_img.content)
output_file = f"{story_name}.epub"
pypandoc.convert_file(html_file, 'epub3', outputfile=output_file, extra_args=['--epub-chapter-level=2', f'--epub-cover-image={cover_image}'], sandbox=False)
os.remove(cover_image)
print(f"Saved {output_file}")
def main(url):
story_id = get_chapter_id(url)
if not story_id:
print(dev_error_msg)
return
# Getting JSON data from Wattpad API.
story_info_url = base_apiV3_url + f"stories/{story_id}?drafts=0&mature=1&include_deleted=1&fields=id,title,createDate,modifyDate,description,url,firstPublishedPart,cover,language,user(name,username,avatar,location,numStoriesPublished,numFollowing,numFollowers,twitter),completed,numParts,lastPublishedPart,parts(id,title,length,url,deleted,draft,createDate),tags,storyLanguage,copyright"
json_data = requests.get(story_info_url, headers={'User-Agent': 'Mozilla/5.0'}).json()
try:
if json_data.get('result') == 'ERROR':
error_message = json_data.get('message', 'Unknown error')
print(f"Error: {error_message}")
print(dev_error_msg)
return
if json_data.get('error_type') :
error_message = json_data.get('message', 'Unknown error')
print(f"Error: {error_message}")
print(dev_error_msg)
return
if json_data.get('result') == 'ERROR':
error_message = json_data.get('message', 'Unknown error')
print(f"API Error: {error_message}")
return
except Exception as exc:
print(f"Error retrieving JSON data from the API: {exc}")
return
# Extracting useful data from JSON.
summary, tags, chapters, story_name, author, cover = extract_useful_data(json_data)
# Saving HTML file.
html_file_name = f"{story_name}.html"
html_file_name = html_file_name.replace('/', ' ')
save_html_file(html_file_name, story_name, author, cover, tags, summary, chapters)
# Converting HTML to EPUB.
save_epub_file(html_file_name, story_name, cover)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Wattpad2epub: Convert Wattpad stories to EPUB format.')
parser.add_argument('url', nargs='?', help='URL of the Wattpad Story')
args = parser.parse_args()
if args.url:
main(args.url)
else:
# Getting address from clipboard.
url = pyperclip.paste()
main(url)