-
Notifications
You must be signed in to change notification settings - Fork 142
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
eabd126
commit b73eaba
Showing
2 changed files
with
207 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
/* | ||
Template to use to create a new parser | ||
*/ | ||
"use strict"; | ||
|
||
// Use one or more of these to specify when the parser is to be used | ||
|
||
// Use this function if site's host name is sufficient. | ||
// i.e. All pages are on same site, and use same format. | ||
parserFactory.register("xyparser.org", () => new xyParser()); | ||
|
||
|
||
|
||
class xyParser extends Parser{ | ||
constructor() { | ||
super(); | ||
//Optional Parameters: | ||
|
||
/* | ||
// Minimum delay (in ms) between page requests. Useful for 403 error prevention. | ||
// If the sites this parser accesses throttles requests or uses cloudflare, it is recommended to set this. | ||
this.minimumThrottle = 3000; | ||
*/ | ||
} | ||
|
||
//overwrite Max web pages to fetch simultaneously mostly used on sites that block multiple requests | ||
/* | ||
clampSimultanousFetchSize() { | ||
return 1; | ||
} | ||
*/ | ||
|
||
// returns promise with the URLs of the chapters to fetch | ||
// promise is used because may need to fetch the list of URLs from internet | ||
/* | ||
async getChapterUrls(dom, chapterUrlsUI) { | ||
// Most common implementation is to find element holding the hyperlinks to | ||
// the web pages holding the chapters. Then call util.hyperlinksToChapterList() | ||
// to convert the links into a list of URLs the parser will collect. | ||
let menu = dom.querySelector("div.su-tabs-panes"); | ||
return util.hyperlinksToChapterList(menu); | ||
// Almost as common, find links on page and convert. | ||
return [...dom.querySelectorAll("li.wp-manga-chapter.free-chap a")] | ||
.map(a => util.hyperLinkToChapter(a)); | ||
// Need to walk multiple ToC pages, page by page | ||
return (await this.walkTocPages(dom, | ||
TemplateParser.chaptersFromDom, | ||
TemplateParser.nextTocPageUrl, | ||
chapterUrlsUI | ||
)); | ||
// Can get list of all ToC pages | ||
let tocPage1chapters = TemplateParser.extractPartialChapterList(dom); | ||
let urlsOfTocPages = TemplateParser.getUrlsOfTocPages(dom); | ||
return (await this.getChaptersFromAllTocPages(tocPage1chapters, | ||
TemplateParser.extractPartialChapterList, | ||
urlsOfTocPages, | ||
chapterUrlsUI | ||
)); | ||
} | ||
*/ | ||
|
||
// returns the element holding the story content in a chapter | ||
/* | ||
findContent(dom) { | ||
// typical implementation is find node with all wanted content | ||
// return is the element holding just the wanted content. | ||
return dom.querySelector("article"); | ||
} | ||
*/ | ||
|
||
// title of the story (not to be confused with title of each chapter) | ||
/* | ||
extractTitleImpl(dom) { | ||
// typical implementation is find node with the Title and return name from title | ||
// NOTE. Can return Title as a string, or an HTML element | ||
return dom.querySelector("h1"); | ||
} | ||
*/ | ||
|
||
// author of the story | ||
// Optional, if not provided, will default to "<unknown>" | ||
/* | ||
extractAuthor(dom) { | ||
// typical implementation is find node with the author's name and return name from title | ||
// Major points to note | ||
// 1. Return the Author's name as a string, not a HTML element | ||
// 2. If can't find Author, call the base implementation | ||
let authorLabel = dom.querySelector(".meta span a"); | ||
return authorLabel?.textContent ?? super.extractAuthor(dom); | ||
} | ||
*/ | ||
|
||
// language used | ||
// Optional, if not provided, will default to ISO code for English "en" | ||
/* | ||
extractLanguage(dom) { | ||
return dom.querySelector("html").getAttribute("lang"); | ||
} | ||
*/ | ||
|
||
// Genre of the story | ||
// Optional, Genre for metadata, if not provided, will default to "" | ||
/* | ||
extractSubject(dom) { | ||
let tags = [...dom.querySelectorAll("[property='genre']")]; | ||
return tags.map(e => e.textContent.trim()).join(", "); | ||
} | ||
*/ | ||
|
||
// Description of the story | ||
// Optional, Description for metadata, if not provided, will default to "" | ||
/* | ||
extractDescription(dom) { | ||
return dom.querySelector("div [property='description']").textContent.trim(); | ||
} | ||
*/ | ||
|
||
// Optional, supply if need to do special manipulation of content | ||
// e.g. decrypt content | ||
/* | ||
customRawDomToContentStep(chapter, content) { | ||
// for example of this, refer to LnmtlParser | ||
} | ||
*/ | ||
|
||
// Optional, supply if need to do custom cleanup of content | ||
/* | ||
removeUnwantedElementsFromContentElement(element) { | ||
util.removeChildElementsMatchingCss(element, "button"); | ||
super.removeUnwantedElementsFromContentElement(element); | ||
} | ||
*/ | ||
|
||
// Optional, supply if individual chapter titles are not inside the content element | ||
/* | ||
findChapterTitle(dom) { | ||
// typical implementation is find node with the Title | ||
// Return Title element, OR the title as a string | ||
return dom.querySelector("h3.dashhead-title"); | ||
} | ||
*/ | ||
|
||
// Optional, if "next/previous chapter" are nested inside other elements, | ||
// this says how to find the highest parent element to remove | ||
/* | ||
findParentNodeOfChapterLinkToRemoveAt(link) { | ||
// The links may be wrapped, so need to walk up tree to find the | ||
// highest element holding the chapter links. | ||
// e.g. Following code assumes links are sometimes enclosed in a <strong> tag | ||
// that is enclosed in a <p> tag. We want to remove the <p> tag | ||
// and everything inside it | ||
let toRemove = util.moveIfParent(link, "strong"); | ||
return util.moveIfParent(toRemove, "p"); | ||
} | ||
*/ | ||
|
||
// Optional, supply if cover image can usually be found on inital web page | ||
// Notes. | ||
// 1. If cover image is first image in content section, do not implement this function | ||
/* | ||
findCoverImageUrl(dom) { | ||
// Most common implementation is get first image in specified container. e.g. | ||
return util.getFirstImgSrc(dom, "div.td-ss-main-sidebar"); | ||
} | ||
*/ | ||
|
||
// Optional, supply if need to chase hyperlinks in page to get all chapter content | ||
/* | ||
async fetchChapter(url) { | ||
return (await HttpClient.wrapFetch(url)).responseXML; | ||
} | ||
*/ | ||
|
||
// Optional, supply if need to modify DOM before normal processing steps | ||
/* | ||
preprocessRawDom(webPageDom) { | ||
} | ||
*/ | ||
|
||
// Optional, called when user presses the "Pack EPUB" button. | ||
// Implement if parser needs to do anything after user sets UI settings | ||
// but before collecting pages | ||
/* | ||
onStartCollecting() { | ||
} | ||
*/ | ||
|
||
// Optional, Return elements from page | ||
// that are to be shown on epub's "information" page | ||
/* | ||
getInformationEpubItemChildNodes(dom) { | ||
return [...dom.querySelectorAll("div.novel-details")]; | ||
} | ||
*/ | ||
|
||
// Optional, Any cleanup operations to perform on the nodes | ||
// returned by getInformationEpubItemChildNodes | ||
/* | ||
cleanInformationNode(node) { | ||
return node; | ||
} | ||
*/ | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters