From 1e81d43c7ca7eeef09340d59e90dd937613ce421 Mon Sep 17 00:00:00 2001 From: "Darren J. de Lima" Date: Sat, 21 Sep 2024 18:37:56 +0100 Subject: [PATCH 1/7] Add parser for mtgstory.com --- .gitignore | 2 +- plugin/js/parsers/MagicWizardsParser.js | 87 +++++++++++++++++++++++++ plugin/popup.html | 1 + 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 plugin/js/parsers/MagicWizardsParser.js diff --git a/.gitignore b/.gitignore index cdc0f1d8..ab992ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -3,5 +3,5 @@ eslint/*.zip eslint/packed.js eslint/index.csv node_modules -plugin/**/*.* +!plugin/**/*.* !plugin/jszip/dist/jszip.min.js diff --git a/plugin/js/parsers/MagicWizardsParser.js b/plugin/js/parsers/MagicWizardsParser.js new file mode 100644 index 00000000..72ee3ce3 --- /dev/null +++ b/plugin/js/parsers/MagicWizardsParser.js @@ -0,0 +1,87 @@ +"use strict"; + +// Register the parser for magic.wizards.com (archive.org is implicit) TODO: mtglore.com +parserFactory.register("magic.wizards.com", () => new MagicWizardsParser()); +//parserFactory.register("mtglore.com", () => new MagicWizardsParser()); + +class MagicWizardsParser extends Parser { + constructor() { + super(); + } + + // Extract the list of chapter URLs + async getChapterUrls(dom) { + let chapterLinks = []; + if (window.location.hostname.includes("web.archive.org")) { + // For archived versions, select the correct container within #content + chapterLinks = [...dom.querySelectorAll("#content article a, #content .article-content a")]; + } else { + // For live pages + chapterLinks = [...dom.querySelectorAll("article a, .article-content a, window.location.hostname")]; + } + + // Filter out author links using their URL pattern + chapterLinks = chapterLinks.filter(link => !this.isAuthorLink(link)); + + return chapterLinks.map(this.linkToChapter); + } + + // Helper function to detect if a link is an author link + isAuthorLink(link) { + const href = link.href; + const authorPattern = 
/\/archive\?author=/; + + // Check if the link matches the author URL pattern or CSS selector + if (authorPattern.test(href)) { + return true; + } else { + return false; + } + } + + // Format chapter links into a standardized structure + linkToChapter(link) { + let titleElement; + + // Try to find the

<h3> tag inside the parent of the link (assuming link is inside <article>
) + titleElement = link.closest("article").querySelector("h3"); + + // Fallback to the link text itself if no titleElement found (this handles simpler cases) + let title = titleElement ? titleElement.textContent.trim() : link.textContent.trim(); + + return { + sourceUrl: link.href, + title: title + }; + } + + // Extract the content of the chapter + findContent(dom) { + if (window.location.hostname.includes("web.archive.org")) { + // For archived pages, the content is often inside #content + return dom.querySelector("#content article"); + } else { + // For live pages + return dom.querySelector("#article-body article, #primary-area section, section article, section"); + } + } + +findCoverImageUrl(dom) { + // Try to find an image inside the '.swiper-slide' or inside an 'article' + let imgElement = dom.querySelector(".swiper-slide img, article img"); + + // If an image is found, return its 'src' attribute + if (imgElement) { + return imgElement.getAttribute("src"); + // Check if the URL starts with '//' (protocol-relative URL) + if (imgSrc && imgSrc.startsWith("//")) { + // Add 'https:' to the start of the URL + imgSrc = "https:" + imgSrc; + } + } + // Fallback if no image was found + return null; +} + + +} diff --git a/plugin/popup.html b/plugin/popup.html index 43b43943..b6de4f52 100644 --- a/plugin/popup.html +++ b/plugin/popup.html @@ -636,6 +636,7 @@

Instructions

+ From 3f5df32e90a91c27067c8aa1e71f8919804d00f8 Mon Sep 17 00:00:00 2001 From: "Darren J. de Lima" Date: Sun, 22 Sep 2024 17:10:15 +0100 Subject: [PATCH 2/7] MagicWizardsParser v0.7. Minor changes as requested --- .gitignore | 3 ++- plugin/js/parsers/MagicWizardsParser.js | 23 +++++++---------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/.gitignore b/.gitignore index ab992ea0..7ec7350c 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,5 @@ eslint/packed.js eslint/index.csv node_modules !plugin/**/*.* -!plugin/jszip/dist/jszip.min.js +plugin/jszip/dist/jszip.min.js +package-lock.json diff --git a/plugin/js/parsers/MagicWizardsParser.js b/plugin/js/parsers/MagicWizardsParser.js index 72ee3ce3..bbb97c4c 100644 --- a/plugin/js/parsers/MagicWizardsParser.js +++ b/plugin/js/parsers/MagicWizardsParser.js @@ -1,3 +1,6 @@ +/* + parser for mtgstory.com (redirect) +*/ "use strict"; // Register the parser for magic.wizards.com (archive.org is implicit) TODO: mtglore.com @@ -65,23 +68,11 @@ class MagicWizardsParser extends Parser { return dom.querySelector("#article-body article, #primary-area section, section article, section"); } } - -findCoverImageUrl(dom) { - // Try to find an image inside the '.swiper-slide' or inside an 'article' - let imgElement = dom.querySelector(".swiper-slide img, article img"); - - // If an image is found, return its 'src' attribute - if (imgElement) { - return imgElement.getAttribute("src"); - // Check if the URL starts with '//' (protocol-relative URL) - if (imgSrc && imgSrc.startsWith("//")) { - // Add 'https:' to the start of the URL - imgSrc = "https:" + imgSrc; - } + + // Grab cover image + findCoverImageUrl(dom) { + return util.getFirstImgSrc(dom, ".swiper-slide img, article img"); } - // Fallback if no image was found - return null; -} } From bfda6d9cbf1cc30e91509ccefc0149b6521324bb Mon Sep 17 00:00:00 2001 From: "Darren J. 
de Lima" Date: Sun, 22 Sep 2024 20:15:47 +0100 Subject: [PATCH 3/7] MagicWizardsParser.js v.71 Improves compatibility with 2016 version of site --- plugin/js/parsers/MagicWizardsParser.js | 31 +++++++++---------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/plugin/js/parsers/MagicWizardsParser.js b/plugin/js/parsers/MagicWizardsParser.js index bbb97c4c..4604d515 100644 --- a/plugin/js/parsers/MagicWizardsParser.js +++ b/plugin/js/parsers/MagicWizardsParser.js @@ -15,18 +15,10 @@ class MagicWizardsParser extends Parser { // Extract the list of chapter URLs async getChapterUrls(dom) { let chapterLinks = []; - if (window.location.hostname.includes("web.archive.org")) { - // For archived versions, select the correct container within #content - chapterLinks = [...dom.querySelectorAll("#content article a, #content .article-content a")]; - } else { - // For live pages - chapterLinks = [...dom.querySelectorAll("article a, .article-content a, window.location.hostname")]; - } - - // Filter out author links using their URL pattern - chapterLinks = chapterLinks.filter(link => !this.isAuthorLink(link)); - - return chapterLinks.map(this.linkToChapter); + chapterLinks = [...dom.querySelectorAll("article a, .article-content a, window.location.hostname, #content article a, #content .article-content a, .articles-listing .article-item a, .articles-bloc .article .details a")]; + // Filter out author links using their URL pattern + chapterLinks = chapterLinks.filter(link => !this.isAuthorLink(link)); + return chapterLinks.map(this.linkToChapter); } // Helper function to detect if a link is an author link @@ -47,7 +39,12 @@ class MagicWizardsParser extends Parser { let titleElement; // Try to find the

<h3> tag inside the parent of the link (assuming link is inside <article>
) - titleElement = link.closest("article").querySelector("h3"); + titleElement = link.closest("article")?.querySelector("h3"); + + // Fallback to the

.title element of the enclosing .article-item if no <h3>

is found + if (!titleElement) { + titleElement = link.closest(".article-item")?.querySelector(".title"); + } // Fallback to the link text itself if no titleElement found (this handles simpler cases) let title = titleElement ? titleElement.textContent.trim() : link.textContent.trim(); @@ -60,13 +57,7 @@ class MagicWizardsParser extends Parser { // Extract the content of the chapter findContent(dom) { - if (window.location.hostname.includes("web.archive.org")) { - // For archived pages, the content is often inside #content - return dom.querySelector("#content article"); - } else { - // For live pages - return dom.querySelector("#article-body article, #primary-area section, section article, section"); - } + return dom.querySelector("#content article, .article_detail #main-content article, #article-body article, #primary-area section, section article, section, .article_detail #main-content"); } // Grab cover image From fd8c87f07f9b8d1fc5838be2323e3fb56936b5b1 Mon Sep 17 00:00:00 2001 From: "Darren J. de Lima" Date: Sun, 22 Sep 2024 21:09:41 +0100 Subject: [PATCH 4/7] MagicWizardsParser.js v0.72 - Generalise title selection Also add TODO to JS. 2024 site and pre-2018 site work and are priority, as they cover all modern stories and older lost chapters. 
(Ancient MTG articles from pre-2014 not accounted for yet) --- plugin/js/parsers/MagicWizardsParser.js | 40 ++++++++++++++++++------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/plugin/js/parsers/MagicWizardsParser.js b/plugin/js/parsers/MagicWizardsParser.js index 4604d515..f061f97a 100644 --- a/plugin/js/parsers/MagicWizardsParser.js +++ b/plugin/js/parsers/MagicWizardsParser.js @@ -1,11 +1,22 @@ /* - parser for mtgstory.com (redirect) + MagicWizardsParser.js v0.72 + + Parser for Magic the Gathering fiction, found on: + - mtgstory.com (redirect) + - https://magic.wizards.com/en/story (2023-2024) + - https://magic.wizards.com/en/articles/columns/magic-story (2014-2018) + - Archive.org versions of the above + - TODO: mtglore.com (redirects & mirrors) + - TODO: https://magic.wizards.com/en/story (Q4 2018-2022) + - TODO: Planeswalkers & Planes Databank + - TODO: Featured story slider Q1 2018 + - UNTESTED: http://www.wizards.com/Magic/Magazine/Article.aspx (2014 and earlier) + - WONTFIX: hanweirchronicle.com (Tumblr blog, mostly image posts) */ "use strict"; -// Register the parser for magic.wizards.com (archive.org is implicit) TODO: mtglore.com +// Register the parser for magic.wizards.com (archive.org is implicit) parserFactory.register("magic.wizards.com", () => new MagicWizardsParser()); -//parserFactory.register("mtglore.com", () => new MagicWizardsParser()); class MagicWizardsParser extends Parser { constructor() { @@ -36,14 +47,23 @@ class MagicWizardsParser extends Parser { // Format chapter links into a standardized structure linkToChapter(link) { - let titleElement; + const titleSelectors = [ + "h3", // First option:

<h3> tag + ".article-item .title", // Second option: .title element inside .article-item

+ ".details .title" // Third option: .title element inside .details

+ ]; - // Try to find the

<h3> tag inside the parent of the link (assuming link is inside <article>
) - titleElement = link.closest("article")?.querySelector("h3"); - - // Fallback to the

.title element of the enclosing .article-item if no <h3>

is found - if (!titleElement) { - titleElement = link.closest(".article-item")?.querySelector(".title"); + let titleElement = null; + + // Iterate through the selectors and find the first matching element + for (const selector of titleSelectors) { + titleElement = link.closest("article")?.querySelector(selector) || + link.closest(".article-item")?.querySelector(selector) || + link.closest(".details")?.querySelector(selector); + + if (titleElement) { + break; // Exit the loop if a title element is found + } } // Fallback to the link text itself if no titleElement found (this handles simpler cases) From 6ee49706d08b5e593aa8a2c791e55405682b50dc Mon Sep 17 00:00:00 2001 From: gamebeaker Date: Wed, 25 Sep 2024 17:38:01 +0200 Subject: [PATCH 5/7] try to fix eslint MagicWizardParser --- plugin/js/parsers/MagicWizardsParser.js | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/plugin/js/parsers/MagicWizardsParser.js b/plugin/js/parsers/MagicWizardsParser.js index f061f97a..8f4c6d0e 100644 --- a/plugin/js/parsers/MagicWizardsParser.js +++ b/plugin/js/parsers/MagicWizardsParser.js @@ -26,10 +26,10 @@ class MagicWizardsParser extends Parser { // Extract the list of chapter URLs async getChapterUrls(dom) { let chapterLinks = []; - chapterLinks = [...dom.querySelectorAll("article a, .article-content a, window.location.hostname, #content article a, #content .article-content a, .articles-listing .article-item a, .articles-bloc .article .details a")]; - // Filter out author links using their URL pattern - chapterLinks = chapterLinks.filter(link => !this.isAuthorLink(link)); - return chapterLinks.map(this.linkToChapter); + chapterLinks = [...dom.querySelectorAll("article a, .article-content a, window.location.hostname, #content article a, #content .article-content a, .articles-listing .article-item a, .articles-bloc .article .details a")]; + // Filter out author links using their URL pattern + chapterLinks = chapterLinks.filter(link => 
!this.isAuthorLink(link)); + return chapterLinks.map(this.linkToChapter); } // Helper function to detect if a link is an author link From cf5dc82fb733906504570fea5ba7c98707b2c8c2 Mon Sep 17 00:00:00 2001 From: gamebeaker Date: Wed, 25 Sep 2024 18:02:02 +0200 Subject: [PATCH 6/7] Update MagicWizardsParser.js change depending on @dteviot in https://github.com/dteviot/WebToEpub/issues/1500#issuecomment-2366947822_ --- plugin/js/parsers/MagicWizardsParser.js | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/plugin/js/parsers/MagicWizardsParser.js b/plugin/js/parsers/MagicWizardsParser.js index 8f4c6d0e..1ad8d570 100644 --- a/plugin/js/parsers/MagicWizardsParser.js +++ b/plugin/js/parsers/MagicWizardsParser.js @@ -38,11 +38,7 @@ class MagicWizardsParser extends Parser { const authorPattern = /\/archive\?author=/; // Check if the link matches the author URL pattern or CSS selector - if (authorPattern.test(href)) { - return true; - } else { - return false; - } + return authorPattern.test(href); } // Format chapter links into a standardized structure From d79da20f94a79a0a13f00b3a53e63ad0e8bfb833 Mon Sep 17 00:00:00 2001 From: gamebeaker Date: Wed, 25 Sep 2024 18:08:31 +0200 Subject: [PATCH 7/7] Add contribution Darthagnon --- package.json | 3 ++- readme.md | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/package.json b/package.json index 9a816632..f5e95da4 100644 --- a/package.json +++ b/package.json @@ -69,7 +69,8 @@ { "name": "ImmortalDreamer"}, { "name": "ktrin"}, { "name": "Tyderion"}, - { "name": "nozwock" } + { "name": "nozwock"}, + { "name": "Darthagnon"} ], "license": "GPL-3.0-only", "bugs": { diff --git a/readme.md b/readme.md index 9b36f3ce..b2c07aa2 100644 --- a/readme.md +++ b/readme.md @@ -58,6 +58,7 @@ Credits * ktrin * nozwock * Tyderion +* Darthagnon ## How to use with Baka-Tsuki: * Browse to a Baka-Tsuki web page that has the full text of a story.