Refactor source loading and update checking #999

dipu-bd committed Aug 9, 2021
1 parent bb1f8d7 commit f107fe5
Showing 20 changed files with 468 additions and 471 deletions.
9 changes: 9 additions & 0 deletions .editorconfig
@@ -0,0 +1,9 @@
+# EditorConfig is awesome: http://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
1 change: 1 addition & 0 deletions .gitattributes
@@ -0,0 +1 @@
+* text=auto eol=lf
122 changes: 60 additions & 62 deletions README.md

Large diffs are not rendered by default.

9 changes: 1 addition & 8 deletions lncrawl/bots/console/start.py
@@ -1,13 +1,12 @@
 # -*- coding: utf-8 -*-
-import os
 from urllib.parse import urlparse
 
 from questionary import prompt
 
 from ...core import display
 from ...core.app import App
 from ...core.arguments import get_args
-from ...sources import add_all_crawlers, rejected_sources
+from ...core.sources import rejected_sources
 from .open_folder_prompt import display_open_folder
 from .resume_download import resume_session
@@ -18,12 +17,6 @@ def start(self):
         raise Exception('Unknown self: ' + type(self))
 
     args = get_args()
-    for crawler_file in args.crawler:
-        if os.path.isfile(crawler_file):
-            add_all_crawlers(crawler_file)
-        # end if
-    # end if
-
     if args.list_sources:
         display.url_supported_list()
         return
3 changes: 1 addition & 2 deletions lncrawl/bots/telegram.py
@@ -10,7 +10,6 @@
                           MessageHandler, Updater)
 
 from ..core.app import App
-from ..sources import crawler_list
 from ..utils.uploader import upload
 
 logger = logging.getLogger(__name__)
@@ -28,7 +27,7 @@
 class TelegramBot:
     def start(self):
         os.environ['debug_mode'] = 'yes'
-
+        # Create the EventHandler and pass it your bot's token.
         self.updater = Updater(
             os.getenv('TELEGRAM_TOKEN', ''),
13 changes: 4 additions & 9 deletions lncrawl/bots/test/__init__.py
@@ -2,27 +2,22 @@
 """
 The purpose of this bot is to test the application and crawlers
 """
-import io
 import sys
 import traceback
 from random import random
 
+from cloudscraper.exceptions import CaptchaException, CloudflareException
 from requests import RequestException
 from urllib3.exceptions import HTTPError
-from cloudscraper.exceptions import CloudflareException, CaptchaException
 
 from ...assets.icons import isWindows
-from ...sources import crawler_list
+from ...core.sources import crawler_list
 
 
 class TestBot:
     allerrors = dict()
 
-    from .test_inputs import test_user_inputs
-    from .test_inputs import allowed_failures
-
-    from .test_crawler import test_crawler
     from .post_github import post_on_github
+    from .test_crawler import test_crawler
+    from .test_inputs import allowed_failures, test_user_inputs
 
     def start(self):
         try:
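A note on the class body above: TestBot assembles its methods by importing plain functions (post_on_github, test_crawler, and the test_inputs names) directly inside the class statement. A minimal sketch of that pattern, split across two hypothetical files (neither is part of lncrawl):

# helpers.py (hypothetical)
def greet(self):
    return 'Hello from ' + type(self).__name__

# bot.py (hypothetical)
class Bot:
    # A function imported in a class body becomes an ordinary class
    # attribute; since functions are descriptors, Bot().greet binds
    # self on lookup exactly like a def written in the body.
    from helpers import greet

print(Bot().greet())  # -> 'Hello from Bot'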
5 changes: 3 additions & 2 deletions lncrawl/core/__init__.py
@@ -5,15 +5,16 @@
 import logging
 import os
 import sys
 
 import colorama
 from colorama import Fore
 
 from ..assets.version import get_value as get_version
 from ..bots import run_bot
-from ..utils.update_checker import check_updates
 from .arguments import get_args
 from .display import (cancel_method, debug_mode, description, epilog,
                       error_message, input_suppression)
+from .sources import load_sources
 
 logger = logging.getLogger(__name__)

@@ -65,7 +66,7 @@ def init():
 def start_app():
     init()
 
-    check_updates()
+    load_sources()
     cancel_method()
 
     try:
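Per the commit title, the startup call swaps a standalone check_updates() for load_sources(), which now owns both source loading and the update check. The new lncrawl/core/sources.py itself is among the files not rendered on this page, so the following is only a rough sketch of the shape such a loader could take; every name below is an assumption, not taken from the commit:

from typing import Dict, Iterable, List

crawler_list: Dict[str, type] = {}     # base url -> crawler class
rejected_sources: Dict[str, str] = {}  # base url -> rejection reason

class ExampleCrawler:
    # Stand-in for a real crawler module's class.
    base_url: List[str] = ['https://example.com/']

def discover_crawler_classes() -> Iterable[type]:
    # A real loader would walk the sources package and import each
    # crawler module; here we just yield the stand-in.
    return [ExampleCrawler]

def load_sources() -> None:
    # 1. refresh the source index (absorbing the old check_updates)
    # 2. register every crawler class under each base URL it serves
    for klass in discover_crawler_classes():
        for url in klass.base_url:
            crawler_list[url] = klass

load_sources()
assert 'https://example.com/' in crawler_list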
37 changes: 19 additions & 18 deletions lncrawl/core/app.py
@@ -5,13 +5,13 @@
 from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urlparse
 
-from lncrawl.core.crawler import Crawler
 from slugify import slugify
 
 from .. import constants as C
 from ..binders import available_formats, generate_books
-from ..sources import crawler_list
-from .downloader import download_chapters, download_chapter_images
+from lncrawl.core.crawler import Crawler
+from ..core.sources import crawler_list, rejected_sources
+from .downloader import download_chapter_images, download_chapters
 from .novel_info import format_novel, save_metadata
 from .novel_search import search_novels
@@ -93,24 +93,25 @@ def search_novel(self):
     # ----------------------------------------------------------------------- #
 
     def init_crawler(self, novel_url):
-        '''Requires: [user_input]'''
-        '''Produces: crawler'''
         if not novel_url:
             return
         # end if
-        hostname = urlparse(novel_url).hostname
-        for home_url, create_crawler in crawler_list.items():
-            if hostname == urlparse(home_url).hostname:
-                logger.info('Initializing crawler for: %s', home_url)
-                self.crawler: Crawler = create_crawler()
-                self.crawler.novel_url = novel_url
-                self.crawler.home_url = home_url.strip('/')
-                break
-            # end if
-        # end for
-        if not self.crawler:
-            raise Exception('No crawlers were found')
 
+        parsed_url = urlparse(novel_url)
+        base_url = '%s://%s/' % (parsed_url.scheme, parsed_url.hostname)
+        if base_url in rejected_sources:
+            raise Exception('Source is rejected')
+        # end if
+
+        CrawlerType = crawler_list.get(base_url)
+        if not CrawlerType:
+            raise Exception('No crawler found')
+        # end if
+
+        logger.info('Initializing crawler for: %s', base_url)
+        self.crawler = CrawlerType()
+        self.crawler.home_url = base_url
+        self.crawler.novel_url = novel_url
     # end def
 
     def can_do(self, prop_name):
@@ -125,7 +126,7 @@ def get_novel_info(self):

         self.crawler.initialize()
         self.crawler.scraper.headers['origin'] = self.crawler.home_url
-        self.crawler.scraper.headers['referer'] = self.crawler.home_url + '/'
+        self.crawler.scraper.headers['referer'] = self.crawler.home_url
 
         if self.can_do('login') and self.login_data:
             logger.debug('Login with %s', self.login_data)
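The init_crawler rewrite above replaces an O(n) hostname scan over crawler_list with a single dictionary lookup keyed on 'scheme://hostname/'. A standalone sketch of that lookup; the registry contents are placeholders, and only the derivation of base_url mirrors the diff:

from urllib.parse import urlparse

# Placeholder registry; in lncrawl it is filled by the source loader.
crawler_list = {'https://example.com/': object}
rejected_sources = {'https://blocked.example/': 'takedown notice'}

def resolve_crawler(novel_url: str) -> type:
    parsed_url = urlparse(novel_url)
    # Reduce any novel URL to its source's base URL, e.g.
    # 'https://example.com/novel/123' -> 'https://example.com/'
    base_url = '%s://%s/' % (parsed_url.scheme, parsed_url.hostname)
    if base_url in rejected_sources:
        raise Exception('Source is rejected')
    CrawlerType = crawler_list.get(base_url)
    if not CrawlerType:
        raise Exception('No crawler found')
    return CrawlerType

print(resolve_crawler('https://example.com/novel/123'))  # <class 'object'>

Since home_url is now the registry key and always ends with '/', the referer header in get_novel_info no longer needs the appended slash, which is the change in the hunk just above.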
5 changes: 3 additions & 2 deletions lncrawl/core/crawler.py
@@ -7,6 +7,7 @@
 import random
 import re
 import sys
+from typing import Dict, List
 import unicodedata
 from abc import abstractmethod
 from concurrent.futures import ThreadPoolExecutor
@@ -29,7 +30,7 @@
 NONPRINTABLE_MAPPING = {character: None for character in NONPRINTABLE}
 
 
-class Crawler:
+class Crawler(object):
     '''Blueprint for creating new crawlers'''
 
     def __init__(self) -> None:
@@ -97,7 +98,7 @@ def logout(self) -> None:
     # end def
 
     @abstractmethod
-    def search_novel(self, query):
+    def search_novel(self, query) -> List[Dict[str, str]]:
         '''Gets a list of results matching the given query'''
         pass
     # end def
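The new annotation pins down the search contract: a list of string-to-string dicts. A sketch of a subclass honoring it, assuming the lncrawl package is importable; the base_url value and result keys here are illustrative assumptions, not fixed by the diff:

from typing import Dict, List

from lncrawl.core.crawler import Crawler

class ExampleCrawler(Crawler):
    base_url = 'https://example.com/'  # assumed attribute value

    def search_novel(self, query) -> List[Dict[str, str]]:
        # One dict per hit; the 'title' and 'url' keys are illustrative.
        slug = query.strip().lower().replace(' ', '-')
        return [{
            'title': query.title(),
            'url': self.base_url + 'novel/' + slug,
        }]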
2 changes: 1 addition & 1 deletion lncrawl/core/display.py
@@ -5,7 +5,6 @@
 from colorama import Back, Fore, Style
 
 from ..assets.icons import Icons
-from ..sources import crawler_list
 
 LINE_SIZE = 80
 ENABLE_BANNER = not Icons.isWindows
@@ -119,6 +118,7 @@ def new_version_news(latest):


 def url_supported_list():
+    from .sources import crawler_list
     print('List of %d supported sources:' % len(crawler_list))
     for url in sorted(crawler_list.keys()):
         print(Fore.LIGHTGREEN_EX, Icons.RIGHT_ARROW, url, Fore.RESET)
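Moving the crawler_list import inside url_supported_list defers it to call time, presumably so display can be imported before the sources module (and the registry it fills) is ready, and to avoid a circular import. The pattern in isolation, with a hypothetical module name:

# display-side module; registry.py (hypothetical) imports this file
# at its own top level, so importing it back here must wait.
def url_supported_list():
    from registry import crawler_list  # resolved only when called
    print('List of %d supported sources:' % len(crawler_list))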
10 changes: 5 additions & 5 deletions lncrawl/core/novel_search.py
@@ -6,10 +6,10 @@
 import os
 from concurrent import futures
 
-from tqdm import tqdm
 from slugify import slugify
+from tqdm import tqdm
 
-from ..sources import crawler_list
+from ..core.sources import crawler_list
 
 logger = logging.getLogger(__name__)

@@ -18,9 +18,9 @@

 def get_search_result(app, link, bar):
     try:
-        crawler = crawler_list[link]
-        instance = crawler()
-        instance.home_url = link.strip('/')
+        CrawlerType = crawler_list[link]
+        instance = CrawlerType()
+        instance.home_url = link
         results = instance.search_novel(app.user_input)
         logger.debug(results)
         logger.info('%d results from %s', len(results), link)
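get_search_result now pulls the crawler class straight from the registry and hands the instance the registry key, which already ends in '/', as home_url. The surrounding module fans these calls out through concurrent.futures; a condensed, self-contained sketch with a stub crawler standing in for the real registry and a simplified signature:

from concurrent.futures import ThreadPoolExecutor

class StubCrawler:
    home_url = ''
    def search_novel(self, query):
        return [{'title': query, 'url': self.home_url + 'search?q=' + query}]

crawler_list = {'https://example.com/': StubCrawler}  # placeholder

def get_search_result(user_input, link):
    CrawlerType = crawler_list[link]
    instance = CrawlerType()
    instance.home_url = link
    return instance.search_novel(user_input)

# Fan one query out across all registered sources in parallel.
with ThreadPoolExecutor(max_workers=4) as executor:
    tasks = [executor.submit(get_search_result, 'sword god', link)
             for link in crawler_list]
    for task in tasks:
        print(task.result())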