From f8b85b0c75c94e8d2889cec7bbed173e841e59c7 Mon Sep 17 00:00:00 2001
From: Giovanni Fiordeponti <38134891+giovf96@users.noreply.github.com>
Date: Sun, 16 Oct 2022 16:08:02 +0200
Subject: [PATCH] Feature: Add support to read/save web articles(audio) (#20)

* Add article_web_scraper.py

* Fix test

* Fix file name

* Remove html_text_formattings from main

* Remove unused imports

* Refactor article_web_scraper

* Add support for pagination in read/store article

* Fix logging

Co-authored-by: Deepak Raj <54245038+codePerfectPlus@users.noreply.github.com>
---
 audiobook/__init__.py            |  2 +-
 audiobook/article_web_scraper.py | 50 +++++++++++++++++++++++++++++++
 audiobook/main.py                | 51 ++++++++++++++++++++++++++++----
 requirements.txt                 |  2 +-
 tests/test_audiobook.py          | 20 ++++++++-----
 5 files changed, 109 insertions(+), 16 deletions(-)
 create mode 100644 audiobook/article_web_scraper.py
diff --git a/audiobook/__init__.py b/audiobook/__init__.py
index b2a3a40..86c95fc 100644
--- a/audiobook/__init__.py
+++ b/audiobook/__init__.py
@@ -1 +1 @@
-from audiobook.main import AudioBook
+from audiobook.main import AudioBook
\ No newline at end of file
diff --git a/audiobook/article_web_scraper.py b/audiobook/article_web_scraper.py
new file mode 100644
index 0000000..13a990f
--- /dev/null
+++ b/audiobook/article_web_scraper.py
@@ -0,0 +1,50 @@
+import requests
+
+from bs4 import BeautifulSoup
+
+html_text_formattings = ["p", "a", "b", "strong", "i", "em", "mark", "small", "del", "ins", "sub", "sup"]
+
+class ArticleWebScraper:
+    """
+    ArticleWebScraper class: downloads a web page and extracts the text of its <article> tag
+
+    methods:
+        get_json_from_web_article: returns a json from a non-empty <article> tag
+        get_title_from_article: returns the <title> tag from the html page
+
+    sample usage:
+        ws = ArticleWebScraper(article_url)
+        json_article, pages = ws.get_json_from_web_article()
+    """
+
+    def __init__(self, article_url, timeout=30):
+        """ downloads article_url (giving up after timeout seconds) and parses it as html """
+        # requests waits forever unless an explicit timeout is given
+        page = requests.get(article_url, timeout=timeout)
+        self.article_url = article_url
+        self.soup = BeautifulSoup(page.content, "html.parser")
+
+    def get_title_from_article(self):
+        """ returns the text of the <title> tag, or an empty string if the page has none """
+        title = self.soup.title
+        return title.text if title is not None else ""
+
+    def get_json_from_web_article(self):
+        """
+        returns a (json_article, pages) tuple from a non-empty <article> tag
+
+        json_article maps a 0-based index to each distinct text line, pages is its length;
+        raises ValueError when the page has no <article> tag
+        """
+        article = self.soup.article
+        if article is None:
+            raise ValueError(f"<article> tag not found in {self.article_url}")
+
+        text_lines = []
+        for text_formatting in html_text_formattings:
+            for article_text_tag in article.findChildren(text_formatting, recursive=True):
+                # children whose .string is None (no single string of their own) are skipped
+                text_lines += [tag.string for tag in article_text_tag if tag.string is not None]
+        # dict.fromkeys removes duplicated lines among all tags, keeping first-seen order
+        json_article = dict(enumerate(dict.fromkeys(text_lines)))
+        return json_article, len(json_article)
diff --git a/audiobook/main.py b/audiobook/main.py
index 71fc06d..d6bb078 100644
--- a/audiobook/main.py
+++ b/audiobook/main.py
@@ -5,16 +5,21 @@
 import ebooklib
 from ebooklib import epub
 
+import logging
+logger = logging.getLogger("PyPDF2")
+logger.setLevel(logging.INFO)
+
 from audiobook.utils import response_to_text
 from audiobook.utils import speak_text
 from audiobook.utils import text_preprocessing
 from audiobook.utils import load_json
 from audiobook.utils import write_json_file
 
+from audiobook.article_web_scraper import ArticleWebScraper
+
 from audiobook.config import speed_dict
 from audiobook.config import supported_file_types
 
-import logging
 
 expand_usr = os.path.expanduser("~")
 BOOK_DIR = os.path.join(expand_usr, "audiobook/library")
@@ -23,7 +28,6 @@
 logger = logging.getLogger("PyPDF2")
 logger.setLevel(logging.INFO)
 
-
 class AudioBook:
     """
     AudioBook class
@@ -32,9 +36,12 @@ class AudioBook:
         file_check: checks if file exists
         pdf_to_json: converts pdf to json format
         create_json_book: Creates json book from input file by calling respective method
-        save_audio: saves audio files in folder
+        read_json: reads a json file
+        save_json_to_audio: save .mp3 audios from a json file in a folder
+        save_audio: saves audio files in folder
         read_book: reads the book
-
+        read_web_article: read web article from a given url
+        save_web_article_audio: save web article to a .mp3 file from a given url
     sample usage:
         ab = AudioBook(speed="normal")
         ab.read_book(file_path, password="abcd")
@@ -123,7 +130,7 @@ def create_json_book(self, input_file_path, password=None):
 
         return json_book, pages
 
-    def save_audio(self, input_file_path, password=None):
+    def save_audio(self, input_file_path, password=None):
         """ method to save audio files in folder """
         self.file_check(input_file_path)
 
@@ -145,7 +152,7 @@ def save_audio(self, input_file_path, password=None):
         for page_num, text in tqdm(json_book.items()):
             self.engine.save_to_file(text, os.path.join(book_name, book_name + "_page_" + (str(page_num)) + ".mp3"))
             self.engine.runAndWait()
-
+    
     def read_book(self, input_file_path, password=None):  # argument to be added, save_audio=False, save_json_book=False
         """ method to read the book """
         self.file_check(input_file_path)
@@ -195,3 +202,35 @@ def read_book(self, input_file_path, password=None):  # argument to be added, sa
             else:
                 user_input = input("Please Select an option: \n 1. Type 'r' to read again: \n 2. Type 'p' to read previous page\n 3. Type 'n' to read next page\n 4. Type 'q' to quit:\n 5. Type page number to read that page:\n")
                 continue
+
+
+    def save_json_to_audio(self, json, audio_name):
+        """ write one "<audio_name>_page_<n>.mp3" per json entry into folder audio_name (n = key + 1) """
+        os.makedirs(audio_name, exist_ok=True)
+        logger.info(f'Saving audio files in folder: {audio_name}')
+        for page_index, page_text in json.items():
+            self.engine.save_to_file(page_text, os.path.join(audio_name, f"{audio_name}_page_{page_index + 1}.mp3"))
+            self.engine.runAndWait()
+
+    def read_web_article(self, article_url):
+        """ scrape the <article> tag of article_url and hand its text lines to read_json """
+        scraper = ArticleWebScraper(article_url)
+        json_article, pages = scraper.get_json_from_web_article()
+        if not json_article:
+            # an <article> tag was found but it yielded no text lines
+            raise ValueError("<article> tag has no text.")
+        self.read_json(json_article, pages, "article")
+    
+    def save_web_article_audio(self, article_url):
+        """ scrape the <article> tag of article_url and save its text as audio in a user-named folder """
+        scraper = ArticleWebScraper(article_url)
+        json_article, _ = scraper.get_json_from_web_article()
+        if not json_article:
+            raise ValueError("<article> tag is empty.")
+        title = scraper.get_title_from_article()
+        # the folder name is asked interactively on stdin
+        folder_name = input(f"Choose name for article \"{title}\". It will be stored in {os.getcwd()}\n")
+        self.save_json_to_audio(json_article, folder_name)
+        
+
+        
diff --git a/requirements.txt b/requirements.txt
index db0ce0e..6556d01 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
 pyttsx3==2.90
 PyPDF2==2.11.1
 ebooklib==0.17.1
-beautifulsoup4==4.11.1
\ No newline at end of file
+beautifulsoup4==4.11.1
diff --git a/tests/test_audiobook.py b/tests/test_audiobook.py
index b5a403a..e8b8bc2 100644
--- a/tests/test_audiobook.py
+++ b/tests/test_audiobook.py
@@ -1,12 +1,21 @@
 from audiobook import AudioBook
 import unittest
 
-
 class TestAudioBook(unittest.TestCase):
     def test_invalidPathNumeric(self):
         with self.assertRaises(IOError):
-            ab = AudioBook()
-            ab.read_book(123)
+            ab = AudioBook('normal')
+            ab.txt_to_json(123)
+
+    def test_openDirectoryTxtToJson(self):  # distinct name: the later test_openDirectory would override this one
+        with self.assertRaises(IsADirectoryError):
+            ab = AudioBook('normal')
+            ab.txt_to_json('/')
+
+    def test_fileDoesNotExistTxtToJson(self):  # distinct name: the later test_fileDoesNotExist would override this one
+        with self.assertRaises(FileNotFoundError):
+            ab = AudioBook('normal')
+            ab.txt_to_json('oiawhgaiurgieurghergerg')
 
     def test_openDirectory(self):
         with self.assertRaises(IsADirectoryError):
@@ -17,8 +26,3 @@ def test_fileDoesNotExist(self):
         with self.assertRaises(FileNotFoundError):
             ab = AudioBook()
             ab.read_book('oiawhgaiurgieurghergerg')
-
-    # def test_fileIsNotPDF(self):
-    #     with self.assertRaises(PdfReadError):
-    #         ab = AudioBook(__file__)
-    #         ab.text_to_speech()