From f8b85b0c75c94e8d2889cec7bbed173e841e59c7 Mon Sep 17 00:00:00 2001 From: Giovanni Fiordeponti <38134891+giovf96@users.noreply.github.com> Date: Sun, 16 Oct 2022 16:08:02 +0200 Subject: [PATCH] Feature: Add support to read/save web articles(audio) (#20) * Add article_web_scraper.py * Fix test * Fix file name * Remove html_text_formattings from main * Remove unused imports * Refactor article_web_scraper * Add support for pagination in read/store article * Fix logging Co-authored-by: Deepak Raj <54245038+codePerfectPlus@users.noreply.github.com> --- audiobook/__init__.py | 2 +- audiobook/article_web_scraper.py | 50 +++++++++++++++++++++++++++++++ audiobook/main.py | 51 ++++++++++++++++++++++++++++---- requirements.txt | 2 +- tests/test_audiobook.py | 20 ++++++++----- 5 files changed, 109 insertions(+), 16 deletions(-) create mode 100644 audiobook/article_web_scraper.py diff --git a/audiobook/__init__.py b/audiobook/__init__.py index b2a3a40..86c95fc 100644 --- a/audiobook/__init__.py +++ b/audiobook/__init__.py @@ -1 +1 @@ -from audiobook.main import AudioBook +from audiobook.main import AudioBook \ No newline at end of file diff --git a/audiobook/article_web_scraper.py b/audiobook/article_web_scraper.py new file mode 100644 index 0000000..13a990f --- /dev/null +++ b/audiobook/article_web_scraper.py @@ -0,0 +1,50 @@ +import requests + +from bs4 import BeautifulSoup + +html_text_formattings = ["p", "a", "b", "strong", "i", "em", "mark", "small", "del", "ins", "sub", "sup"] + +class ArticleWebScraper: + """ + ArticleWebScraper class + + methods: + get_json_from_web_article: returns a json from a non-empty
<article> tag + get_title_from_article: returns the <title> tag from the html page + + sample usage: + ab = AudioBook(speed="normal") + ab.read_book(file_path, password="abcd") + """ + + def __init__(self, article_url): + page = requests.get(article_url) + self.article_url = article_url + self.soup = BeautifulSoup(page.content, "html.parser") + + def get_title_from_article (self): + """ returns the <title> tag from the html page """ + return self.soup.title.text + + def get_json_from_web_article (self): + """ returns a json from a non-empty <article> tag """ + if hasattr(self.soup, 'article') and self.soup.article is not None: + article_text_tag_items = [ + self.soup.article.findChildren(text_formatting , recursive=True) + for text_formatting in html_text_formattings + ] + + json_article = {} + text_lines = [] + # list(dict.fromkeys(lines))) removes duplicate words in same tag type + for article_text_tag_item in article_text_tag_items: + for article_text_tag in article_text_tag_item: + text_line = list(dict.fromkeys([tag.string for tag in article_text_tag if tag.string is not None])) + text_lines += text_line + # list(dict.fromkeys(lines))) removes duplicate words among all tags + text_lines = list(dict.fromkeys(text_lines)) + for num in range(0, len(text_lines)): + json_article[num] = text_lines[num] + return json_article, len(json_article) + else: + raise ValueError(f"<article> tag not found in {self.article_url}") diff --git a/audiobook/main.py b/audiobook/main.py index 71fc06d..d6bb078 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -5,16 +5,21 @@ import ebooklib from ebooklib import epub +import logging +logger = logging.getLogger("PyPDF2") +logger.setLevel(logging.INFO) + from audiobook.utils import response_to_text from audiobook.utils import speak_text from audiobook.utils import text_preprocessing from audiobook.utils import load_json from audiobook.utils import write_json_file +from audiobook.article_web_scraper import ArticleWebScraper + from audiobook.config
import speed_dict from audiobook.config import supported_file_types -import logging expand_usr = os.path.expanduser("~") BOOK_DIR = os.path.join(expand_usr, "audiobook/library") @@ -23,7 +28,6 @@ logger = logging.getLogger("PyPDF2") logger.setLevel(logging.INFO) - class AudioBook: """ AudioBook class @@ -32,9 +36,12 @@ class AudioBook: file_check: checks if file exists pdf_to_json: converts pdf to json format create_json_book: Creates json book from input file by calling respective method - save_audio: saves audio files in folder + read_json: reads a json file + save_json_to_audio: save .mp3 audios from a json file in a folder + save_book_audio: saves audio files in folder read_book: reads the book - + read_web_article: read web article from a given url + save_web_article_audio: save web article to a .mp3 file from a given url sample usage: ab = AudioBook(speed="normal") ab.read_book(file_path, password="abcd") @@ -123,7 +130,7 @@ def create_json_book(self, input_file_path, password=None): return json_book, pages - def save_audio(self, input_file_path, password=None): + def save_audio(self, input_file_path, password=None): """ method to save audio files in folder """ self.file_check(input_file_path) @@ -145,7 +152,7 @@ def save_audio(self, input_file_path, password=None): for page_num, text in tqdm(json_book.items()): self.engine.save_to_file(text, os.path.join(book_name, book_name + "_page_" + (str(page_num)) + ".mp3")) self.engine.runAndWait() - + def read_book(self, input_file_path, password=None): # argument to be added, save_audio=False, save_json_book=False """ method to read the book """ self.file_check(input_file_path) @@ -195,3 +202,35 @@ def read_book(self, input_file_path, password=None): # argument to be added, sa else: user_input = input("Please Select an option: \n 1. Type 'r' to read again: \n 2. Type 'p' to read previous page\n 3. Type 'n' to read next page\n 4. Type 'q' to quit:\n 5. 
Type page number to read that page:\n") continue + + + def save_json_to_audio(self, json, audio_name): + """ save json to a list of file in a folder having audio_name, one for each page """ + os.makedirs(audio_name, exist_ok=True) + logger.info('Saving audio files in folder: {}'.format(audio_name)) + for page_num, text in json.items(): + self.engine.save_to_file(text, os.path.join(audio_name, audio_name + "_page_" + (str(page_num+1) + ".mp3"))) + self.engine.runAndWait() + + def read_web_article(self, article_url): + """ read web article from a article_url containing an <article> tag """ + ws = ArticleWebScraper(article_url) + json_article, pages = ws.get_json_from_web_article() + if len(json_article) > 0: + self.read_json(json_article, pages, "article") + else: + raise ValueError("<article> tag has no text.") + + def save_web_article_audio(self, article_url): + """ save web article from a article_url containing an <article> tag """ + ws = ArticleWebScraper(article_url) + json_article, _ = ws.get_json_from_web_article() + if len(json_article) > 0: + title = ws.get_title_from_article() + folder_name = input(f"Choose name for article \"{title}\". 
It will be stored in {os.getcwd()}\n") + self.save_json_to_audio(json_article, folder_name) + else: + raise ValueError("<article> tag is empty.") + + + diff --git a/requirements.txt b/requirements.txt index db0ce0e..6556d01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ pyttsx3==2.90 PyPDF2==2.11.1 ebooklib==0.17.1 -beautifulsoup4==4.11.1 \ No newline at end of file +beautifulsoup4==4.11.1 diff --git a/tests/test_audiobook.py b/tests/test_audiobook.py index b5a403a..e8b8bc2 100644 --- a/tests/test_audiobook.py +++ b/tests/test_audiobook.py @@ -1,12 +1,21 @@ from audiobook import AudioBook import unittest - class TestAudioBook(unittest.TestCase): def test_invalidPathNumeric(self): with self.assertRaises(IOError): - ab = AudioBook() - ab.read_book(123) + ab = AudioBook('normal') + ab.txt_to_json(123) + + def test_openDirectory(self): + with self.assertRaises(IsADirectoryError): + ab = AudioBook('normal') + ab.txt_to_json('/') + + def test_fileDoesNotExist(self): + with self.assertRaises(FileNotFoundError): + ab = AudioBook('normal') + ab.txt_to_json('oiawhgaiurgieurghergerg') def test_openDirectory(self): with self.assertRaises(IsADirectoryError): @@ -17,8 +26,3 @@ def test_fileDoesNotExist(self): with self.assertRaises(FileNotFoundError): ab = AudioBook() ab.read_book('oiawhgaiurgieurghergerg') - - # def test_fileIsNotPDF(self): - # with self.assertRaises(PdfReadError): - # ab = AudioBook(__file__) - # ab.text_to_speech()