Skip to content

Commit

Permalink
Feature: Add support to read/save web articles(audio) (#20)
Browse files Browse the repository at this point in the history
* Add article_web_scraper.py

* Fix test

* Fix file name

* Remove html_text_formattings from main

* Remove unused imports

* Refactor article_web_scraper

* Add support for pagination in read/store article

* Fix logging

Co-authored-by: Deepak Raj <[email protected]>
  • Loading branch information
flower-of-the-bridges and codeperfectplus authored Oct 16, 2022
1 parent 72922a1 commit f8b85b0
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 16 deletions.
2 changes: 1 addition & 1 deletion audiobook/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from audiobook.main import AudioBook
from audiobook.main import AudioBook
50 changes: 50 additions & 0 deletions audiobook/article_web_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import requests

from bs4 import BeautifulSoup

html_text_formattings = ["p", "a", "b", "strong", "i", "em", "mark", "small", "del", "ins", "sub", "sup"]

class ArticleWebScraper:
    """
    Scrape the text of a web article.

    The page at ``article_url`` is downloaded once, when the object is
    created, and parsed with BeautifulSoup's "html.parser" backend.

    methods:
        get_title_from_article: returns the text of the page's <title> tag
        get_json_from_web_article: returns the text found inside the page's
            <article> tag as a ``{index: text}`` dict, plus its length
    sample usage:
        scraper = ArticleWebScraper("https://example.com/some-article")
        title = scraper.get_title_from_article()
        json_article, pages = scraper.get_json_from_web_article()
    """

    def __init__(self, article_url, timeout=30):
        """
        Download and parse the page.

        article_url: address of the page to fetch
        timeout: seconds to wait for the server; without a timeout
            ``requests.get`` may wait indefinitely on an unresponsive host
        """
        page = requests.get(article_url, timeout=timeout)
        self.article_url = article_url
        self.soup = BeautifulSoup(page.content, "html.parser")

    def get_title_from_article(self):
        """ returns the text of the <title> tag, or "" when the page has none """
        title_tag = self.soup.title
        # A page without <title> gives None here; do not crash on `.text`.
        return title_tag.text if title_tag is not None else ""

    def get_json_from_web_article(self):
        """
        returns a json from a non-empty <article> tag

        returns: ``(json_article, length)`` where ``json_article`` maps a
            zero-based index to one piece of text and ``length`` is
            ``len(json_article)``
        raises: ValueError when the page contains no <article> tag
        """
        article = getattr(self.soup, "article", None)
        if article is None:
            raise ValueError(f"<article> tag not found in {self.article_url}")

        # NOTE: text is collected tag type by tag type, in the order of
        # html_text_formattings, not in document order.
        text_lines = []
        for text_formatting in html_text_formattings:
            for article_text_tag in article.findChildren(text_formatting, recursive=True):
                # Keep only the direct children that carry a single string.
                text_lines.extend(
                    tag.string for tag in article_text_tag if tag.string is not None
                )

        # dict.fromkeys drops repeated strings while keeping first-seen order
        # (nested tags yield the same text once per enclosing tag type).
        json_article = dict(enumerate(dict.fromkeys(text_lines)))
        return json_article, len(json_article)
51 changes: 45 additions & 6 deletions audiobook/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,21 @@
import ebooklib
from ebooklib import epub

import logging
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.INFO)

from audiobook.utils import response_to_text
from audiobook.utils import speak_text
from audiobook.utils import text_preprocessing
from audiobook.utils import load_json
from audiobook.utils import write_json_file

from audiobook.article_web_scraper import ArticleWebScraper

from audiobook.config import speed_dict
from audiobook.config import supported_file_types

import logging

expand_usr = os.path.expanduser("~")
BOOK_DIR = os.path.join(expand_usr, "audiobook/library")
Expand All @@ -23,7 +28,6 @@
logger = logging.getLogger("PyPDF2")
logger.setLevel(logging.INFO)


class AudioBook:
"""
AudioBook class
Expand All @@ -32,9 +36,12 @@ class AudioBook:
file_check: checks if file exists
pdf_to_json: converts pdf to json format
create_json_book: Creates json book from input file by calling respective method
save_audio: saves audio files in folder
read_json: reads a json file
save_json_to_audio: save .mp3 audios from a json file in a folder
save_book_audio: saves audio files in folder
read_book: reads the book
read_web_article: read web article from a given url
save_web_article_audio: save web article to a .mp3 file from a given url
sample usage:
ab = AudioBook(speed="normal")
ab.read_book(file_path, password="abcd")
Expand Down Expand Up @@ -123,7 +130,7 @@ def create_json_book(self, input_file_path, password=None):

return json_book, pages

def save_audio(self, input_file_path, password=None):
def save_audio(self, input_file_path, password=None):
""" method to save audio files in folder """
self.file_check(input_file_path)

Expand All @@ -145,7 +152,7 @@ def save_audio(self, input_file_path, password=None):
for page_num, text in tqdm(json_book.items()):
self.engine.save_to_file(text, os.path.join(book_name, book_name + "_page_" + (str(page_num)) + ".mp3"))
self.engine.runAndWait()

def read_book(self, input_file_path, password=None): # argument to be added, save_audio=False, save_json_book=False
""" method to read the book """
self.file_check(input_file_path)
Expand Down Expand Up @@ -195,3 +202,35 @@ def read_book(self, input_file_path, password=None): # argument to be added, sa
else:
user_input = input("Please Select an option: \n 1. Type 'r' to read again: \n 2. Type 'p' to read previous page\n 3. Type 'n' to read next page\n 4. Type 'q' to quit:\n 5. Type page number to read that page:\n")
continue


def save_json_to_audio(self, json, audio_name):
    """
    save json to a list of file in a folder having audio_name, one for each page

    json: mapping of zero-based page number -> page text; page numbers may be
        ints or numeric strings
    audio_name: folder to create (if missing) and fill with
        "<folder name>_page_<n>.mp3" files, n starting at 1
    """
    os.makedirs(audio_name, exist_ok=True)
    logger.info("Saving audio files in folder: %s", audio_name)
    # Prefix files with the folder's own name only, so that an audio_name
    # containing directories ("out/article") does not get joined twice.
    file_prefix = os.path.basename(os.path.normpath(audio_name))
    for page_num, text in json.items():
        # int() keeps int keys as they are and also accepts numeric strings.
        file_name = f"{file_prefix}_page_{int(page_num) + 1}.mp3"
        self.engine.save_to_file(text, os.path.join(audio_name, file_name))
        # Process the save command queued for this page before the next one.
        self.engine.runAndWait()

def read_web_article(self, article_url):
    """
    read web article from a article_url containing an <article> tag

    The article text is scraped with ArticleWebScraper and handed to
    read_json under the name "article".
    raises: ValueError when the scraper finds no text in the <article> tag
    """
    scraper = ArticleWebScraper(article_url)
    article_pages, page_count = scraper.get_json_from_web_article()
    if not article_pages:
        raise ValueError("<article> tag has no text.")
    self.read_json(article_pages, page_count, "article")

def save_web_article_audio(self, article_url):
    """
    save web article from a article_url containing an <article> tag

    The user is prompted for a folder name, and the scraped text is passed
    to save_json_to_audio with that name.
    raises: ValueError when the scraper finds no text in the <article> tag
    """
    scraper = ArticleWebScraper(article_url)
    article_pages, _page_count = scraper.get_json_from_web_article()
    if not article_pages:
        raise ValueError("<article> tag is empty.")
    title = scraper.get_title_from_article()
    prompt = f'Choose name for article "{title}". It will be stored in {os.getcwd()}\n'
    chosen_folder = input(prompt)
    self.save_json_to_audio(article_pages, chosen_folder)



2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
pyttsx3==2.90
PyPDF2==2.11.1
ebooklib==0.17.1
beautifulsoup4==4.11.1
beautifulsoup4==4.11.1
20 changes: 12 additions & 8 deletions tests/test_audiobook.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
from audiobook import AudioBook
import unittest


class TestAudioBook(unittest.TestCase):
def test_invalidPathNumeric(self):
with self.assertRaises(IOError):
ab = AudioBook()
ab.read_book(123)
ab = AudioBook('normal')
ab.txt_to_json(123)

def test_openDirectory(self):
with self.assertRaises(IsADirectoryError):
ab = AudioBook('normal')
ab.txt_to_json('/')

def test_fileDoesNotExist(self):
with self.assertRaises(FileNotFoundError):
ab = AudioBook('normal')
ab.txt_to_json('oiawhgaiurgieurghergerg')

def test_openDirectory(self):
with self.assertRaises(IsADirectoryError):
Expand All @@ -17,8 +26,3 @@ def test_fileDoesNotExist(self):
with self.assertRaises(FileNotFoundError):
ab = AudioBook()
ab.read_book('oiawhgaiurgieurghergerg')

# def test_fileIsNotPDF(self):
# with self.assertRaises(PdfReadError):
# ab = AudioBook(__file__)
# ab.text_to_speech()

0 comments on commit f8b85b0

Please sign in to comment.