From 1f85d99eedb8193e5fb9cfc20b9f50134fc6c452 Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 16:44:35 -0700 Subject: [PATCH 01/10] BUGFIX: Change to use new URL churchofjesuschrist.org --- conference_headers.json | 2 +- gen_conf_downloader.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/conference_headers.json b/conference_headers.json index c923598..060aea3 100644 --- a/conference_headers.json +++ b/conference_headers.json @@ -4,7 +4,7 @@ "Accept-Language": "en-US,en;q=0.5", "Cache-Control": "max-age=0", "Connection": "keep-alive", - "Host": "www.lds.org", + "Host": "www.churchofjesuschrist.org", "Upgrade-Insecure-Requests": "1", "User-Agent": "General Conference Downloader" } diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index 4938955..c199b9f 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -32,7 +32,7 @@ AUDIO_DUR = 'MP3' PLAYLIST_FILE_EXT = 'm3u' -LDS_ORG_URL = 'https://www.lds.org' +LDS_ORG_URL = 'https://www.churchofjesuschrist.org' ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences' GET_SESSION_TITLE_REGEX = '(.*?)' From a181655d21e08508401ea4911f6ea7064d29d8dc Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 16:46:04 -0700 Subject: [PATCH 02/10] BUGFIX: Address 'Problem with http request' --- gen_conf_downloader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index c199b9f..b785398 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -293,7 +293,7 @@ def get_from_cache(args, url): path = get_cache_filename(args, url) os.makedirs(os.path.dirname(path), exist_ok=True) if os.path.isfile(path): - with open(path, 'r') as f: + with open(path, 'r', encoding="utf-8") as f: return f.read() return None @@ -303,7 +303,7 @@ def add_to_cache(args, html, url): url = quote_plus(url) path = get_cache_filename(args, url) os.makedirs(os.path.dirname(path), exist_ok=True) - with open(path, 'w') as f: + with open(path, 'w', encoding="utf-8") as f: f.write(html) From effe0cbdcf26bcc346d27cbfba25bc1f37b4aafb Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 16:47:27 -0700 Subject: [PATCH 03/10] BUGFIX: Fix MP3 path retrieval issue due to changes in HTML by church --- gen_conf_downloader.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index b785398..3a00731 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -36,7 +36,8 @@ ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences' GET_SESSION_TITLE_REGEX = '(.*?)' -TALK_LINK_REGEX = '' +TALK_LINK_REGEX1 = '<\/header>.*?
Date: Sat, 9 Nov 2019 16:49:16 -0700 Subject: [PATCH 04/10] BUGFIX: Fix title characters causing invalid MP3 filenames --- gen_conf_downloader.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index 3a00731..1febd58 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -98,7 +98,7 @@ def get_conference_season(args, playlist_dirs, season): def get_session(args, playlist_dirs, session): talk_summaries = get_talk_summary_details(session.html) - talks = [Talk(decode(talk[0]), talk[2], talk[1], session) for talk in talk_summaries] + talks = [Talk(decode(talk[0]), talk[2], get_filename_from_talk_title(talk[1]), session) for talk in talk_summaries] with tqdm(total=len(talks)) as progress_bar: for talk in talks: @@ -129,6 +129,9 @@ def get_talk(args, playlist_dirs, talk): update_playlists(args, playlist_dirs, talk, filename_mp3, topics, duration) increment_counts(talk.speaker, topics, duration) +def get_filename_from_talk_title(talk_title): + keepcharacters = (' ','.','_') + return "".join(c for c in talk_title if c.isalnum() or c in keepcharacters).rstrip() def get_mp3_filepath(year, month_text, session_lable_text, title_text, name_text): return f'mp3/{year}/{month_text}/{session_lable_text}/' \ From 0d166dc206e9f72c6737f10756ec23eb70f9e1da Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 16:50:44 -0700 Subject: [PATCH 05/10] FEATURE: Add session and talk numbering to improve play order when not using playlists --- gen_conf_downloader.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index 1febd58..27ae521 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -17,8 +17,8 @@ from tqdm import tqdm Season = namedtuple('Season', 'link year month title') -Session = namedtuple('Session', 'html title season') -Talk = namedtuple('Talk', 'link speaker title session') +Session = namedtuple('Session', 'html title number season') +Talk = namedtuple('Talk', 'link speaker title number session') speakers_num = defaultdict(int) topics_num = defaultdict(int) @@ -84,10 +84,12 @@ def get_conference_season(args, playlist_dirs, season): session_htmls = season_html.split(SESSION_SPLITTER) sessions = list() + session_number = 10 for session_html in session_htmls: session_title_results = re.findall(GET_SESSION_TITLE_REGEX, session_html) if session_title_results: - sessions.append(Session(session_html, session_title_results[0], season)) + sessions.append(Session(session_html, str(session_number) + "-" + session_title_results[0], session_number, season)) + session_number += 10 with tqdm(total=len(sessions)) as progress_bar: for session in sessions: @@ -98,7 +100,7 @@ def get_conference_season(args, playlist_dirs, season): def get_session(args, playlist_dirs, session): talk_summaries = get_talk_summary_details(session.html) - talks = [Talk(decode(talk[0]), talk[2], get_filename_from_talk_title(talk[1]), session) for talk in talk_summaries] + talks = [Talk(decode(talk[0]), talk[2], get_filename_from_talk_title(talk[1]), session.number + num, session) for num, talk in enumerate(talk_summaries, start=1)] with tqdm(total=len(talks)) as progress_bar: for talk in talks: @@ -122,7 +124,7 @@ def get_talk(args, playlist_dirs, talk): topics = [to_camel_case(topic) for topic in topics] filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.title}/' \ - f'{talk.title} ({talk.speaker}).mp3' + f'{talk.number} {talk.title} ({talk.speaker}).mp3' output_mp3_filepath = get_mp3(args, link_mp3, filename_mp3) duration = int(MP3(output_mp3_filepath).info.length) From fd5f380339ac5bb2901acfff28eedff442a7cf46 Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 17:26:05 -0700 Subject: [PATCH 06/10] BUGFIX: Better regular expression for Talk MP3 link retrieval --- gen_conf_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index 27ae521..0433d83 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -36,7 +36,7 @@ ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences' GET_SESSION_TITLE_REGEX = '(.*?)' -TALK_LINK_REGEX1 = '<\/header> Date: Sat, 9 Nov 2019 17:49:27 -0700 Subject: [PATCH 07/10] BUGFIX: Change remaining lds.org references to churchofjesuschrist.org --- README.md | 25 ++++++++++++++----------- gen_conf_downloader.py | 2 +- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1211183..4254063 100644 --- a/README.md +++ b/README.md @@ -8,14 +8,14 @@ This script is ideal for: - *Anyone* who wishes to study from the conference talks but doesn't have a reliable internet connection. ## What it can do? -This script will allow you to download the LDS General Conference talks in mp3 form that are available at https://www.lds.org/general-conference. +This script will allow you to download the LDS General Conference talks in mp3 form that are available at https://www.churchofjesuschrist.org/general-conference. It will create *playlists* as *.m3u files to allow you to play an *entire session*. It will also create playlists for *speakers* and *topcs*. This will not only work with the default English versions, but also for *every other language* for which audio files are available. Currently, hundreds of talks are available in many languages, going back as far as 1971 for some. ## How does it work? -It will programmatically navigate the lds.org website, downloading and organising every talk of interest to you. +It will programmatically navigate the churchofjesuschrist.org website, downloading and organising every talk of interest to you. Everything will be saved to a local folder of your choice. Load these files onto a *memory stick* for your car, or into your *favourite media player*. @@ -31,9 +31,9 @@ Load these files onto a *memory stick* for your car, or into your *favourite med |Argument|Values|Meaning| |--------|------|-------| |`-h` or `--help`| |List all arguments and exit| -|`-l` or `-lang`| 3-letter language code|Indicates which language version is to be downloaded. See https://www.lds.org/languages for full list. Click on the language you want, then take note of the 3-letter code in the address bar. i.e. https://www.lds.org/?lang=*spa*| +|`-l` or `-lang`| 3-letter language code|Indicates which language version is to be downloaded. See https://www.churchofjesuschrist.org/languages for full list. Click on the language you want, then take note of the 3-letter code in the address bar. i.e. https://www.churchofjesuschrist.org/?lang=*spa*| |`-s` or `-start`|Year as 4 digit number|First year of conference to download. Defaults to 1971. _Note: not all historic sessions are available in all languages_| -|`-e` or `-end`|Year as 4 digit number|Last year to download (defaults to present year).| +|`-e` or `-end`|Year as 4 digit number|Last year to download (defaults to 2100).| |`-d` or `-dest`|folder relative to here. i.e. `./conference`|Destination folder to output files to. Defaults to `output`| |`-n` or `-nocleanup`| |Leaves temporary files after process completion.| |`-v` or `-verbose`| |Provides detailed activity logging instead of progress bars.| @@ -48,8 +48,9 @@ output └───Conferences │ └───2018 │ │ └───4 - │ │ │ Priesthood Session.m3u - │ │ │ Saturday Morning Session.m3u + │ │ │ 10-Saturday Morning Session.m3u + │ │ │ 20-Saturday Afternoon Session.m3u + │ │ │ 30-Priesthood Session.m3u │ │ │ ... │ │ └───10 │ │ ... @@ -58,11 +59,13 @@ output └───MP3 │ └───2018 │ │ └───4 - │ │ │ └───Priesthood Session - │ │ │ │ Am I a Child of God? (Brian K. Taylor).mp3 - │ │ │ │ Even as Christ Forgives You, So Also Do Ye (Larry J. Echo Hawk).mp3 + │ │ │ └───10-Saturday Morning Session │ │ │ │ ... - │ │ │ └───Saturday Morning Session + │ │ │ └───20-Saturday Afternoon Session + │ │ │ │ ... + │ │ │ └───30-Priesthood Session + │ │ │ │ 31 Am I a Child of God? (Brian K. Taylor).mp3 + │ │ │ │ 32 Even as Christ Forgives You, So Also Do Ye (Larry J. Echo Hawk).mp3 │ │ │ │ ... │ │ └───10 │ └───2017 @@ -76,4 +79,4 @@ output Atonement(6, 1h4m).m3u ... ``` -The playlists for the Topics and Speakers include in parenthesis the number of talks and the total duration. \ No newline at end of file +The playlists for the Topics and Speakers include in parenthesis the number of talks and the total duration. diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index 0433d83..e13b86c 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -389,7 +389,7 @@ def update(self, n=1): parser = argparse.ArgumentParser(description='Download language specific LDS General Conference MP3s, ' 'creating playlists for each conference, speaker and topic.') parser.add_argument('-lang', help='Language version to download. ' - 'See https://www.lds.org/languages for full list.', default='eng') + 'See https://www.churchofjesuschrist.org/languages for full list.', default='eng') parser.add_argument('-start', type=int, help='First year to download. ' 'Note: not all historic sessions are available in all languages', default=1971) From d7d43fb02fbdf3ded91816d8a0e3f4229a0eecdf Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 18:51:38 -0700 Subject: [PATCH 08/10] FEATURE: Allow for disabling new session and talk numbering system (enabled by default) --- gen_conf_downloader.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index e13b86c..c5bb845 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -88,7 +88,7 @@ def get_conference_season(args, playlist_dirs, season): for session_html in session_htmls: session_title_results = re.findall(GET_SESSION_TITLE_REGEX, session_html) if session_title_results: - sessions.append(Session(session_html, str(session_number) + "-" + session_title_results[0], session_number, season)) + sessions.append(Session(session_html, session_title_results[0], session_number, season)) session_number += 10 with tqdm(total=len(sessions)) as progress_bar: @@ -123,8 +123,12 @@ def get_talk(args, playlist_dirs, talk): topics = re.findall(TALK_TOPIC_REGEX, talk_html) topics = [to_camel_case(topic) for topic in topics] - filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.title}/' \ - f'{talk.number} {talk.title} ({talk.speaker}).mp3' + if args.nonumbers: + filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.title}/' \ + f'{talk.title} ({talk.speaker}).mp3' + else: + filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.number}-{talk.session.title}/' \ + f'{talk.number} {talk.title} ({talk.speaker}).mp3' output_mp3_filepath = get_mp3(args, link_mp3, filename_mp3) duration = int(MP3(output_mp3_filepath).info.length) @@ -398,6 +402,7 @@ def update(self, n=1): parser.add_argument('-nocleanup', help='Leaves temporary files after process completion.', action="store_true") parser.add_argument('-verbose', help='Provides detailed activity logging instead of progress bars.', action="store_true") + parser.add_argument('-nonumbers', help='Excludes generated session and talk numbers from file and directory names.', action="store_true") cli_args = parser.parse_args() From e1aaa3bd7552df11f6d3eedb24ac59e81a030750 Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 9 Nov 2019 19:12:39 -0700 Subject: [PATCH 09/10] FEATURE: Document new -nonumbers command line argument --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4254063..5792a69 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,7 @@ Load these files onto a *memory stick* for your car, or into your *favourite med |`-d` or `-dest`|folder relative to here. i.e. `./conference`|Destination folder to output files to. Defaults to `output`| |`-n` or `-nocleanup`| |Leaves temporary files after process completion.| |`-v` or `-verbose`| |Provides detailed activity logging instead of progress bars.| +|`-nonumbers`| |Excludes generated session and talk numbers from file and directory names.| _Note: Depending upon how many years worth of conferences you ask it to download, it may take some time!_ From 0732128c4ca9a4498469336ecf70a9e931706fe4 Mon Sep 17 00:00:00 2001 From: Ryan Lindeman Date: Sat, 30 May 2020 08:14:18 -0600 Subject: [PATCH 10/10] Additional fixes for recent website changes in 2020 --- gen_conf_downloader.py | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py index c5bb845..36a20f8 100644 --- a/gen_conf_downloader.py +++ b/gen_conf_downloader.py @@ -3,11 +3,13 @@ """ import html as html_tools +import io import json import os import re import shutil import sys +from html.parser import HTMLParser from urllib.parse import unquote_plus from urllib.parse import quote_plus import urllib.request @@ -36,8 +38,7 @@ ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences' GET_SESSION_TITLE_REGEX = '(.*?)' -TALK_LINK_REGEX1 = '.*?