simmeringratchet · GatorQue · Nov 9, 2019 · Nov 9, 2019 · Nov 9, 2019 · Nov 9, 2019
diff --git a/README.md b/README.md
@@ -8,14 +8,14 @@ This script is ideal for:
  - *Anyone* who wishes to study from the conference talks but doesn't have a reliable internet connection.
 
 ## What it can do?
-This script will allow you to download the LDS General Conference talks in mp3 form that are available at https://www.lds.org/general-conference.
+This script will allow you to download the LDS General Conference talks in mp3 form that are available at https://www.churchofjesuschrist.org/general-conference.
 It will create *playlists* as *.m3u files to allow you to play an *entire session*. 
 It will also create playlists for *speakers* and *topcs*.
 This will not only work with the default English versions, but also for *every other language* for which audio files are available.
 Currently, hundreds of talks are available in many languages, going back as far as 1971 for some.
 
 ## How does it work?
-It will programmatically navigate the lds.org website, downloading and organising every talk of interest to you.
+It will programmatically navigate the churchofjesuschrist.org website, downloading and organising every talk of interest to you.
 Everything will be saved to a local folder of your choice. 
 Load these files onto a *memory stick* for your car, or into your *favourite media player*. 
 
@@ -31,12 +31,13 @@ Load these files onto a *memory stick* for your car, or into your *favourite med
 |Argument|Values|Meaning|
 |--------|------|-------|
 |`-h` or `--help`| |List all arguments and exit|
-|`-l` or `-lang`| 3-letter language code|Indicates which language version is to be downloaded. See https://www.lds.org/languages for full list. Click on the language you want, then take note of the 3-letter code in the address bar. i.e. https://www.lds.org/?lang=*spa*|
+|`-l` or `-lang`| 3-letter language code|Indicates which language version is to be downloaded. See https://www.churchofjesuschrist.org/languages for full list. Click on the language you want, then take note of the 3-letter code in the address bar. i.e. https://www.churchofjesuschrist.org/?lang=*spa*|
 |`-s` or `-start`|Year as 4 digit number|First year of conference to download. Defaults to 1971. _Note: not all historic sessions are available in all languages_|
-|`-e` or `-end`|Year as 4 digit number|Last year to download (defaults to present year).|
+|`-e` or `-end`|Year as 4 digit number|Last year to download (defaults to 2100).|
 |`-d` or `-dest`|folder relative to here. i.e. `./conference`|Destination folder to output files to. Defaults to `output`|
 |`-n` or `-nocleanup`| |Leaves temporary files after process completion.|
 |`-v` or `-verbose`| |Provides detailed activity logging instead of progress bars.|
+|`-nonumbers`| |Excludes generated session and talk numbers from file and directory names.|
 
  _Note: Depending upon how many years worth of conferences you ask it to download, it may take some time!_
 
@@ -48,8 +49,9 @@ output
     └───Conferences
     │   └───2018
     │   │   └───4
-    │   │   │   Priesthood Session.m3u
-    │   │   │   Saturday Morning Session.m3u
+    │   │   │   10-Saturday Morning Session.m3u
+    │   │   │   20-Saturday Afternoon Session.m3u
+    │   │   │   30-Priesthood Session.m3u
     │   │   │   ...
     │   │   └───10
     │   │       ...
@@ -58,11 +60,13 @@ output
     └───MP3
     │   └───2018
     │   │   └───4
-    │   │   │   └───Priesthood Session
-    │   │   │   │   Am I a Child of God? (Brian K. Taylor).mp3
-    │   │   │   │   Even as Christ Forgives You, So Also Do Ye (Larry J. Echo Hawk).mp3
+    │   │   │   └───10-Saturday Morning Session
     │   │   │   │   ...
-    │   │   │   └───Saturday Morning Session
+    │   │   │   └───20-Saturday Afternoon Session
+    │   │   │   │   ...
+    │   │   │   └───30-Priesthood Session
+    │   │   │   │   31 Am I a Child of God (Brian K. Taylor).mp3
+    │   │   │   │   32 Even as Christ Forgives You So Also Do Ye (Larry J. Echo Hawk).mp3
     │   │   │   │   ...
     │   │   └───10
     │   └───2017
@@ -76,4 +80,4 @@ output
         Atonement(6, 1h4m).m3u
         ...
 ```        
-The playlists for the Topics and Speakers include in parenthesis the number of talks and the total duration.
+The playlists for the Topics and Speakers include in parentheses the number of talks and the total duration.
diff --git a/conference_headers.json b/conference_headers.json
@@ -4,7 +4,7 @@
   "Accept-Language": "en-US,en;q=0.5",
   "Cache-Control": "max-age=0",
   "Connection": "keep-alive",
-  "Host": "www.lds.org",
+  "Host": "www.churchofjesuschrist.org",
   "Upgrade-Insecure-Requests": "1",
   "User-Agent": "General Conference Downloader"
 }
diff --git a/gen_conf_downloader.py b/gen_conf_downloader.py
@@ -3,11 +3,13 @@
 """
 
 import html as html_tools
+import io
 import json
 import os
 import re
 import shutil
 import sys
+from html.parser import HTMLParser
 from urllib.parse import unquote_plus
 from urllib.parse import quote_plus
 import urllib.request
@@ -17,8 +19,8 @@
 from tqdm import tqdm
 
 Season = namedtuple('Season', 'link year month title')
-Session = namedtuple('Session', 'html title season')
-Talk = namedtuple('Talk', 'link speaker title session')
+Session = namedtuple('Session', 'html title number season')
+Talk = namedtuple('Talk', 'link speaker title number session')
 
 speakers_num = defaultdict(int)
 topics_num = defaultdict(int)
@@ -32,11 +34,11 @@
 AUDIO_DUR = 'MP3'
 PLAYLIST_FILE_EXT = 'm3u'
 
-LDS_ORG_URL = 'https://www.lds.org'
+LDS_ORG_URL = 'https://www.churchofjesuschrist.org'
 ALL_CONFERENCES_URL = f'{LDS_ORG_URL}/general-conference/conferences'
 
 GET_SESSION_TITLE_REGEX = '<span class=\"section__header__title\">(.*?)</span>'
-TALK_LINK_REGEX = '<source src=\"(.*?.mp3)\">'
+TALK_LINK_REGEX = r'<a href="([^"]*\.mp3[^"]*)" .*This Page \(MP3\)'  # capture stops at href's closing quote
 TALK_TOPIC_REGEX = '<div class=\"drawerList tab\" data-title=\"(.*?)\">'
 GET_TALK_LINKS_FROM_SESSION_SECTION_REGEX = '<div class=\"lumen-tile lumen-tile--horizontal lumen-tile--list\">.*?' \
                                             '<a href=\"(.*?)\" class=\"lumen-tile__link\">.*?<div ' \
@@ -51,6 +53,19 @@
 SESSION_SPLITTER = 'section tile-wrapper layout--3 lumen-layout__item'
 
 
+class MLStripper(HTMLParser):
+    """HTMLParser that discards markup and accumulates only the text content."""
+    def __init__(self):
+        super().__init__(convert_charrefs=True)  # base __init__ already calls reset()
+        self.text = io.StringIO()
+    def handle_data(self, d):
+        self.text.write(d)
+    def get_data(self):
+        """Return all text fed so far, flushing anything the parser still buffers."""
+        self.close()  # text after a bare '&' (e.g. "Q&A") is held back until close()
+        return self.text.getvalue()
+
+
 def get_all_conferences_seasons(args):
     all_seasons_html = get(args, f'{ALL_CONFERENCES_URL}?lang={args.lang}')
     playlist_dirs = re.findall(GET_SECTION_TERMS_REGEX, all_seasons_html, re.S)
@@ -83,10 +98,12 @@ def get_conference_season(args, playlist_dirs, season):
     session_htmls = season_html.split(SESSION_SPLITTER)
 
     sessions = list()
+    session_number = 10
     for session_html in session_htmls:
         session_title_results = re.findall(GET_SESSION_TITLE_REGEX, session_html)
         if session_title_results:
-            sessions.append(Session(session_html, session_title_results[0], season))
+            sessions.append(Session(session_html, clean_session_title(session_title_results[0]), session_number, season))
+            session_number += 10
 
     with tqdm(total=len(sessions)) as progress_bar:
         for session in sessions:
@@ -97,7 +114,7 @@ def get_conference_season(args, playlist_dirs, season):
 
 def get_session(args, playlist_dirs, session):
     talk_summaries = get_talk_summary_details(session.html)
-    talks = [Talk(decode(talk[0]), talk[2], talk[1], session) for talk in talk_summaries]
+    talks = [Talk(decode(talk[0]), talk[2], get_filename_from_talk_title(talk[1]), session.number + num, session) for num, talk in enumerate(talk_summaries, start=1)]
 
     with tqdm(total=len(talks)) as progress_bar:
         for talk in talks:
@@ -111,21 +128,38 @@ def get_talk(args, playlist_dirs, talk):
 
     mp3_link_result = re.findall(TALK_LINK_REGEX, talk_html)
     if not mp3_link_result:
+        print("Unable to determine MP3 link for {} at '{}'".format(talk.session.title, talk.link))
         return
     link_mp3 = mp3_link_result[0]
 
     topics = re.findall(TALK_TOPIC_REGEX, talk_html)
     topics = [to_camel_case(topic) for topic in topics]
 
-    filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.title}/' \
-                   f'{talk.title} ({talk.speaker}).mp3'
+    if args.nonumbers:
+        filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.title}/' \
+                       f'{talk.title} ({talk.speaker}).mp3'
+    else:
+        filename_mp3 = f'{AUDIO_DUR}/{talk.session.season.year}/{talk.session.season.month}/{talk.session.number}-{talk.session.title}/' \
+                       f'{talk.number} {talk.title} ({talk.speaker}).mp3'
     output_mp3_filepath = get_mp3(args, link_mp3, filename_mp3)
     duration = int(MP3(output_mp3_filepath).info.length)
 
     update_playlists(args, playlist_dirs, talk, filename_mp3, topics, duration)
     increment_counts(talk.speaker, topics, duration)
 
 
+def clean_session_title(session_title):
+    """Remove HTML markup from session_title; keep alphanumerics and spaces, trim the right end."""
+    stripper = MLStripper()
+    stripper.feed(session_title)
+    return "".join(ch for ch in stripper.get_data() if ch.isalnum() or ch == ' ').rstrip()
+
+
+def get_filename_from_talk_title(talk_title):
+    """Keep only alphanumerics, spaces, '.' and '_' from talk_title; trim trailing whitespace."""
+    return "".join(filter(lambda ch: ch.isalnum() or ch in ' ._', talk_title)).rstrip()
+
+
 def get_mp3_filepath(year, month_text, session_lable_text, title_text, name_text):
     return f'mp3/{year}/{month_text}/{session_lable_text}/' \
            f'{year} {month_text}, {session_lable_text}, {title_text} ({name_text}).mp3'
@@ -293,7 +327,7 @@ def get_from_cache(args, url):
     path = get_cache_filename(args, url)
     os.makedirs(os.path.dirname(path), exist_ok=True)
     if os.path.isfile(path):
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding="utf-8") as f:
             return f.read()
     return None
 
@@ -303,7 +337,7 @@ def add_to_cache(args, html, url):
     url = quote_plus(url)
     path = get_cache_filename(args, url)
     os.makedirs(os.path.dirname(path), exist_ok=True)
-    with open(path, 'w') as f:
+    with open(path, 'w', encoding="utf-8") as f:
         f.write(html)
 
 
@@ -380,7 +414,7 @@ def update(self, n=1):
     parser = argparse.ArgumentParser(description='Download language specific LDS General Conference MP3s, '
                                                  'creating playlists for each conference, speaker and topic.')
     parser.add_argument('-lang', help='Language version to download. '
-                                      'See https://www.lds.org/languages for full list.', default='eng')
+                                      'See https://www.churchofjesuschrist.org/languages for full list.', default='eng')
     parser.add_argument('-start', type=int, help='First year to download. '
                                                  'Note: not all historic sessions are available in all languages',
                         default=1971)
@@ -389,6 +423,7 @@ def update(self, n=1):
     parser.add_argument('-nocleanup', help='Leaves temporary files after process completion.', action="store_true")
     parser.add_argument('-verbose', help='Provides detailed activity logging instead of progress bars.',
                         action="store_true")
+    parser.add_argument('-nonumbers', help='Excludes generated session and talk numbers from file and directory names.', action="store_true")
 
     cli_args = parser.parse_args()