# json2rss.py

"""
This script processes JSON files and generates an RSS feed.
"""

import json
import os
import urllib.parse
from datetime import datetime, timedelta
from xml.etree.ElementTree import Element, SubElement, ElementTree, parse, register_namespace
import re
import logging
import argparse
import yaml

# Configure root logging: timestamped records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Prefix -> URI table of the XML namespaces used by the feed. Each entry is
# registered with ElementTree so that serialization uses the given prefix.
namespaces = {'atom': 'http://www.w3.org/2005/Atom'}

for prefix, uri in namespaces.items():
    register_namespace(prefix, uri)

def load_config(config_file='config.yaml'):
    """Load configuration from a YAML file.

    Args:
        config_file: Path to the YAML configuration file.

    Returns:
        The parsed configuration. An empty YAML document (which
        ``yaml.safe_load`` reports as ``None``) is returned as an empty dict
        so callers can always use mapping methods on the result.

    Raises:
        SystemExit: With exit status 1 when ``config_file`` does not exist.
    """
    try:
        with open(config_file, 'r', encoding='utf-8') as file:
            loaded = yaml.safe_load(file)
    except FileNotFoundError:
        logging.error("Configuration file '%s' not found.", config_file)
        # Raise SystemExit directly: the ``exit`` builtin is injected by the
        # ``site`` module and is not guaranteed to exist (e.g. ``python -S``).
        raise SystemExit(1) from None
    return {} if loaded is None else loaded

def get_config_value(config, key, default_value=None):
    """Resolve a setting, preferring the environment over the config mapping.

    Lookup order:
      1. the environment variable named ``key.upper()``,
      2. ``config[key]``,
      3. ``default_value``.

    A value taken from the environment is returned as the raw string.
    """
    env_name = key.upper()
    fallback = config.get(key, default_value)
    return os.environ.get(env_name, fallback)

def sanitize_for_xml(tag):
    """Return ``tag`` with every character other than ASCII letters and digits replaced by ``_``."""
    disallowed = re.compile(r'[^a-zA-Z0-9]')
    return disallowed.sub('_', tag)

def escape_xml_chars(text):
    """Replace ``&``, ``<`` and ``>`` in ``text`` with their XML entities."""
    # A translation table rewrites each source character exactly once, so the
    # ampersands of the entities produced here are never escaped again.
    entity_table = str.maketrans({'&': '&amp;', '<': '&lt;', '>': '&gt;'})
    return text.translate(entity_table)

def read_json_files(directory):
    """Read and load the ``*_rewritten.json`` files found in ``directory``.

    Files are processed in sorted filename order so the result is
    deterministic regardless of the order the filesystem lists entries in.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list with the decoded content of every matching file that could be
        read and parsed. Files that fail to open or decode are logged and
        skipped. If the directory itself cannot be listed (missing, not a
        directory, permission denied) the error is logged and an empty list
        is returned.
    """
    try:
        filenames = sorted(os.listdir(directory))
    except OSError as e:
        logging.error("Unable to list directory %s: %s", directory, e)
        return []

    json_data = []
    for filename in filenames:
        if not filename.endswith('_rewritten.json'):
            continue
        filepath = os.path.join(directory, filename)
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                json_data.append(json.load(file))
        except json.JSONDecodeError as e:
            logging.error("Error decoding JSON from file %s: %s", filename, e)
        except (OSError, UnicodeDecodeError) as e:
            # One unreadable or mis-encoded file must not abort the whole run.
            logging.error("Error reading file %s: %s", filename, e)
    return json_data

def load_moderated_words(file_path):
    """Read the moderated-word list stored one entry per line in ``file_path``.

    Surrounding whitespace is stripped from each line and blank lines are
    dropped. When the file does not exist an error is logged and an empty
    list is returned.
    """
    words = []
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                entry = raw_line.strip()
                if entry:
                    words.append(entry)
    except FileNotFoundError:
        logging.error("Moderated words file '%s' not found.", file_path)
        return []
    return words

def replace_swear_words(text, moderated_words):
    """Mask every whole-word occurrence of a moderated word with asterisks.

    Matching is case-insensitive and each match is replaced by as many ``*``
    characters as the moderated word has.

    Args:
        text: The text to moderate.
        moderated_words: Iterable of words to mask; empty entries are ignored.

    Returns:
        The moderated text.
    """
    for word in moderated_words:
        if not word:
            continue
        # Lookarounds instead of ``\b``: ``\b`` only matches next to a word
        # character, so entries that start or end with punctuation (``$hit``,
        # ``a$$``) would never be matched as standalone words. For entries
        # made of word characters the two forms are equivalent.
        pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
        text = re.sub(pattern, '*' * len(word), text, flags=re.IGNORECASE)
    return text

def create_rss_channel(config):
    """Build an empty RSS 2.0 document skeleton from the configuration.

    The channel receives title, link, description, language and an Atom
    self-link (each resolved through ``get_config_value`` with a built-in
    default), followed by ``author``, ``category`` and ``copyright`` for
    whichever of those keys are present in ``config``.

    Returns:
        A ``(rss, channel)`` tuple: the root ``<rss>`` element and its
        ``<channel>`` child.
    """
    rss = Element('rss', {
        'version': '2.0',
        'xmlns:atom': 'http://www.w3.org/2005/Atom',
    })
    channel = SubElement(rss, 'channel')

    # (element tag, config key, default) for the always-present channel fields.
    required_fields = (
        ('title', 'feed_title', 'UglyFeed News Feed'),
        ('link', 'feed_link', 'https://github.com/fabriziosalmi/UglyFeed'),
        ('description', 'feed_description', 'Generated by UglyFeed'),
        ('language', 'feed_language', 'en'),
    )
    for tag, key, fallback in required_fields:
        node = SubElement(channel, tag)
        node.text = get_config_value(config, key, fallback)

    self_href = get_config_value(
        config,
        'feed_self_link',
        'https://raw.githubusercontent.com/fabriziosalmi/UglyFeed/main/examples/uglyfeed-source-1.xml',
    )
    SubElement(channel, 'atom:link', {
        'href': self_href,
        'rel': 'self',
        'type': 'application/rss+xml',
    })

    # Optional metadata: emitted only for keys present in the config mapping.
    for optional_key in ('author', 'category', 'copyright'):
        if optional_key in config:
            node = SubElement(channel, optional_key)
            node.text = get_config_value(config, optional_key)

    return rss, channel

def _item_text(item, key, default):
    """Return ``item[key]`` as a string, or ``default`` when missing or null."""
    value = item.get(key)
    return default if value is None else str(value)


def process_item(item, config, moderated_words):
    """Convert one rewritten JSON item into an RSS ``<item>`` element.

    Args:
        item: Mapping with optional ``title``, ``content``, ``links``,
            ``api``, ``model`` and ``processed_at`` entries. Missing or null
            text fields fall back to placeholder strings.
        config: Configuration mapping; ``moderation.enabled`` and
            ``moderation.allow_duplicates`` are read here, and
            ``datetime_format`` is resolved through ``get_config_value``.
        moderated_words: Words to mask in title and content when moderation
            is enabled.

    Returns:
        An ``Element`` named ``item`` with ``title``, ``description``,
        ``pubDate`` and ``guid`` children, in that order.
    """
    moderation_config = config.get('moderation', {})
    moderation_enabled = moderation_config.get('enabled', False)
    allow_duplicates = moderation_config.get('allow_duplicates', True)

    def moderate_and_escape(text):
        if moderation_enabled:
            text = replace_swear_words(text, moderated_words)
        return escape_xml_chars(text)

    item_element = Element('item')

    title_text = _item_text(item, 'title', 'No Title')
    item_title = SubElement(item_element, 'title')
    item_title.text = moderate_and_escape(title_text)

    item_description = SubElement(item_element, 'description')
    # The description is an HTML fragment; collect the pieces and join once.
    parts = [moderate_and_escape(_item_text(item, 'content', 'No Content'))]

    links = item.get('links')
    if links is not None:
        if not allow_duplicates:
            links = list(dict.fromkeys(links))  # Drop repeats, keep first-seen order

        parts.append("<br/><br/><small><b>Sources</b></small><br/><ul>")
        for link in links:
            # Links come from the input data: escape them (including the
            # double quote) so they cannot break out of the href attribute
            # or inject markup into the description.
            safe_link = escape_xml_chars(str(link)).replace('"', '&quot;')
            parts.append(
                f'<li><small><a href="{safe_link}" target="_blank">{safe_link}</a></small></li>'
            )
        parts.append("</ul>")

    api = _item_text(item, 'api', 'Unknown API')
    model = _item_text(item, 'model', 'Unknown Model')
    parts.append(
        f'<br/><br/><small>Generated by <b>{escape_xml_chars(model)}</b> via '
        f'<b>{escape_xml_chars(api.capitalize())}</b></small>'
    )

    item_description.text = ''.join(parts)

    # A missing, null, non-string or differently formatted timestamp falls
    # back to the current time.
    try:
        processed_at = datetime.strptime(item.get('processed_at'), '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        processed_at = datetime.now()

    # NOTE(review): processed_at is a naive datetime, yet the default format
    # labels it "GMT" - confirm the producer writes UTC timestamps.
    pub_date = SubElement(item_element, 'pubDate')
    pub_date.text = processed_at.strftime(
        get_config_value(config, 'datetime_format', '%a, %d %b %Y %H:%M:%S GMT')
    )

    guid = SubElement(item_element, 'guid')
    guid.text = f"https://github.com/fabriziosalmi/UglyFeed/{urllib.parse.quote(title_text)}"

    return item_element

def _parse_pub_date(item_element, date_format, fallback):
    """Parse an item's ``pubDate`` text; return ``fallback`` if absent or unparseable."""
    try:
        return datetime.strptime(item_element.findtext('pubDate'), date_format)
    except (TypeError, ValueError):
        return fallback


def create_rss_feed(json_data, output_path, config):
    """Create or update the RSS feed at ``output_path`` from JSON items.

    If ``output_path`` already exists its items are merged with the newly
    generated ones; otherwise a fresh channel is built from ``config``. New
    items older than ``max_age_days`` are skipped, items are de-duplicated by
    ``<guid>`` (the newly generated copy wins), sorted newest first and
    trimmed to ``max_items``.

    Args:
        json_data: Iterable of decoded JSON items (mappings).
        output_path: Path of the RSS file to create or update.
        config: Configuration mapping.

    Returns:
        None. Parse errors on the existing file and write errors are logged
        rather than raised.
    """
    moderation_config = config.get('moderation', {})
    moderation_enabled = moderation_config.get('enabled', False)
    moderated_words_file = moderation_config.get('words_file', 'moderated.txt')
    moderated_words = load_moderated_words(moderated_words_file) if moderation_enabled else []

    if os.path.exists(output_path):
        try:
            rss = parse(output_path).getroot()
            channel = rss.find('channel')
            if channel is None:
                raise ValueError("Channel element not found in existing RSS file.")
        except Exception as e:
            # Deliberately broad: any failure to load the existing feed
            # (I/O, malformed XML, missing channel) aborts the update without
            # overwriting the file.
            logging.error("Error parsing existing RSS file: %s", e)
            return
    else:
        rss, channel = create_rss_channel(config)

    # Resolve once instead of once per item and per sort comparison.
    date_format = get_config_value(config, 'datetime_format', '%a, %d %b %Y %H:%M:%S GMT')
    cutoff_date = datetime.now() - timedelta(days=int(get_config_value(config, 'max_age_days', 30)))

    new_items = []
    for item in json_data:
        if not isinstance(item, dict):
            logging.warning("Skipping JSON payload that is not an object: %r", type(item).__name__)
            continue
        item_element = process_item(item, config, moderated_words)
        if _parse_pub_date(item_element, date_format, datetime.now()) >= cutoff_date:
            new_items.append(item_element)

    existing_items = list(channel.findall('item'))

    # Re-running over the same JSON files would otherwise add every item to
    # the feed again. Keep one element per <guid>, preferring the freshly
    # generated version; elements without a guid are always kept.
    seen_guids = set()
    all_items = []
    for element in new_items + existing_items:
        guid = element.findtext('guid')
        if guid:
            if guid in seen_guids:
                continue
            seen_guids.add(guid)
        all_items.append(element)

    # Items whose pubDate is missing or unparseable sort last instead of
    # aborting the whole update.
    all_items.sort(
        key=lambda element: _parse_pub_date(element, date_format, datetime.min),
        reverse=True
    )

    max_items = int(get_config_value(config, 'max_items', 50))
    trimmed_items = all_items[:max_items]

    for item in existing_items:
        channel.remove(item)
    for item in trimmed_items:
        channel.append(item)

    try:
        tree = ElementTree(rss)
        tree.write(output_path, encoding='utf-8', xml_declaration=True)
        item_count = len(trimmed_items)
        logging.info("RSS feed successfully updated at %s", output_path)
        logging.info("Total items in feed: %d", item_count)
        print(f"RSS feed successfully generated at {output_path}")
        print(f"Total items in feed: {item_count}")
    except IOError as e:
        logging.error("Error saving RSS feed to file %s: %s", output_path, e)

def _parse_bool(value):
    """Convert a command-line or environment string to a bool.

    Accepts 1/true/yes/y/on and 0/false/no/n/off (case-insensitive); the
    empty string is False. Raises ValueError for anything else.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ('1', 'true', 'yes', 'y', 'on'):
        return True
    if normalized in ('', '0', 'false', 'no', 'n', 'off'):
        return False
    raise ValueError(f"expected a boolean value, got {value!r}")


def _apply_env_overrides(section):
    """Overwrite scalar entries of ``section`` with same-named environment variables."""
    for key, value in list(section.items()):
        if isinstance(value, dict):
            # Nested sections are handled by their own call; a string from
            # the environment must never replace a whole mapping.
            continue
        override = get_config_value(section, key, value)
        if isinstance(value, bool) and isinstance(override, str):
            # Environment values are strings; "false" must not end up truthy.
            try:
                override = _parse_bool(override)
            except ValueError:
                logging.warning(
                    "Ignoring non-boolean environment value %r for '%s'.", override, key
                )
                override = value
        section[key] = override


def main():
    """Read rewritten JSON files and create or update the RSS feed.

    Configuration precedence, lowest to highest: YAML file, environment
    variables (upper-cased key names), command-line arguments.
    """
    parser = argparse.ArgumentParser(description='Process JSON to RSS.')
    parser.add_argument('--config', type=str, default='config.yaml', help='Path to the configuration file')
    parser.add_argument('--max_items', type=int, help='Override maximum number of items in the RSS feed')
    parser.add_argument('--max_age_days', type=int, help='Override maximum age of items in days')
    parser.add_argument('--feed_title', type=str, help='Override feed title')
    parser.add_argument('--feed_link', type=str, help='Override feed link')
    parser.add_argument('--feed_description', type=str, help='Override feed description')
    parser.add_argument('--feed_language', type=str, help='Override feed language')
    parser.add_argument('--feed_self_link', type=str, help='Override feed self link')
    parser.add_argument('--author', type=str, help='Override author')
    parser.add_argument('--category', type=str, help='Override category')
    parser.add_argument('--copyright', type=str, help='Override copyright')
    # ``type=bool`` would turn any non-empty string, including "False", into
    # True; parse the text explicitly instead.
    parser.add_argument('--moderation_enabled', type=_parse_bool, help='Enable or disable moderation')
    parser.add_argument('--moderated_words_file', type=str, help='Path to the moderated words file')
    parser.add_argument('--allow_duplicates', type=_parse_bool, help='Allow or disallow duplicate links')
    parser.add_argument('--rewritten_dir', type=str, help='Directory for rewritten JSON files')
    parser.add_argument('--output_dir', type=str, help='Output directory for RSS feed')
    args = parser.parse_args()

    # An empty YAML document loads as None; treat it as an empty configuration.
    config = load_config(args.config) or {}

    # A present-but-empty ``moderation:`` key loads as None; normalize it so
    # later ``.get`` calls on the section work.
    if 'moderation' in config and not isinstance(config['moderation'], dict):
        config['moderation'] = {}

    # Override config values with environment variables if present
    _apply_env_overrides(config)
    if 'moderation' in config:
        _apply_env_overrides(config['moderation'])

    # Command-line options that belong to the nested ``moderation`` section,
    # mapped to the key names the feed-building code reads from it.
    moderation_cli_keys = {
        'moderation_enabled': 'enabled',
        'moderated_words_file': 'words_file',
        'allow_duplicates': 'allow_duplicates',
    }

    # Override config values if command-line arguments are provided
    for key, value in vars(args).items():
        if value is None or key == 'config':
            continue
        if key in moderation_cli_keys:
            config.setdefault('moderation', {})[moderation_cli_keys[key]] = value
        else:
            config[key] = value

    # Serialize only when the record will be emitted; ``default=str`` keeps
    # values such as YAML dates from raising during this diagnostic dump.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug("Configuration: %s", json.dumps(config, indent=4, default=str))

    rewritten_dir = config.get('rewritten_dir', 'rewritten')
    output_dir = config.get('output_dir', 'uglyfeeds')
    output_path = os.path.join(output_dir, 'uglyfeed.xml')

    os.makedirs(output_dir, exist_ok=True)

    json_data = read_json_files(rewritten_dir)

    if json_data:
        create_rss_feed(json_data, output_path, config)
    else:
        logging.info('No JSON files found in the rewritten directory.')


if __name__ == '__main__':
    main()